lazar 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/bbrc.rb
DELETED
@@ -1,165 +0,0 @@
-module OpenTox
-  module Algorithm
-    class Fminer
-      TABLE_OF_ELEMENTS = [
-        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
-
-      #
-      # Run bbrc algorithm on dataset
-      #
-      # @param [OpenTox::Dataset] training dataset
-      # @param [optional] parameters BBRC parameters, accepted parameters are
-      #   - min_frequency Minimum frequency (default 5)
-      #   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
-      #   - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
-      #   - min_chisq_significance Significance threshold (between 0 and 1)
-      #   - nr_hits Set to "true" to get hit count instead of presence
-      #   - get_target Set to "true" to obtain target variable as feature
-      # @return [OpenTox::Dataset] Fminer Dataset
-      def self.bbrc training_dataset, params={}
-
-        time = Time.now
-        bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
-
-        prediction_feature = training_dataset.features.first
-        if params[:min_frequency]
-          minfreq = params[:min_frequency]
-        else
-          per_mil = 5 # value from latest version
-          per_mil = 8 # as suggested below
-          i = training_dataset.feature_ids.index prediction_feature.id
-          nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
-          minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
-          minfreq = 2 unless minfreq > 2
-          minfreq = minfreq.round
-        end
-
-        @bbrc ||= Bbrc::Bbrc.new
-        @bbrc.Reset
-        if prediction_feature.numeric
-          @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
-        else
-          bad_request_error "No accept values for "\
-                            "dataset '#{training_dataset.id}' and "\
-                            "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
-          value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
-        end
-        @bbrc.SetMinfreq(minfreq)
-        @bbrc.SetType(1) if params[:feature_type] == "paths"
-        @bbrc.SetBackbone(false) if params[:backbone] == "false"
-        @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
-        @bbrc.SetConsoleOut(false)
-
-        params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false
-        feature_dataset = FminerDataset.new(
-          :training_dataset_id => training_dataset.id,
-          :training_algorithm => "#{self.to_s}.bbrc",
-          :training_feature_id => prediction_feature.id ,
-          :training_parameters => {
-            :min_frequency => minfreq,
-            :nr_hits => nr_hits,
-            :backbone => (params[:backbone] == false ? false : true)
-          }
-
-        )
-        feature_dataset.compounds = training_dataset.compounds
-
-        # add data
-        training_dataset.compounds.each_with_index do |compound,i|
-          act = value2act[training_dataset.data_entries[i].first]
-          if act # TODO check if this works
-            @bbrc.AddCompound(compound.smiles,i+1)
-            @bbrc.AddActivity(act,i+1)
-          end
-        end
-        #g_median=@fminer.all_activities.values.to_scale.median
-
-        #task.progress 10
-        #step_width = 80 / @bbrc.GetNoRootNodes().to_f
-
-        $logger.debug "BBRC setup: #{Time.now-time}"
-        time = Time.now
-        ftime = 0
-        itime = 0
-        rtime = 0
-
-        # run @bbrc
-        (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
-          results = @bbrc.MineRoot(j)
-          results.each do |result|
-            rt = Time.now
-            f = YAML.load(result)[0]
-            smarts = f.shift
-            # convert fminer SMARTS representation into a more human readable format
-            smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
-              element = TABLE_OF_ELEMENTS[$1.to_i-1]
-              $2 == "a" ? element.downcase : element
-            end
-            p_value = f.shift
-            f.flatten!
-            compound_idxs = f.collect{|e| e.first.first-1}
-            # majority class
-            effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode
-
-=begin
-            if (!@bbrc.GetRegression)
-              id_arrs = f[2..-1].flatten
-              max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
-              effect = max+1
-            else #regression part
-              id_arrs = f[2]
-              # DV: effect calculation
-              f_arr=Array.new
-              f[2].each do |id|
-                id=id.keys[0] # extract id from hit count hash
-                f_arr.push(@fminer.all_activities[id])
-              end
-              f_median=f_arr.to_scale.median
-              if g_median >= f_median
-                effect = 'activating'
-              else
-                effect = 'deactivating'
-              end
-            end
-=end
-            rtime += Time.now - rt
-
-            ft = Time.now
-            feature = OpenTox::FminerSmarts.find_or_create_by({
-              "smarts" => smarts,
-              "p_value" => p_value.to_f.abs.round(5),
-              "effect" => effect,
-              "dataset_id" => feature_dataset.id
-            })
-            feature_dataset.feature_ids << feature.id
-            ftime += Time.now - ft
-
-            it = Time.now
-            f.each do |id_count_hash|
-              id_count_hash.each do |id,count|
-                nr_hits ? count = count.to_i : count = 1
-                feature_dataset.data_entries[id-1] ||= []
-                feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count
-              end
-            end
-            itime += Time.now - it
-
-          end
-        end
-
-        $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
-        time = Time.now
-
-        feature_dataset.fill_nil_with 0
-
-        $logger.debug "Prepare save: #{Time.now-time}"
-        time = Time.now
-        feature_dataset.save_all
-
-        $logger.debug "Save: #{Time.now-time}"
-        feature_dataset
-
-      end
-    end
-  end
-end
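For context, the removed `OpenTox::Algorithm::Fminer.bbrc` entry point was typically invoked as in the sketch below. This is a minimal sketch against the 0.0.7/0.0.8 API shown above and in the deleted test/fminer.rb; it assumes a working fminer/bbrc native extension, and the CSV path and `DATA_DIR` constant are taken from the deleted tests rather than guaranteed to exist in 0.0.9.

require "lazar"  # lazar <= 0.0.8; the bbrc/fminer backend is removed in 0.0.9

# Mine BBRC SMARTS features from a classification training set.
# File name and DATA_DIR follow the deleted test suite (illustrative only).
training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR, "hamster_carcinogenicity.csv")
feature_dataset  = OpenTox::Algorithm::Fminer.bbrc training_dataset, :min_frequency => 8

# Each mined feature is an OpenTox::FminerSmarts record with smarts, p_value and effect,
# as created in the method above.
feature_dataset.features.each do |f|
  puts [f.smarts, f.p_value, f.effect].join("\t")
end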
data/lib/descriptor.rb
DELETED
@@ -1,247 +0,0 @@
-require 'digest/md5'
-ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
-# TODO store descriptors in mongodb
-
-module OpenTox
-
-  module Algorithm
-
-    # Class for descriptor calculations
-    class Descriptor
-      include OpenTox
-
-      JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
-      CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
-      JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
-      LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
-      JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
-
-      obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
-      OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
-        name,description = d.split(/\s+/,2)
-        ["Openbabel."+name,description] unless obexclude.include? name
-      end.compact.sort{|a,b| a[0] <=> b[0]}]
-
-      cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
-      CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
-      CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
-
-      # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
-      joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
-      # strip Joelib messages from stdout
-      JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
-        name = d[:java_class].sub(/^joelib2.feature.types./,'')
-        # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
-        ["Joelib."+name, "no description available"] unless joelibexclude.include? name
-      end.compact.sort{|a,b| a[0] <=> b[0]}]
-
-      DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
-      DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
-
-      require_relative "unique_descriptors.rb"
-
-      # Description of available descriptors
-      def self.description descriptor
-        lib = descriptor.split('.').first
-        case lib
-        when "Openbabel"
-          OBDESCRIPTORS[descriptor]
-        when "Cdk"
-          name = descriptor.split('.')[0..-2].join('.')
-          CDKDESCRIPTORS[name]
-        when "Joelib"
-          JOELIBDESCRIPTORS[descriptor]
-        when "lookup"
-          "Read feature values from a dataset"
-        end
-      end
-
-      # Match an array of smarts features
-      def self.smarts_match compounds, smarts_features, count=false
-        bad_request_error "Compounds for smarts_match are empty" unless compounds
-        bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
-        parse compounds
-        @count = count
-        obconversion = OpenBabel::OBConversion.new
-        obmol = OpenBabel::OBMol.new
-        obconversion.set_in_format('smi')
-        smarts_pattern = OpenBabel::OBSmartsPattern.new
-        smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
-        @smarts = smarts_features.collect{|f| f.smarts}
-        @physchem_descriptors = nil
-        @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
-        @compounds.each_with_index do |compound,c|
-          obconversion.read_string(obmol,compound.smiles)
-          @smarts.each_with_index do |smart,s|
-            smarts_pattern.init(smart)
-            if smarts_pattern.match(obmol)
-              count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
-            else
-              value = 0
-            end
-            @data_entries[c][s] = value
-          end
-        end
-        serialize
-      end
-
-      # Count matches of an array with smarts features
-      def self.smarts_count compounds, smarts
-        # TODO: non-overlapping matches?
-        smarts_match compounds,smarts,true
-      end
-
-      # Calculate physchem descriptors
-      # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
-      def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
-        parse compounds
-        @data_entries = Array.new(@compounds.size){[]}
-        @descriptors = descriptors
-        @smarts = nil
-        @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
-        des = {}
-        @descriptors.each do |d|
-          lib, descriptor = d.split(".",2)
-          lib = lib.downcase.to_sym
-          des[lib] ||= []
-          des[lib] << descriptor
-        end
-        des.each do |lib,descriptors|
-          send(lib, descriptors)
-        end
-        serialize
-      end
-
-      def self.openbabel descriptors
-        $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
-        obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
-        obmol = OpenBabel::OBMol.new
-        obconversion = OpenBabel::OBConversion.new
-        obconversion.set_in_format 'smi'
-        last_feature_idx = @physchem_descriptors.size
-        @compounds.each_with_index do |compound,c|
-          obconversion.read_string obmol, compound.smiles
-          obdescriptors.each_with_index do |descriptor,d|
-            @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
-          end
-        end
-        @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
-      end
-
-      def self.java_descriptors descriptors, lib
-        $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds"
-        sdf = sdf_3d
-        # use java system call (rjb blocks within tasks)
-        # use Tempfiles to avoid "Argument list too long" error
-        case lib
-        when "cdk"
-          run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
-        when "joelib"
-          run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
-        end
-        last_feature_idx = @physchem_descriptors.size
-        YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
-          # TODO create warnings
-          #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
-          # CDK Descriptors may calculate multiple values, they are stored in separate features
-          @physchem_descriptors += calculation.keys if i == 0
-          calculation.keys.each_with_index do |name,j|
-            @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
-          end
-        end
-        FileUtils.rm "#{sdf}#{lib}.yaml"
-      end
-
-      def self.cdk descriptors
-        java_descriptors descriptors, "cdk"
-      end
-
-      def self.joelib descriptors
-        java_descriptors descriptors, "joelib"
-      end
-
-      def self.lookup compounds, features, dataset
-        parse compounds
-        fingerprint = []
-        compounds.each do |compound|
-          fingerprint << []
-          features.each do |feature|
-          end
-        end
-      end
-
-      def self.run_cmd cmd
-        cmd = "#{cmd} 2>&1"
-        $logger.debug "running external cmd: '#{cmd}'"
-        p = IO.popen(cmd) do |io|
-          while line = io.gets
-            $logger.debug "> #{line.chomp}"
-          end
-          io.close
-          raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
-        end
-      end
-
-      def self.sdf_3d
-        # TODO check if 3d sdfs are stored in GridFS
-        sdf = ""
-        @compounds.each do |compound|
-          sdf << compound.sdf
-        end
-        sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
-        File.open(sdf_file,"w+"){|f| f.print sdf}
-        sdf_file
-      end
-
-      def self.parse compounds
-        @input_class = compounds.class.to_s
-        case @input_class
-        when "OpenTox::Compound"
-          @compounds = [compounds]
-        when "Array"
-          @compounds = compounds
-        when "OpenTox::Dataset"
-          @compounds = compounds.compounds
-        else
-          bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
-        end
-      end
-
-      def self.serialize
-        @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
-        case @input_class
-        when "OpenTox::Compound"
-          @data_entries.first
-        when "Array"
-          @data_entries
-        when "OpenTox::Dataset"
-          dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
-          if @smarts
-            dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
-            @count ? algo = "count" : algo = "match"
-            dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
-
-          elsif @physchem_descriptors
-            dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
-            dataset.data_entries = @data_entries
-            dataset.feature_calculation_algorithm = "#{self}.physchem"
-            #TODO params?
-          end
-          dataset.save_all
-          dataset
-        end
-      end
-
-      def self.fix_value val
-        val = val.first if val.is_a? Array and val.size == 1
-        val = nil if val == "NaN"
-        if val.numeric?
-          val = Float(val)
-          val = nil if val.nan? or val.infinite?
-        end
-        val
-      end
-      private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
-    end
-  end
-end
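For context, the removed `Descriptor` class was called roughly as sketched below (a minimal sketch following the deleted tests; in 0.0.9 physico-chemical descriptors are handled by the new data/lib/physchem.rb instead). The descriptor names are illustrative OpenBabel keys from OBDESCRIPTORS, and `smarts_features` stands in for a collection of feature objects responding to #smarts, e.g. the FminerSmarts features mined above.

compound = OpenTox::Compound.from_smiles "c1ccccc1NN"

# Physico-chemical descriptors; with an Array input the method returns one row of
# values per compound ("Openbabel.logP" / "Openbabel.MW" are illustrative keys).
values = OpenTox::Algorithm::Descriptor.physchem [compound], ["Openbabel.logP", "Openbabel.MW"]

# Substructure features: presence (0/1) or hit counts for SMARTS feature objects.
presence = OpenTox::Algorithm::Descriptor.smarts_match [compound], smarts_features
counts   = OpenTox::Algorithm::Descriptor.smarts_count [compound], smarts_features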
data/lib/neighbor.rb
DELETED
@@ -1,25 +0,0 @@
-module OpenTox
-  module Algorithm
-    class Neighbor
-
-      def self.fingerprint_similarity compound, params={}
-        compound.neighbors params[:min_sim]
-      end
-
-      def self.fminer_similarity compound, params
-        feature_dataset = Dataset.find params[:feature_dataset_id]
-        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
-        neighbors = []
-
-        # find neighbors
-        feature_dataset.data_entries.each_with_index do |fingerprint, i|
-          sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
-          if sim > params[:min_sim]
-            neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
-          end
-        end
-        neighbors
-      end
-    end
-  end
-end
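The removed `Neighbor.fminer_similarity` collected neighbors above a similarity cut-off; a minimal usage sketch is shown below. The `feature_dataset` variable and the 0.3 threshold are illustrative, the former standing for a dataset produced by Fminer.bbrc.

neighbors = OpenTox::Algorithm::Neighbor.fminer_similarity(compound,
  :feature_dataset_id => feature_dataset.id,  # dataset of mined SMARTS features
  :min_sim => 0.3)                            # illustrative similarity cut-off
neighbors.each { |compound_id, sim| puts "#{compound_id}\t#{sim.round(3)}" }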
data/lib/similarity.rb
DELETED
@@ -1,58 +0,0 @@
-=begin
-* Name: similarity.rb
-* Description: Similarity algorithms
-* Author: Andreas Maunz <andreas@maunz.de
-* Date: 10/2012
-=end
-
-module OpenTox
-  module Algorithm
-
-    class Similarity
-
-      #TODO weighted tanimoto
-
-      # Tanimoto similarity
-      # @param [Array] a fingerprints of first compound
-      # @param [Array] b fingerprints of second compound
-      # @return [Float] Tanimoto similarity
-      def self.tanimoto(a,b)
-        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
-        #common = 0.0
-        #a.each_with_index do |n,i|
-          #common += 1 if n == b[i]
-        #end
-        #common/a.size
-        # TODO check if calculation speed can be improved
-        common_p_sum = 0.0
-        all_p_sum = 0.0
-        (0...a.size).each { |idx|
-          common_p_sum += [ a[idx], b[idx] ].min
-          all_p_sum += [ a[idx], b[idx] ].max
-        }
-        common_p_sum/all_p_sum
-      end
-
-
-      # Cosine similarity
-      # @param [Array] a fingerprints of first compound
-      # @param [Array] b fingerprints of second compound
-      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
-      def self.cosine(a, b)
-        val = 0.0
-        if a.size>0 and b.size>0
-          if a.size>12 && b.size>12
-            a = a[0..11]
-            b = b[0..11]
-          end
-          a_vec = a.to_gv
-          b_vec = b.to_gv
-          val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
-        end
-        val
-      end
-
-    end
-
-  end
-end
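The tanimoto method above generalises the set-based Tanimoto index to count fingerprints: the sum of element-wise minima divided by the sum of element-wise maxima (for 0/1 fingerprints this reduces to |A ∩ B| / |A ∪ B|). A self-contained sketch of the same formula, written as a hypothetical standalone helper rather than the gem's own API:

# Weighted Tanimoto: sum(min(a_i, b_i)) / sum(max(a_i, b_i))
def weighted_tanimoto(a, b)
  raise ArgumentError, "fingerprints don't have equal size" unless a.size == b.size
  mins = 0.0
  maxs = 0.0
  a.each_index do |i|
    mins += [a[i], b[i]].min
    maxs += [a[i], b[i]].max
  end
  maxs.zero? ? 0.0 : mins / maxs
end

weighted_tanimoto([1, 0, 2, 1], [1, 1, 1, 0])  # => 0.4  ((1+0+1+0) / (1+1+2+1))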
data/mongoid.yml
DELETED
data/test/descriptor-long.rb
DELETED
@@ -1,26 +0,0 @@
-require_relative "setup.rb"
-class DescriptorLongTest < MiniTest::Test
-
-  def test_dataset_all
-    # TODO: improve CDK descriptor calculation speed or add timeout
-    skip "CDK descriptor calculation takes too long for some compounds"
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
-    d = OpenTox::Algorithm::Descriptor.physchem dataset
-    assert_equal dataset.compounds, d.compounds
-    assert_equal 332, d.features.size
-    assert_equal 332, d.data_entries.first.size
-    d.delete
-  end
-
-  def test_dataset_openbabel
-    # TODO: improve CDK descriptor calculation speed or add timeout
-    dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
-    d = Algorithm::Descriptor.physchem dataset, Algorithm::Descriptor::OBDESCRIPTORS.keys
-    assert_equal dataset.compounds, d.compounds
-    size = Algorithm::Descriptor::OBDESCRIPTORS.keys.size
-    assert_equal size, d.features.size
-    assert_equal size, d.data_entries.first.size
-    d.delete
-  end
-
-end
data/test/fminer-long.rb
DELETED
@@ -1,38 +0,0 @@
-require_relative "setup.rb"
-
-class FminerTest < MiniTest::Test
-
-  def test_fminer_multicell
-    #skip "multicell segfaults"
-    # TODO aborts, probably fminer
-    # or OpenBabel segfault
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
-    p feature_dataset.training_parameters
-    assert_equal dataset.compound_ids, feature_dataset.compound_ids
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_isscan
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
-    assert_equal feature_dataset.compounds.size, dataset.compounds.size
-    p feature_dataset.features.size
-    p feature_dataset.training_parameters
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_kazius
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
-    # TODO reactivate default settings
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
-    assert_equal feature_dataset.compounds.size, dataset.compounds.size
-    feature_dataset = Dataset.find feature_dataset.id
-    assert feature_dataset.data_entries.size, dataset.compounds.size
-    dataset.delete
-    feature_dataset.delete
-  end
-
-end
data/test/fminer.rb
DELETED
@@ -1,52 +0,0 @@
-require_relative "setup.rb"
-
-class FminerTest < MiniTest::Test
-
-  def test_fminer_bbrc
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    refute_nil dataset.id
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
-    feature_dataset = Dataset.find feature_dataset.id
-    assert_equal dataset.compounds.size, feature_dataset.compounds.size
-    # TODO: fminer calculates 62 instead of 54 features
-    # it is unclear which commit changed the numbers (occurs with old libraries/mongodb branch too
-    # modification of Compound to use smiles instead of inchis seems to have no effect
-    #assert_equal 54, feature_dataset.features.size
-    #assert_equal "C-C-C=C", feature_dataset.features.first.smarts
-    compounds = feature_dataset.compounds
-    smarts = feature_dataset.features
-    smarts.each do |smart|
-      assert smart.p_value.round(2) >= 0.95
-    end
-    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
-    feature_dataset.data_entries.each_with_index do |fingerprint,i|
-      assert_equal match[i], fingerprint
-    end
-
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_last
-    skip "last features have to be activated"
-    dataset = OpenTox::Dataset.new
-    dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
-    assert_equal dataset.compounds.size, feature_dataset.compounds.size
-    assert_equal 21, feature_dataset.features.size
-    assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
-
-    compounds = feature_dataset.compounds
-    smarts = feature_dataset.features.collect{|f| f.smarts}
-    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
-    compounds.each_with_index do |c,i|
-      smarts.each_with_index do |s,j|
-        assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
-      end
-    end
-
-    dataset.delete
-    feature_dataset.delete
-  end
-
-end
data/test/lazar-fminer.rb
DELETED
@@ -1,50 +0,0 @@
-require_relative "setup.rb"
-
-class LazarFminerTest < MiniTest::Test
-
-  def test_lazar_fminer
-    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
-    feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
-    assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
-    #TODO check fminer features, see fminer.rb
-    #assert_equal 54, feature_dataset.features.size
-    feature_dataset.data_entries.each do |e|
-      assert_equal e.size, feature_dataset.features.size
-    end
-    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
-
-    [ {
-      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
-      :prediction => "false",
-      :confidence => 0.25281385281385277,
-      :nr_neighbors => 11
-    },{
-      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
-      :prediction => "false",
-      :confidence => 0.3639589577089577,
-      :nr_neighbors => 14
-    }, {
-      :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
-      :prediction => "false",
-      :confidence => 0.5555555555555556,
-      :nr_neighbors => 1
-    }].each do |example|
-      prediction = model.predict example[:compound]
-
-      assert_equal example[:prediction], prediction[:value]
-      #assert_equal example[:confidence], prediction[:confidence]
-      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
-    end
-
-    # make a dataset prediction
-    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
-    prediction = model.predict compound_dataset
-    assert_equal compound_dataset.compounds, prediction.compounds
-
-    assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
-    assert_equal "measured", prediction.data_entries[14][1]
-    # cleanup
-    [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete}
-  end
-end