lazar 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
data/lib/bbrc.rb DELETED
@@ -1,165 +0,0 @@
1
- module OpenTox
2
- module Algorithm
3
- class Fminer
4
- TABLE_OF_ELEMENTS = [
5
- "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
6
-
7
- #
8
- # Run bbrc algorithm on dataset
9
- #
10
- # @param [OpenTox::Dataset] training dataset
11
- # @param [optional] parameters BBRC parameters, accepted parameters are
12
- # - min_frequency Minimum frequency (default 5)
13
- # - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
14
- # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
15
- # - min_chisq_significance Significance threshold (between 0 and 1)
16
- # - nr_hits Set to "true" to get hit count instead of presence
17
- # - get_target Set to "true" to obtain target variable as feature
18
- # @return [OpenTox::Dataset] Fminer Dataset
19
- def self.bbrc training_dataset, params={}
20
-
21
- time = Time.now
22
- bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
23
-
24
- prediction_feature = training_dataset.features.first
25
- if params[:min_frequency]
26
- minfreq = params[:min_frequency]
27
- else
28
- per_mil = 5 # value from latest version
29
- per_mil = 8 # as suggested below
30
- i = training_dataset.feature_ids.index prediction_feature.id
31
- nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
32
- minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
33
- minfreq = 2 unless minfreq > 2
34
- minfreq = minfreq.round
35
- end
36
-
37
- @bbrc ||= Bbrc::Bbrc.new
38
- @bbrc.Reset
39
- if prediction_feature.numeric
40
- @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
41
- else
42
- bad_request_error "No accept values for "\
43
- "dataset '#{training_dataset.id}' and "\
44
- "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
45
- value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
46
- end
47
- @bbrc.SetMinfreq(minfreq)
48
- @bbrc.SetType(1) if params[:feature_type] == "paths"
49
- @bbrc.SetBackbone(false) if params[:backbone] == "false"
50
- @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
51
- @bbrc.SetConsoleOut(false)
52
-
53
- params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false
54
- feature_dataset = FminerDataset.new(
55
- :training_dataset_id => training_dataset.id,
56
- :training_algorithm => "#{self.to_s}.bbrc",
57
- :training_feature_id => prediction_feature.id ,
58
- :training_parameters => {
59
- :min_frequency => minfreq,
60
- :nr_hits => nr_hits,
61
- :backbone => (params[:backbone] == false ? false : true)
62
- }
63
-
64
- )
65
- feature_dataset.compounds = training_dataset.compounds
66
-
67
- # add data
68
- training_dataset.compounds.each_with_index do |compound,i|
69
- act = value2act[training_dataset.data_entries[i].first]
70
- if act # TODO check if this works
71
- @bbrc.AddCompound(compound.smiles,i+1)
72
- @bbrc.AddActivity(act,i+1)
73
- end
74
- end
75
- #g_median=@fminer.all_activities.values.to_scale.median
76
-
77
- #task.progress 10
78
- #step_width = 80 / @bbrc.GetNoRootNodes().to_f
79
-
80
- $logger.debug "BBRC setup: #{Time.now-time}"
81
- time = Time.now
82
- ftime = 0
83
- itime = 0
84
- rtime = 0
85
-
86
- # run @bbrc
87
- (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
88
- results = @bbrc.MineRoot(j)
89
- results.each do |result|
90
- rt = Time.now
91
- f = YAML.load(result)[0]
92
- smarts = f.shift
93
- # convert fminer SMARTS representation into a more human readable format
94
- smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
95
- element = TABLE_OF_ELEMENTS[$1.to_i-1]
96
- $2 == "a" ? element.downcase : element
97
- end
98
- p_value = f.shift
99
- f.flatten!
100
- compound_idxs = f.collect{|e| e.first.first-1}
101
- # majority class
102
- effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode
103
-
104
- =begin
105
- if (!@bbrc.GetRegression)
106
- id_arrs = f[2..-1].flatten
107
- max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
108
- effect = max+1
109
- else #regression part
110
- id_arrs = f[2]
111
- # DV: effect calculation
112
- f_arr=Array.new
113
- f[2].each do |id|
114
- id=id.keys[0] # extract id from hit count hash
115
- f_arr.push(@fminer.all_activities[id])
116
- end
117
- f_median=f_arr.to_scale.median
118
- if g_median >= f_median
119
- effect = 'activating'
120
- else
121
- effect = 'deactivating'
122
- end
123
- end
124
- =end
125
- rtime += Time.now - rt
126
-
127
- ft = Time.now
128
- feature = OpenTox::FminerSmarts.find_or_create_by({
129
- "smarts" => smarts,
130
- "p_value" => p_value.to_f.abs.round(5),
131
- "effect" => effect,
132
- "dataset_id" => feature_dataset.id
133
- })
134
- feature_dataset.feature_ids << feature.id
135
- ftime += Time.now - ft
136
-
137
- it = Time.now
138
- f.each do |id_count_hash|
139
- id_count_hash.each do |id,count|
140
- nr_hits ? count = count.to_i : count = 1
141
- feature_dataset.data_entries[id-1] ||= []
142
- feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count
143
- end
144
- end
145
- itime += Time.now - it
146
-
147
- end
148
- end
149
-
150
- $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
151
- time = Time.now
152
-
153
- feature_dataset.fill_nil_with 0
154
-
155
- $logger.debug "Prepare save: #{Time.now-time}"
156
- time = Time.now
157
- feature_dataset.save_all
158
-
159
- $logger.debug "Save: #{Time.now-time}"
160
- feature_dataset
161
-
162
- end
163
- end
164
- end
165
- end
data/lib/descriptor.rb DELETED
@@ -1,247 +0,0 @@
1
- require 'digest/md5'
2
- ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
3
- # TODO store descriptors in mongodb
4
-
5
- module OpenTox
6
-
7
- module Algorithm
8
-
9
- # Class for descriptor calculations
10
- class Descriptor
11
- include OpenTox
12
-
13
- JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
14
- CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
15
- JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
16
- LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
17
- JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
18
-
19
- obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
20
- OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
21
- name,description = d.split(/\s+/,2)
22
- ["Openbabel."+name,description] unless obexclude.include? name
23
- end.compact.sort{|a,b| a[0] <=> b[0]}]
24
-
25
- cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
26
- CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
27
- CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
28
-
29
- # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
30
- joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
31
- # strip Joelib messages from stdout
32
- JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
33
- name = d[:java_class].sub(/^joelib2.feature.types./,'')
34
- # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
35
- ["Joelib."+name, "no description available"] unless joelibexclude.include? name
36
- end.compact.sort{|a,b| a[0] <=> b[0]}]
37
-
38
- DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
39
- DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
40
-
41
- require_relative "unique_descriptors.rb"
42
-
43
- # Description of available descriptors
44
- def self.description descriptor
45
- lib = descriptor.split('.').first
46
- case lib
47
- when "Openbabel"
48
- OBDESCRIPTORS[descriptor]
49
- when "Cdk"
50
- name = descriptor.split('.')[0..-2].join('.')
51
- CDKDESCRIPTORS[name]
52
- when "Joelib"
53
- JOELIBDESCRIPTORS[descriptor]
54
- when "lookup"
55
- "Read feature values from a dataset"
56
- end
57
- end
58
-
59
- # Match an array of smarts features
60
- def self.smarts_match compounds, smarts_features, count=false
61
- bad_request_error "Compounds for smarts_match are empty" unless compounds
62
- bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
63
- parse compounds
64
- @count = count
65
- obconversion = OpenBabel::OBConversion.new
66
- obmol = OpenBabel::OBMol.new
67
- obconversion.set_in_format('smi')
68
- smarts_pattern = OpenBabel::OBSmartsPattern.new
69
- smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
70
- @smarts = smarts_features.collect{|f| f.smarts}
71
- @physchem_descriptors = nil
72
- @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
73
- @compounds.each_with_index do |compound,c|
74
- obconversion.read_string(obmol,compound.smiles)
75
- @smarts.each_with_index do |smart,s|
76
- smarts_pattern.init(smart)
77
- if smarts_pattern.match(obmol)
78
- count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
79
- else
80
- value = 0
81
- end
82
- @data_entries[c][s] = value
83
- end
84
- end
85
- serialize
86
- end
87
-
88
- # Count matches of an array with smarts features
89
- def self.smarts_count compounds, smarts
90
- # TODO: non-overlapping matches?
91
- smarts_match compounds,smarts,true
92
- end
93
-
94
- # Calculate physchem descriptors
95
- # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
96
- def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
97
- parse compounds
98
- @data_entries = Array.new(@compounds.size){[]}
99
- @descriptors = descriptors
100
- @smarts = nil
101
- @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
102
- des = {}
103
- @descriptors.each do |d|
104
- lib, descriptor = d.split(".",2)
105
- lib = lib.downcase.to_sym
106
- des[lib] ||= []
107
- des[lib] << descriptor
108
- end
109
- des.each do |lib,descriptors|
110
- send(lib, descriptors)
111
- end
112
- serialize
113
- end
114
-
115
- def self.openbabel descriptors
116
- $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
117
- obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
118
- obmol = OpenBabel::OBMol.new
119
- obconversion = OpenBabel::OBConversion.new
120
- obconversion.set_in_format 'smi'
121
- last_feature_idx = @physchem_descriptors.size
122
- @compounds.each_with_index do |compound,c|
123
- obconversion.read_string obmol, compound.smiles
124
- obdescriptors.each_with_index do |descriptor,d|
125
- @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
126
- end
127
- end
128
- @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
129
- end
130
-
131
- def self.java_descriptors descriptors, lib
132
- $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds"
133
- sdf = sdf_3d
134
- # use java system call (rjb blocks within tasks)
135
- # use Tempfiles to avoid "Argument list too long" error
136
- case lib
137
- when "cdk"
138
- run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
139
- when "joelib"
140
- run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
141
- end
142
- last_feature_idx = @physchem_descriptors.size
143
- YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
144
- # TODO create warnings
145
- #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
146
- # CDK Descriptors may calculate multiple values, they are stored in separate features
147
- @physchem_descriptors += calculation.keys if i == 0
148
- calculation.keys.each_with_index do |name,j|
149
- @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
150
- end
151
- end
152
- FileUtils.rm "#{sdf}#{lib}.yaml"
153
- end
154
-
155
- def self.cdk descriptors
156
- java_descriptors descriptors, "cdk"
157
- end
158
-
159
- def self.joelib descriptors
160
- java_descriptors descriptors, "joelib"
161
- end
162
-
163
- def self.lookup compounds, features, dataset
164
- parse compounds
165
- fingerprint = []
166
- compounds.each do |compound|
167
- fingerprint << []
168
- features.each do |feature|
169
- end
170
- end
171
- end
172
-
173
- def self.run_cmd cmd
174
- cmd = "#{cmd} 2>&1"
175
- $logger.debug "running external cmd: '#{cmd}'"
176
- p = IO.popen(cmd) do |io|
177
- while line = io.gets
178
- $logger.debug "> #{line.chomp}"
179
- end
180
- io.close
181
- raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
182
- end
183
- end
184
-
185
- def self.sdf_3d
186
- # TODO check if 3d sdfs are stored in GridFS
187
- sdf = ""
188
- @compounds.each do |compound|
189
- sdf << compound.sdf
190
- end
191
- sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
192
- File.open(sdf_file,"w+"){|f| f.print sdf}
193
- sdf_file
194
- end
195
-
196
- def self.parse compounds
197
- @input_class = compounds.class.to_s
198
- case @input_class
199
- when "OpenTox::Compound"
200
- @compounds = [compounds]
201
- when "Array"
202
- @compounds = compounds
203
- when "OpenTox::Dataset"
204
- @compounds = compounds.compounds
205
- else
206
- bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
207
- end
208
- end
209
-
210
- def self.serialize
211
- @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
212
- case @input_class
213
- when "OpenTox::Compound"
214
- @data_entries.first
215
- when "Array"
216
- @data_entries
217
- when "OpenTox::Dataset"
218
- dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
219
- if @smarts
220
- dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
221
- @count ? algo = "count" : algo = "match"
222
- dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
223
-
224
- elsif @physchem_descriptors
225
- dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
226
- dataset.data_entries = @data_entries
227
- dataset.feature_calculation_algorithm = "#{self}.physchem"
228
- #TODO params?
229
- end
230
- dataset.save_all
231
- dataset
232
- end
233
- end
234
-
235
- def self.fix_value val
236
- val = val.first if val.is_a? Array and val.size == 1
237
- val = nil if val == "NaN"
238
- if val.numeric?
239
- val = Float(val)
240
- val = nil if val.nan? or val.infinite?
241
- end
242
- val
243
- end
244
- private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
245
- end
246
- end
247
- end
data/lib/neighbor.rb DELETED
@@ -1,25 +0,0 @@
1
- module OpenTox
2
- module Algorithm
3
- class Neighbor
4
-
5
- def self.fingerprint_similarity compound, params={}
6
- compound.neighbors params[:min_sim]
7
- end
8
-
9
- def self.fminer_similarity compound, params
10
- feature_dataset = Dataset.find params[:feature_dataset_id]
11
- query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
12
- neighbors = []
13
-
14
- # find neighbors
15
- feature_dataset.data_entries.each_with_index do |fingerprint, i|
16
- sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
17
- if sim > params[:min_sim]
18
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
19
- end
20
- end
21
- neighbors
22
- end
23
- end
24
- end
25
- end
data/lib/similarity.rb DELETED
@@ -1,58 +0,0 @@
1
- =begin
2
- * Name: similarity.rb
3
- * Description: Similarity algorithms
4
- * Author: Andreas Maunz <andreas@maunz.de
5
- * Date: 10/2012
6
- =end
7
-
8
- module OpenTox
9
- module Algorithm
10
-
11
- class Similarity
12
-
13
- #TODO weighted tanimoto
14
-
15
- # Tanimoto similarity
16
- # @param [Array] a fingerprints of first compound
17
- # @param [Array] b fingerprints of second compound
18
- # @return [Float] Tanimoto similarity
19
- def self.tanimoto(a,b)
20
- bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
21
- #common = 0.0
22
- #a.each_with_index do |n,i|
23
- #common += 1 if n == b[i]
24
- #end
25
- #common/a.size
26
- # TODO check if calculation speed can be improved
27
- common_p_sum = 0.0
28
- all_p_sum = 0.0
29
- (0...a.size).each { |idx|
30
- common_p_sum += [ a[idx], b[idx] ].min
31
- all_p_sum += [ a[idx], b[idx] ].max
32
- }
33
- common_p_sum/all_p_sum
34
- end
35
-
36
-
37
- # Cosine similarity
38
- # @param [Array] a fingerprints of first compound
39
- # @param [Array] b fingerprints of second compound
40
- # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
41
- def self.cosine(a, b)
42
- val = 0.0
43
- if a.size>0 and b.size>0
44
- if a.size>12 && b.size>12
45
- a = a[0..11]
46
- b = b[0..11]
47
- end
48
- a_vec = a.to_gv
49
- b_vec = b.to_gv
50
- val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
51
- end
52
- val
53
- end
54
-
55
- end
56
-
57
- end
58
- end
data/mongoid.yml DELETED
@@ -1,8 +0,0 @@
1
- development:
2
- clients:
3
- default:
4
- database: opentox
5
- hosts:
6
- - localhost:27017
7
- options:
8
- raise_not_found_error: false
data/test/descriptor-long.rb DELETED
@@ -1,26 +0,0 @@
1
- require_relative "setup.rb"
2
- class DescriptorLongTest < MiniTest::Test
3
-
4
- def test_dataset_all
5
- # TODO: improve CDK descriptor calculation speed or add timeout
6
- skip "CDK descriptor calculation takes too long for some compounds"
7
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
8
- d = OpenTox::Algorithm::Descriptor.physchem dataset
9
- assert_equal dataset.compounds, d.compounds
10
- assert_equal 332, d.features.size
11
- assert_equal 332, d.data_entries.first.size
12
- d.delete
13
- end
14
-
15
- def test_dataset_openbabel
16
- # TODO: improve CDK descriptor calculation speed or add timeout
17
- dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
18
- d = Algorithm::Descriptor.physchem dataset, Algorithm::Descriptor::OBDESCRIPTORS.keys
19
- assert_equal dataset.compounds, d.compounds
20
- size = Algorithm::Descriptor::OBDESCRIPTORS.keys.size
21
- assert_equal size, d.features.size
22
- assert_equal size, d.data_entries.first.size
23
- d.delete
24
- end
25
-
26
- end
data/test/fminer-long.rb DELETED
@@ -1,38 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class FminerTest < MiniTest::Test
4
-
5
- def test_fminer_multicell
6
- #skip "multicell segfaults"
7
- # TODO aborts, probably fminer
8
- # or OpenBabel segfault
9
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
10
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
11
- p feature_dataset.training_parameters
12
- assert_equal dataset.compound_ids, feature_dataset.compound_ids
13
- dataset.delete
14
- feature_dataset.delete
15
- end
16
-
17
- def test_fminer_isscan
18
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
19
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
20
- assert_equal feature_dataset.compounds.size, dataset.compounds.size
21
- p feature_dataset.features.size
22
- p feature_dataset.training_parameters
23
- dataset.delete
24
- feature_dataset.delete
25
- end
26
-
27
- def test_fminer_kazius
28
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
29
- # TODO reactivate default settings
30
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
31
- assert_equal feature_dataset.compounds.size, dataset.compounds.size
32
- feature_dataset = Dataset.find feature_dataset.id
33
- assert feature_dataset.data_entries.size, dataset.compounds.size
34
- dataset.delete
35
- feature_dataset.delete
36
- end
37
-
38
- end
data/test/fminer.rb DELETED
@@ -1,52 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class FminerTest < MiniTest::Test
4
-
5
- def test_fminer_bbrc
6
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
- refute_nil dataset.id
8
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
9
- feature_dataset = Dataset.find feature_dataset.id
10
- assert_equal dataset.compounds.size, feature_dataset.compounds.size
11
- # TODO: fminer calculates 62 instead of 54 features
12
- # it is unclear which commit changed the numbers (occurs with old libraries/mongodb branch too
13
- # modification of Compound to use smiles instead of inchis seems to have no effect
14
- #assert_equal 54, feature_dataset.features.size
15
- #assert_equal "C-C-C=C", feature_dataset.features.first.smarts
16
- compounds = feature_dataset.compounds
17
- smarts = feature_dataset.features
18
- smarts.each do |smart|
19
- assert smart.p_value.round(2) >= 0.95
20
- end
21
- match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
22
- feature_dataset.data_entries.each_with_index do |fingerprint,i|
23
- assert_equal match[i], fingerprint
24
- end
25
-
26
- dataset.delete
27
- feature_dataset.delete
28
- end
29
-
30
- def test_fminer_last
31
- skip "last features have to be activated"
32
- dataset = OpenTox::Dataset.new
33
- dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
34
- feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
35
- assert_equal dataset.compounds.size, feature_dataset.compounds.size
36
- assert_equal 21, feature_dataset.features.size
37
- assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
38
-
39
- compounds = feature_dataset.compounds
40
- smarts = feature_dataset.features.collect{|f| f.smarts}
41
- match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
42
- compounds.each_with_index do |c,i|
43
- smarts.each_with_index do |s,j|
44
- assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
45
- end
46
- end
47
-
48
- dataset.delete
49
- feature_dataset.delete
50
- end
51
-
52
- end
data/test/lazar-fminer.rb DELETED
@@ -1,50 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class LazarFminerTest < MiniTest::Test
4
-
5
- def test_lazar_fminer
6
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
- model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
8
- feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
9
- assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
10
- #TODO check fminer features, see fminer.rb
11
- #assert_equal 54, feature_dataset.features.size
12
- feature_dataset.data_entries.each do |e|
13
- assert_equal e.size, feature_dataset.features.size
14
- end
15
- #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
16
-
17
- [ {
18
- :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
19
- :prediction => "false",
20
- :confidence => 0.25281385281385277,
21
- :nr_neighbors => 11
22
- },{
23
- :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
24
- :prediction => "false",
25
- :confidence => 0.3639589577089577,
26
- :nr_neighbors => 14
27
- }, {
28
- :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
29
- :prediction => "false",
30
- :confidence => 0.5555555555555556,
31
- :nr_neighbors => 1
32
- }].each do |example|
33
- prediction = model.predict example[:compound]
34
-
35
- assert_equal example[:prediction], prediction[:value]
36
- #assert_equal example[:confidence], prediction[:confidence]
37
- #assert_equal example[:nr_neighbors], prediction[:neighbors].size
38
- end
39
-
40
- # make a dataset prediction
41
- compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
42
- prediction = model.predict compound_dataset
43
- assert_equal compound_dataset.compounds, prediction.compounds
44
-
45
- assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
46
- assert_equal "measured", prediction.data_entries[14][1]
47
- # cleanup
48
- [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete}
49
- end
50
- end