lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
data/lib/bbrc.rb DELETED
@@ -1,165 +0,0 @@
1
- module OpenTox
2
- module Algorithm
3
- class Fminer
4
- TABLE_OF_ELEMENTS = [
5
- "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
6
-
7
- #
8
- # Run bbrc algorithm on dataset
9
- #
10
- # @param [OpenTox::Dataset] training dataset
11
- # @param [optional] parameters BBRC parameters, accepted parameters are
12
- # - min_frequency Minimum frequency (default 5)
13
- # - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
14
- # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
15
- # - min_chisq_significance Significance threshold (between 0 and 1)
16
- # - nr_hits Set to "true" to get hit count instead of presence
17
- # - get_target Set to "true" to obtain target variable as feature
18
- # @return [OpenTox::Dataset] Fminer Dataset
19
- def self.bbrc training_dataset, params={}
20
-
21
- time = Time.now
22
- bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
23
-
24
- prediction_feature = training_dataset.features.first
25
- if params[:min_frequency]
26
- minfreq = params[:min_frequency]
27
- else
28
- per_mil = 5 # value from latest version
29
- per_mil = 8 # as suggested below
30
- i = training_dataset.feature_ids.index prediction_feature.id
31
- nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
32
- minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
33
- minfreq = 2 unless minfreq > 2
34
- minfreq = minfreq.round
35
- end
36
-
37
- @bbrc ||= Bbrc::Bbrc.new
38
- @bbrc.Reset
39
- if prediction_feature.numeric
40
- @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
41
- else
42
- bad_request_error "No accept values for "\
43
- "dataset '#{training_dataset.id}' and "\
44
- "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
45
- value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
46
- end
47
- @bbrc.SetMinfreq(minfreq)
48
- @bbrc.SetType(1) if params[:feature_type] == "paths"
49
- @bbrc.SetBackbone(false) if params[:backbone] == "false"
50
- @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
51
- @bbrc.SetConsoleOut(false)
52
-
53
- params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false
54
- feature_dataset = FminerDataset.new(
55
- :training_dataset_id => training_dataset.id,
56
- :training_algorithm => "#{self.to_s}.bbrc",
57
- :training_feature_id => prediction_feature.id ,
58
- :training_parameters => {
59
- :min_frequency => minfreq,
60
- :nr_hits => nr_hits,
61
- :backbone => (params[:backbone] == false ? false : true)
62
- }
63
-
64
- )
65
- feature_dataset.compounds = training_dataset.compounds
66
-
67
- # add data
68
- training_dataset.compounds.each_with_index do |compound,i|
69
- act = value2act[training_dataset.data_entries[i].first]
70
- if act # TODO check if this works
71
- @bbrc.AddCompound(compound.smiles,i+1)
72
- @bbrc.AddActivity(act,i+1)
73
- end
74
- end
75
- #g_median=@fminer.all_activities.values.to_scale.median
76
-
77
- #task.progress 10
78
- #step_width = 80 / @bbrc.GetNoRootNodes().to_f
79
-
80
- $logger.debug "BBRC setup: #{Time.now-time}"
81
- time = Time.now
82
- ftime = 0
83
- itime = 0
84
- rtime = 0
85
-
86
- # run @bbrc
87
- (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
88
- results = @bbrc.MineRoot(j)
89
- results.each do |result|
90
- rt = Time.now
91
- f = YAML.load(result)[0]
92
- smarts = f.shift
93
- # convert fminer SMARTS representation into a more human readable format
94
- smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
95
- element = TABLE_OF_ELEMENTS[$1.to_i-1]
96
- $2 == "a" ? element.downcase : element
97
- end
98
- p_value = f.shift
99
- f.flatten!
100
- compound_idxs = f.collect{|e| e.first.first-1}
101
- # majority class
102
- effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode
103
-
104
- =begin
105
- if (!@bbrc.GetRegression)
106
- id_arrs = f[2..-1].flatten
107
- max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
108
- effect = max+1
109
- else #regression part
110
- id_arrs = f[2]
111
- # DV: effect calculation
112
- f_arr=Array.new
113
- f[2].each do |id|
114
- id=id.keys[0] # extract id from hit count hash
115
- f_arr.push(@fminer.all_activities[id])
116
- end
117
- f_median=f_arr.to_scale.median
118
- if g_median >= f_median
119
- effect = 'activating'
120
- else
121
- effect = 'deactivating'
122
- end
123
- end
124
- =end
125
- rtime += Time.now - rt
126
-
127
- ft = Time.now
128
- feature = OpenTox::FminerSmarts.find_or_create_by({
129
- "smarts" => smarts,
130
- "p_value" => p_value.to_f.abs.round(5),
131
- "effect" => effect,
132
- "dataset_id" => feature_dataset.id
133
- })
134
- feature_dataset.feature_ids << feature.id
135
- ftime += Time.now - ft
136
-
137
- it = Time.now
138
- f.each do |id_count_hash|
139
- id_count_hash.each do |id,count|
140
- nr_hits ? count = count.to_i : count = 1
141
- feature_dataset.data_entries[id-1] ||= []
142
- feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count
143
- end
144
- end
145
- itime += Time.now - it
146
-
147
- end
148
- end
149
-
150
- $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
151
- time = Time.now
152
-
153
- feature_dataset.fill_nil_with 0
154
-
155
- $logger.debug "Prepare save: #{Time.now-time}"
156
- time = Time.now
157
- feature_dataset.save_all
158
-
159
- $logger.debug "Save: #{Time.now-time}"
160
- feature_dataset
161
-
162
- end
163
- end
164
- end
165
- end
data/lib/descriptor.rb DELETED
@@ -1,247 +0,0 @@
1
- require 'digest/md5'
2
- ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
3
- # TODO store descriptors in mongodb
4
-
5
- module OpenTox
6
-
7
- module Algorithm
8
-
9
- # Class for descriptor calculations
10
- class Descriptor
11
- include OpenTox
12
-
13
- JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
14
- CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
15
- JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
16
- LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
17
- JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
18
-
19
- obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
20
- OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
21
- name,description = d.split(/\s+/,2)
22
- ["Openbabel."+name,description] unless obexclude.include? name
23
- end.compact.sort{|a,b| a[0] <=> b[0]}]
24
-
25
- cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
26
- CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
27
- CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
28
-
29
- # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
30
- joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
31
- # strip Joelib messages from stdout
32
- JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
33
- name = d[:java_class].sub(/^joelib2.feature.types./,'')
34
- # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
35
- ["Joelib."+name, "no description available"] unless joelibexclude.include? name
36
- end.compact.sort{|a,b| a[0] <=> b[0]}]
37
-
38
- DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
39
- DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
40
-
41
- require_relative "unique_descriptors.rb"
42
-
43
- # Description of available descriptors
44
- def self.description descriptor
45
- lib = descriptor.split('.').first
46
- case lib
47
- when "Openbabel"
48
- OBDESCRIPTORS[descriptor]
49
- when "Cdk"
50
- name = descriptor.split('.')[0..-2].join('.')
51
- CDKDESCRIPTORS[name]
52
- when "Joelib"
53
- JOELIBDESCRIPTORS[descriptor]
54
- when "lookup"
55
- "Read feature values from a dataset"
56
- end
57
- end
58
-
59
- # Match an array of smarts features
60
- def self.smarts_match compounds, smarts_features, count=false
61
- bad_request_error "Compounds for smarts_match are empty" unless compounds
62
- bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
63
- parse compounds
64
- @count = count
65
- obconversion = OpenBabel::OBConversion.new
66
- obmol = OpenBabel::OBMol.new
67
- obconversion.set_in_format('smi')
68
- smarts_pattern = OpenBabel::OBSmartsPattern.new
69
- smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
70
- @smarts = smarts_features.collect{|f| f.smarts}
71
- @physchem_descriptors = nil
72
- @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
73
- @compounds.each_with_index do |compound,c|
74
- obconversion.read_string(obmol,compound.smiles)
75
- @smarts.each_with_index do |smart,s|
76
- smarts_pattern.init(smart)
77
- if smarts_pattern.match(obmol)
78
- count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
79
- else
80
- value = 0
81
- end
82
- @data_entries[c][s] = value
83
- end
84
- end
85
- serialize
86
- end
87
-
88
- # Count matches of an array with smarts features
89
- def self.smarts_count compounds, smarts
90
- # TODO: non-overlapping matches?
91
- smarts_match compounds,smarts,true
92
- end
93
-
94
- # Calculate physchem descriptors
95
- # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
96
- def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
97
- parse compounds
98
- @data_entries = Array.new(@compounds.size){[]}
99
- @descriptors = descriptors
100
- @smarts = nil
101
- @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
102
- des = {}
103
- @descriptors.each do |d|
104
- lib, descriptor = d.split(".",2)
105
- lib = lib.downcase.to_sym
106
- des[lib] ||= []
107
- des[lib] << descriptor
108
- end
109
- des.each do |lib,descriptors|
110
- send(lib, descriptors)
111
- end
112
- serialize
113
- end
114
-
115
- def self.openbabel descriptors
116
- $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
117
- obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
118
- obmol = OpenBabel::OBMol.new
119
- obconversion = OpenBabel::OBConversion.new
120
- obconversion.set_in_format 'smi'
121
- last_feature_idx = @physchem_descriptors.size
122
- @compounds.each_with_index do |compound,c|
123
- obconversion.read_string obmol, compound.smiles
124
- obdescriptors.each_with_index do |descriptor,d|
125
- @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
126
- end
127
- end
128
- @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
129
- end
130
-
131
- def self.java_descriptors descriptors, lib
132
- $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds"
133
- sdf = sdf_3d
134
- # use java system call (rjb blocks within tasks)
135
- # use Tempfiles to avoid "Argument list too long" error
136
- case lib
137
- when "cdk"
138
- run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
139
- when "joelib"
140
- run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
141
- end
142
- last_feature_idx = @physchem_descriptors.size
143
- YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
144
- # TODO create warnings
145
- #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
146
- # CDK Descriptors may calculate multiple values, they are stored in separate features
147
- @physchem_descriptors += calculation.keys if i == 0
148
- calculation.keys.each_with_index do |name,j|
149
- @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
150
- end
151
- end
152
- FileUtils.rm "#{sdf}#{lib}.yaml"
153
- end
154
-
155
- def self.cdk descriptors
156
- java_descriptors descriptors, "cdk"
157
- end
158
-
159
- def self.joelib descriptors
160
- java_descriptors descriptors, "joelib"
161
- end
162
-
163
- def self.lookup compounds, features, dataset
164
- parse compounds
165
- fingerprint = []
166
- compounds.each do |compound|
167
- fingerprint << []
168
- features.each do |feature|
169
- end
170
- end
171
- end
172
-
173
- def self.run_cmd cmd
174
- cmd = "#{cmd} 2>&1"
175
- $logger.debug "running external cmd: '#{cmd}'"
176
- p = IO.popen(cmd) do |io|
177
- while line = io.gets
178
- $logger.debug "> #{line.chomp}"
179
- end
180
- io.close
181
- raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
182
- end
183
- end
184
-
185
- def self.sdf_3d
186
- # TODO check if 3d sdfs are stored in GridFS
187
- sdf = ""
188
- @compounds.each do |compound|
189
- sdf << compound.sdf
190
- end
191
- sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
192
- File.open(sdf_file,"w+"){|f| f.print sdf}
193
- sdf_file
194
- end
195
-
196
- def self.parse compounds
197
- @input_class = compounds.class.to_s
198
- case @input_class
199
- when "OpenTox::Compound"
200
- @compounds = [compounds]
201
- when "Array"
202
- @compounds = compounds
203
- when "OpenTox::Dataset"
204
- @compounds = compounds.compounds
205
- else
206
- bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
207
- end
208
- end
209
-
210
- def self.serialize
211
- @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
212
- case @input_class
213
- when "OpenTox::Compound"
214
- @data_entries.first
215
- when "Array"
216
- @data_entries
217
- when "OpenTox::Dataset"
218
- dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
219
- if @smarts
220
- dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
221
- @count ? algo = "count" : algo = "match"
222
- dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
223
-
224
- elsif @physchem_descriptors
225
- dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
226
- dataset.data_entries = @data_entries
227
- dataset.feature_calculation_algorithm = "#{self}.physchem"
228
- #TODO params?
229
- end
230
- dataset.save_all
231
- dataset
232
- end
233
- end
234
-
235
- def self.fix_value val
236
- val = val.first if val.is_a? Array and val.size == 1
237
- val = nil if val == "NaN"
238
- if val.numeric?
239
- val = Float(val)
240
- val = nil if val.nan? or val.infinite?
241
- end
242
- val
243
- end
244
- private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
245
- end
246
- end
247
- end
data/lib/neighbor.rb DELETED
@@ -1,25 +0,0 @@
1
- module OpenTox
2
- module Algorithm
3
- class Neighbor
4
-
5
- def self.fingerprint_similarity compound, params={}
6
- compound.neighbors params[:min_sim]
7
- end
8
-
9
- def self.fminer_similarity compound, params
10
- feature_dataset = Dataset.find params[:feature_dataset_id]
11
- query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
12
- neighbors = []
13
-
14
- # find neighbors
15
- feature_dataset.data_entries.each_with_index do |fingerprint, i|
16
- sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
17
- if sim > params[:min_sim]
18
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
19
- end
20
- end
21
- neighbors
22
- end
23
- end
24
- end
25
- end
data/lib/similarity.rb DELETED
@@ -1,58 +0,0 @@
1
- =begin
2
- * Name: similarity.rb
3
- * Description: Similarity algorithms
4
- * Author: Andreas Maunz <andreas@maunz.de
5
- * Date: 10/2012
6
- =end
7
-
8
- module OpenTox
9
- module Algorithm
10
-
11
- class Similarity
12
-
13
- #TODO weighted tanimoto
14
-
15
- # Tanimoto similarity
16
- # @param [Array] a fingerprints of first compound
17
- # @param [Array] b fingerprints of second compound
18
- # @return [Float] Tanimoto similarity
19
- def self.tanimoto(a,b)
20
- bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
21
- #common = 0.0
22
- #a.each_with_index do |n,i|
23
- #common += 1 if n == b[i]
24
- #end
25
- #common/a.size
26
- # TODO check if calculation speed can be improved
27
- common_p_sum = 0.0
28
- all_p_sum = 0.0
29
- (0...a.size).each { |idx|
30
- common_p_sum += [ a[idx], b[idx] ].min
31
- all_p_sum += [ a[idx], b[idx] ].max
32
- }
33
- common_p_sum/all_p_sum
34
- end
35
-
36
-
37
- # Cosine similarity
38
- # @param [Array] a fingerprints of first compound
39
- # @param [Array] b fingerprints of second compound
40
- # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
41
- def self.cosine(a, b)
42
- val = 0.0
43
- if a.size>0 and b.size>0
44
- if a.size>12 && b.size>12
45
- a = a[0..11]
46
- b = b[0..11]
47
- end
48
- a_vec = a.to_gv
49
- b_vec = b.to_gv
50
- val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
51
- end
52
- val
53
- end
54
-
55
- end
56
-
57
- end
58
- end
data/mongoid.yml DELETED
@@ -1,8 +0,0 @@
1
- development:
2
- clients:
3
- default:
4
- database: opentox
5
- hosts:
6
- - localhost:27017
7
- options:
8
- raise_not_found_error: false
@@ -1,26 +0,0 @@
1
- require_relative "setup.rb"
2
- class DescriptorLongTest < MiniTest::Test
3
-
4
- def test_dataset_all
5
- # TODO: improve CDK descriptor calculation speed or add timeout
6
- skip "CDK descriptor calculation takes too long for some compounds"
7
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
8
- d = OpenTox::Algorithm::Descriptor.physchem dataset
9
- assert_equal dataset.compounds, d.compounds
10
- assert_equal 332, d.features.size
11
- assert_equal 332, d.data_entries.first.size
12
- d.delete
13
- end
14
-
15
- def test_dataset_openbabel
16
- # TODO: improve CDK descriptor calculation speed or add timeout
17
- dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
18
- d = Algorithm::Descriptor.physchem dataset, Algorithm::Descriptor::OBDESCRIPTORS.keys
19
- assert_equal dataset.compounds, d.compounds
20
- size = Algorithm::Descriptor::OBDESCRIPTORS.keys.size
21
- assert_equal size, d.features.size
22
- assert_equal size, d.data_entries.first.size
23
- d.delete
24
- end
25
-
26
- end
data/test/fminer-long.rb DELETED
@@ -1,38 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class FminerTest < MiniTest::Test
4
-
5
- def test_fminer_multicell
6
- #skip "multicell segfaults"
7
- # TODO aborts, probably fminer
8
- # or OpenBabel segfault
9
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
10
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
11
- p feature_dataset.training_parameters
12
- assert_equal dataset.compound_ids, feature_dataset.compound_ids
13
- dataset.delete
14
- feature_dataset.delete
15
- end
16
-
17
- def test_fminer_isscan
18
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
19
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
20
- assert_equal feature_dataset.compounds.size, dataset.compounds.size
21
- p feature_dataset.features.size
22
- p feature_dataset.training_parameters
23
- dataset.delete
24
- feature_dataset.delete
25
- end
26
-
27
- def test_fminer_kazius
28
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
29
- # TODO reactivate default settings
30
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
31
- assert_equal feature_dataset.compounds.size, dataset.compounds.size
32
- feature_dataset = Dataset.find feature_dataset.id
33
- assert feature_dataset.data_entries.size, dataset.compounds.size
34
- dataset.delete
35
- feature_dataset.delete
36
- end
37
-
38
- end
data/test/fminer.rb DELETED
@@ -1,52 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class FminerTest < MiniTest::Test
4
-
5
- def test_fminer_bbrc
6
- dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
- refute_nil dataset.id
8
- feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
9
- feature_dataset = Dataset.find feature_dataset.id
10
- assert_equal dataset.compounds.size, feature_dataset.compounds.size
11
- # TODO: fminer calculates 62 instead of 54 features
12
- # it is unclear which commit changed the numbers (occurs with old libraries/mongodb branch too
13
- # modification of Compound to use smiles instead of inchis seems to have no effect
14
- #assert_equal 54, feature_dataset.features.size
15
- #assert_equal "C-C-C=C", feature_dataset.features.first.smarts
16
- compounds = feature_dataset.compounds
17
- smarts = feature_dataset.features
18
- smarts.each do |smart|
19
- assert smart.p_value.round(2) >= 0.95
20
- end
21
- match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
22
- feature_dataset.data_entries.each_with_index do |fingerprint,i|
23
- assert_equal match[i], fingerprint
24
- end
25
-
26
- dataset.delete
27
- feature_dataset.delete
28
- end
29
-
30
- def test_fminer_last
31
- skip "last features have to be activated"
32
- dataset = OpenTox::Dataset.new
33
- dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
34
- feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
35
- assert_equal dataset.compounds.size, feature_dataset.compounds.size
36
- assert_equal 21, feature_dataset.features.size
37
- assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
38
-
39
- compounds = feature_dataset.compounds
40
- smarts = feature_dataset.features.collect{|f| f.smarts}
41
- match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
42
- compounds.each_with_index do |c,i|
43
- smarts.each_with_index do |s,j|
44
- assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
45
- end
46
- end
47
-
48
- dataset.delete
49
- feature_dataset.delete
50
- end
51
-
52
- end
data/test/lazar-fminer.rb DELETED
@@ -1,50 +0,0 @@
1
- require_relative "setup.rb"
2
-
3
- class LazarFminerTest < MiniTest::Test
4
-
5
- def test_lazar_fminer
6
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
- model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
8
- feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
9
- assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
10
- #TODO check fminer features, see fminer.rb
11
- #assert_equal 54, feature_dataset.features.size
12
- feature_dataset.data_entries.each do |e|
13
- assert_equal e.size, feature_dataset.features.size
14
- end
15
- #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
16
-
17
- [ {
18
- :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
19
- :prediction => "false",
20
- :confidence => 0.25281385281385277,
21
- :nr_neighbors => 11
22
- },{
23
- :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
24
- :prediction => "false",
25
- :confidence => 0.3639589577089577,
26
- :nr_neighbors => 14
27
- }, {
28
- :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
29
- :prediction => "false",
30
- :confidence => 0.5555555555555556,
31
- :nr_neighbors => 1
32
- }].each do |example|
33
- prediction = model.predict example[:compound]
34
-
35
- assert_equal example[:prediction], prediction[:value]
36
- #assert_equal example[:confidence], prediction[:confidence]
37
- #assert_equal example[:nr_neighbors], prediction[:neighbors].size
38
- end
39
-
40
- # make a dataset prediction
41
- compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
42
- prediction = model.predict compound_dataset
43
- assert_equal compound_dataset.compounds, prediction.compounds
44
-
45
- assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
46
- assert_equal "measured", prediction.data_entries[14][1]
47
- # cleanup
48
- [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete}
49
- end
50
- end