lazar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/descriptor.rb ADDED
@@ -0,0 +1,247 @@
1
+ require 'digest/md5'
2
+ ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
3
+ # TODO store descriptors in mongodb
4
+
5
+ module OpenTox
6
+
7
+ module Algorithm
8
+
9
+ # Class for descriptor calculations
10
+ class Descriptor
11
+ include OpenTox
12
+
13
+ JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
14
+ CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
15
+ JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
16
+ LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
17
+ JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
18
+
19
+ obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
20
+ OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
21
+ name,description = d.split(/\s+/,2)
22
+ ["Openbabel."+name,description] unless obexclude.include? name
23
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
24
+
25
+ cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
26
+ CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
27
+ CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
28
+
29
+ # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
30
+ joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
31
+ # strip Joelib messages from stdout
32
+ JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
33
+ name = d[:java_class].sub(/^joelib2.feature.types./,'')
34
+ # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
35
+ ["Joelib."+name, "no description available"] unless joelibexclude.include? name
36
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
37
+
38
+ DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
39
+ DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
40
+
41
+ require_relative "unique_descriptors.rb"
42
+
43
+ # Description of available descriptors
44
# Look up the description text for a descriptor name.
#
# @param descriptor [String] name prefixed with its library ("Openbabel.", "Cdk.", "Joelib.") or the literal "lookup"
# @return [String, nil] the description, or nil when the library prefix or the descriptor is unknown
def self.description(descriptor)
  parts = descriptor.split('.')
  library = parts.first
  if library == "Openbabel"
    OBDESCRIPTORS[descriptor]
  elsif library == "Cdk"
    # CDK value names end in ".<value name>"; descriptions are keyed by the name without that last component
    CDKDESCRIPTORS[parts[0..-2].join('.')]
  elsif library == "Joelib"
    JOELIBDESCRIPTORS[descriptor]
  elsif library == "lookup"
    "Read feature values from a dataset"
  end
end
58
+
59
+ # Match an array of smarts features
60
# Match every compound against every SMARTS feature.
#
# @param compounds a compound, an array of compounds or a dataset (dispatched by parse)
# @param smarts_features a single Feature or a collection of objects responding to #smarts
# @param count [Boolean] when true store the number of matches instead of 1
# @return the value produced by serialize (its shape depends on the input class)
def self.smarts_match(compounds, smarts_features, count = false)
  bad_request_error "Compounds for smarts_match are empty" unless compounds
  bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
  parse compounds
  @count = count
  @physchem_descriptors = nil
  smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
  @smarts = smarts_features.collect { |feature| feature.smarts }

  converter = OpenBabel::OBConversion.new
  converter.set_in_format('smi')
  molecule = OpenBabel::OBMol.new
  pattern = OpenBabel::OBSmartsPattern.new

  # one row per compound, one column per SMARTS pattern
  @data_entries = Array.new(@compounds.size) { Array.new(@smarts.size, false) }
  @compounds.each_with_index do |compound, row|
    converter.read_string(molecule, compound.smiles)
    @smarts.each_with_index do |smart, col|
      pattern.init(smart)
      @data_entries[row][col] =
        if !pattern.match(molecule)
          0
        elsif count
          pattern.get_map_list.to_a.size
        else
          1
        end
    end
  end
  serialize
end
87
+
88
+ # Count matches of an array with smarts features
89
# Count SMARTS matches: delegates to smarts_match with counting enabled.
#
# @param compounds input accepted by smarts_match
# @param smarts SMARTS feature(s) accepted by smarts_match
# @return whatever smarts_match returns
def self.smarts_count(compounds, smarts)
  # TODO: non-overlapping matches?
  smarts_match(compounds, smarts, true)
end
93
+
94
+ # Calculate physchem descriptors
95
+ # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
96
# Calculate physchem descriptors for the given input.
#
# Descriptor names are grouped by their library prefix (the part before the
# first "."), and each group is handed to the class method named after the
# lower-cased prefix.
#
# @param compounds a compound, an array of compounds or a dataset (dispatched by parse)
# @param descriptors [Array<String>] names of the form "<Library>.<descriptor>"
# @return the value produced by serialize (its shape depends on the input class)
# @raise via bad_request_error when a descriptor names a library other than Openbabel, Cdk or Joelib
def self.physchem(compounds, descriptors = UNIQUEDESCRIPTORS)
  parse compounds
  @data_entries = Array.new(@compounds.size) { [] }
  @descriptors = descriptors
  @smarts = nil
  @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features

  by_library = {}
  @descriptors.each do |d|
    lib, descriptor = d.split(".", 2)
    (by_library[lib.downcase.to_sym] ||= []) << descriptor
  end

  # Only dispatch to the calculation methods; an arbitrary prefix must not be
  # able to reach other class methods through send.
  unsupported = by_library.keys - [:openbabel, :cdk, :joelib]
  unless unsupported.empty?
    bad_request_error "Unsupported descriptor libraries: #{unsupported.join(', ')}"
  end

  by_library.each do |lib, names|
    send(lib, names)
  end
  serialize
end
114
+
115
# Compute OpenBabel descriptors for @compounds and append them to @data_entries.
#
# Values are written to the columns following those already listed in
# @physchem_descriptors, and the new feature names ("Openbabel.<name>") are
# appended to that list.
#
# @param descriptors [Array<String>] OpenBabel descriptor names without the library prefix
def self.openbabel(descriptors)
  $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
  calculators = descriptors.collect { |name| OpenBabel::OBDescriptor.find_type name }
  molecule = OpenBabel::OBMol.new
  reader = OpenBabel::OBConversion.new
  reader.set_in_format 'smi'
  offset = @physchem_descriptors.size
  @compounds.each_with_index do |compound, row|
    reader.read_string molecule, compound.smiles
    calculators.each_with_index do |calculator, col|
      @data_entries[row][col + offset] = fix_value(calculator.predict(molecule))
    end
  end
  @physchem_descriptors += descriptors.collect { |name| "Openbabel.#{name}" }
end
130
+
131
# Compute descriptors with an external Java program (CDK or JOELib).
#
# The compounds are written to a temporary SDF file (sdf_3d), the Java class is
# run on it, and the YAML file it leaves next to the SDF ("<sdf><lib>.yaml") is
# read back into @data_entries / @physchem_descriptors.
# Both temporary files are removed afterwards, also when the command or the
# parsing fails.
#
# @param descriptors [Array<String>] descriptor names passed on the command line
#   NOTE(review): names are interpolated into a shell command unescaped — confirm they never come from untrusted input
# @param lib [String] "cdk" or "joelib"
# @return unspecified; callers in this file ignore the result
def self.java_descriptors(descriptors, lib)
  $logger.debug "compute #{descriptors.size} #{lib} descriptors for #{@compounds.size} compounds"
  sdf = sdf_3d
  yaml_file = "#{sdf}#{lib}.yaml"
  begin
    # use java system call (rjb blocks within tasks)
    # use Tempfiles to avoid "Argument list too long" error
    case lib
    when "cdk"
      run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
    when "joelib"
      run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
    end
    last_feature_idx = @physchem_descriptors.size
    YAML.load_file(yaml_file).each_with_index do |calculation, i|
      # TODO create warnings
      #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
      # CDK Descriptors may calculate multiple values, they are stored in separate features
      @physchem_descriptors += calculation.keys if i == 0
      calculation.keys.each_with_index do |name, j|
        @data_entries[i][j + last_feature_idx] = fix_value(calculation[name])
      end
    end
  ensure
    # rm_f: the YAML file may not exist if the command failed
    FileUtils.rm_f [sdf, yaml_file]
  end
end
154
+
155
# Compute CDK descriptors; thin wrapper around java_descriptors.
#
# @param descriptors [Array<String>] descriptor names without the library prefix
def self.cdk(descriptors)
  java_descriptors(descriptors, "cdk")
end

# Compute JOELib descriptors; thin wrapper around java_descriptors.
#
# @param descriptors [Array<String>] descriptor names without the library prefix
def self.joelib(descriptors)
  java_descriptors(descriptors, "joelib")
end
162
+
163
# Read feature values from a dataset.
#
# NOTE(review): this looks unfinished — one empty row per compound is collected
# in a local that is never used or returned, and `dataset` is not read at all.
#
# @param compounds input accepted by parse; also iterated directly with #each
# @param features collection iterated with #each (no work is done per feature)
# @param dataset currently unused
# @return the result of compounds.each
def self.lookup(compounds, features, dataset)
  parse compounds
  fingerprint = []
  compounds.each do |_compound|
    row = []
    fingerprint << row
    features.each { |_feature| }
  end
end
172
+
173
# Run an external command through the shell, logging each output line at debug level.
#
# stderr is merged into stdout (" 2>&1" is appended to the command).
#
# @param cmd [String] shell command line
# @return [nil]
# @raise [RuntimeError] when the command exits with a non-zero status
def self.run_cmd(cmd)
  cmd = "#{cmd} 2>&1"
  $logger.debug "running external cmd: '#{cmd}'"
  # the block form closes the pipe and sets $? when the block returns
  IO.popen(cmd) do |io|
    io.each_line { |line| $logger.debug "> #{line.chomp}" }
  end
  raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.success?
end
184
+
185
# Write the SDF records of all @compounds, concatenated in order, to a new file under /tmp.
#
# @return [String] path of the file that was written (named after a random UUID)
def self.sdf_3d
  # TODO check if 3d sdfs are stored in GridFS
  contents = @compounds.inject("") { |buffer, compound| buffer << compound.sdf }
  path = "/tmp/#{SecureRandom.uuid}.sdf"
  File.open(path, "w+") { |file| file.print contents }
  path
end
195
+
196
# Normalise the input of a descriptor calculation.
#
# Sets @compounds to an array-like collection of compounds and @input_class to
# one of "OpenTox::Compound", "Array" or "OpenTox::Dataset"; serialize uses
# that string to shape the result. Matching is done by type rather than by
# exact class name, so subclasses (e.g. a dataset subclass) are accepted too.
#
# @param compounds [OpenTox::Compound, Array, OpenTox::Dataset]
# @raise via bad_request_error for any other kind of object
def self.parse(compounds)
  case compounds
  when OpenTox::Compound
    @input_class = "OpenTox::Compound"
    @compounds = [compounds]
  when Array
    @input_class = "Array"
    @compounds = compounds
  when OpenTox::Dataset
    @input_class = "OpenTox::Dataset"
    @compounds = compounds.compounds
  else
    @input_class = compounds.class.to_s
    bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
  end
end
209
+
210
# Turn the computed @data_entries into the result matching the original input.
#
# All non-nil values are rounded to 5 decimals first. Depending on @input_class
# (set by parse) the result is
# * "OpenTox::Compound": the single row of values,
# * "Array": the full matrix (one row per compound),
# * "OpenTox::Dataset": a saved OpenTox::DescriptorDataset holding the feature
#   ids, the matrix and the name of the calculation algorithm.
#
# @return [Array, OpenTox::DescriptorDataset, nil] nil for any other @input_class
def self.serialize
  @data_entries.collect! { |row| row.collect { |v| v.round(5) unless v.nil? } }
  case @input_class
  when "OpenTox::Compound"
    @data_entries.first
  when "Array"
    @data_entries
  when "OpenTox::Dataset"
    dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect { |c| c.id })
    if @smarts
      dataset.feature_ids = @smarts.collect { |smart| Smarts.find_or_create_by(:smarts => smart).id }
      # previously only the physchem branch stored the values, so SMARTS
      # match/count results were lost for dataset input
      dataset.data_entries = @data_entries
      algo = @count ? "count" : "match"
      dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
    elsif @physchem_descriptors
      dataset.feature_ids = @physchem_descriptors.collect { |d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id }
      dataset.data_entries = @data_entries
      dataset.feature_calculation_algorithm = "#{self}.physchem"
      #TODO params?
    end
    dataset.save_all
    dataset
  end
end
234
+
235
# Normalise a raw descriptor value.
#
# A one-element array is unwrapped and the string "NaN" becomes nil. Values
# for which #numeric? is true are converted to Float, with NaN and infinite
# results mapped to nil; everything else is returned unchanged.
#
# @param val raw value as delivered by a descriptor library
# @return [Float, nil, Object]
def self.fix_value(val)
  val = val.first if val.is_a?(Array) && val.size == 1
  val = nil if val == "NaN"
  return val unless val.numeric?
  number = Float(val)
  (number.nan? || number.infinite?) ? nil : number
end
244
+ private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
245
+ end
246
+ end
247
+ end
data/lib/error.rb ADDED
@@ -0,0 +1,66 @@
1
# Mixin for exception classes: adds an HTTP status code, a trimmed backtrace
# and logs every instantiated error as pretty-printed JSON.
module OpenToxError
  attr_accessor :http_code, :message, :cause

  # @param message [#to_s, nil] error message; one leading and one trailing double quote are stripped
  def initialize(message = nil)
    message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes
    super message
    @http_code ||= 500 # keep a code set earlier by the including class
    @message = message.to_s
    @cause = cut_backtrace(caller)
    report = {
      :http_code => @http_code,
      :message => @message,
      :cause => @cause
    }
    $logger.error("\n" + JSON.pretty_generate(report))
  end

  # Drop the backtrace from the first sinatra/minitest line onwards.
  # The complete trace is returned when no such line exists or when it is the
  # very first line; non-array input is returned unchanged.
  #
  # @param trace [Array<String>, Object]
  # @return [Array<String>, Object]
  def cut_backtrace(trace)
    return trace unless trace.is_a?(Array)
    framework_line = trace.find_index { |line| line.match(/sinatra|minitest/) }
    last_kept = (framework_line || trace.size) - 1
    last_kept = trace.size - 1 if last_kept < 0
    trace[0..last_kept]
  end

end
29
+
30
+ class RuntimeError
31
+ include OpenToxError
32
+ end
33
+
34
+ # clutters log file with library errors
35
+ #class NoMethodError
36
+ #include OpenToxError
37
+ #end
38
+
39
+ module OpenTox
40
+
41
+ class Error < RuntimeError
42
+ include OpenToxError
43
+
44
+ def initialize(code, message=nil)
45
+ @http_code = code
46
+ super message
47
+ end
48
+ end
49
+
50
+ # OpenTox errors
51
+ RestClientWrapper.known_errors.each do |error|
52
+ # create error classes
53
+ c = Class.new Error do
54
+ define_method :initialize do |message=nil|
55
+ super error[:code], message
56
+ end
57
+ end
58
+ OpenTox.const_set error[:class],c
59
+
60
+ # define global methods for raising errors, eg. bad_request_error
61
+ Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
62
+ raise c.new(message)
63
+ end
64
+ end
65
+
66
+ end
data/lib/feature.rb ADDED
@@ -0,0 +1,97 @@
1
+ module OpenTox
2
+
3
+ # Basic feature class
4
+ class Feature
5
+ field :name, as: :title, type: String
6
+ field :nominal, type: Boolean
7
+ field :numeric, type: Boolean
8
+ field :measured, type: Boolean
9
+ end
10
+
11
+ # Feature for categorical variables
12
# Feature for categorical variables
class NominalFeature < Feature
  # TODO check if accept_values are still needed
  field :accept_values, type: Array

  # @param params attributes forwarded unchanged to the Feature constructor
  def initialize params
    super params
    # a bare `nominal = true` only creates a local variable; the field has to
    # be written through its setter
    self.nominal = true
  end
end
20
+
21
+ # Feature for quantitative variables
22
# Feature for quantitative variables
class NumericFeature < Feature
  # @param params attributes forwarded unchanged to the Feature constructor
  def initialize params
    super params
    # a bare `numeric = true` only creates a local variable; the field has to
    # be written through its setter
    self.numeric = true
  end
end
28
+
29
+ # Feature for SMARTS fragments
30
+ class Smarts < NominalFeature
31
+ field :smarts, type: String
32
+ def self.from_smarts smarts
33
+ self.find_or_create_by :smarts => smarts
34
+ end
35
+ end
36
+
37
+ # Feature for supervised fragments from Fminer algorithm
38
+ class FminerSmarts < Smarts
39
+ field :p_value, type: Float
40
+ # TODO check if effect is used
41
+ field :effect, type: String
42
+ field :dataset_id
43
+ end
44
+
45
+ # Feature for database fingerprints
46
+ # needs count for efficient retrieval (see compound.rb)
47
# Feature for database fingerprints
# needs count for efficient retrieval (see compound.rb)
class FingerprintSmarts < Smarts
  field :count, type: Integer

  # All fingerprint SMARTS features, cached in @@fp4.
  #
  # When the cached/stored collection does not hold exactly 306 entries it is
  # rebuilt from SMARTS_InteLigand.txt (one "name: smarts" pair per line; empty
  # lines, lines starting with "#" and lines without a smarts part are skipped).
  #
  # @return collection of OpenTox::FingerprintSmarts
  def self.fingerprint
    @@fp4 ||= OpenTox::FingerprintSmarts.all
    unless @@fp4.size == 306
      @@fp4 = []
      # OpenBabel FP4 fingerprints
      # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
      # TODO investigate other types of fingerprints (MACCS)
      # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
      # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
      # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
      # Morgan ECFP, FCFP
      # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
      # http://www.rdkit.org/docs/GettingStartedInPython.html
      # Chemfp
      # https://chemfp.readthedocs.org/en/latest/using-tools.html
      # CACTVS/PubChem

      # File.foreach closes the file when done (File.open(...).each left the handle open)
      File.foreach(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")) do |l|
        l.strip!
        unless l.empty? or l.match(/^#/)
          name,smarts = l.split(': ')
          @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
        end
      end
    end
    @@fp4
  end
end
77
+
78
+ # Feature for physico-chemical descriptors
79
+ class PhysChemDescriptor < NumericFeature
80
+ field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
81
+ field :parameters, type: Hash
82
+ field :creator, type: String
83
+ end
84
+
85
+ # Feature for categorical bioassay results
86
+ class NominalBioAssay < NominalFeature
87
+ # TODO: needed? move to dataset?
88
+ field :description, type: String
89
+ end
90
+
91
+ # Feature for quantitative bioassay results
92
+ class NumericBioAssay < NumericFeature
93
+ # TODO: needed? move to dataset?
94
+ field :description, type: String
95
+ end
96
+
97
+ end
@@ -0,0 +1,170 @@
1
+ module OpenTox
2
+
3
+ module Model
4
+
5
+ class Lazar
6
+ include OpenTox
7
+ include Mongoid::Document
8
+ include Mongoid::Timestamps
9
+ store_in collection: "models"
10
+
11
+ field :title, type: String
12
+ field :creator, type: String, default: __FILE__
13
+ # datasets
14
+ field :training_dataset_id, type: BSON::ObjectId
15
+ # algorithms
16
+ field :prediction_algorithm, type: String
17
+ field :neighbor_algorithm, type: String
18
+ field :neighbor_algorithm_parameters, type: Hash
19
+ # prediction feature
20
+ field :prediction_feature_id, type: BSON::ObjectId
21
+
22
+ attr_accessor :prediction_dataset
23
+ attr_accessor :training_dataset
24
+
25
+ # Create a lazar model from a training_dataset and a feature_dataset
26
+ # @param [OpenTox::Dataset] training_dataset
27
+ # @return [OpenTox::Model::Lazar] Regression or classification model
28
# Create and save a lazar model for a training dataset.
#
# The dataset must contain exactly one feature, which becomes the prediction
# feature. A classification model is created when that feature's `nominal`
# flag is truthy, a regression model otherwise.
#
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
# @raise via bad_request_error when the dataset does not have exactly one feature
def self.create training_dataset

  feature_count = training_dataset.features.size
  unless feature_count == 1
    # the former message claimed "more than one" even for datasets without any feature
    bad_request_error "Expected exactly one prediction feature in training_dataset #{training_dataset.id}, found #{feature_count}"
  end

  # TODO document convention
  prediction_feature = training_dataset.features.first
  model_class = prediction_feature.nominal ? OpenTox::Model::LazarClassification : OpenTox::Model::LazarRegression
  lazar = model_class.new
  lazar.training_dataset_id = training_dataset.id
  lazar.prediction_feature_id = prediction_feature.id
  lazar.title = prediction_feature.title

  lazar.save
  lazar
end
42
+
43
# Predict the model's prediction feature for one or more compounds.
#
# Compounds that have activities in the training dataset are reported with
# those values (confidence "measured"). For all others the neighbor algorithm
# is run, every neighbor gets its training activities appended (neighbors
# without activities are dropped) and the prediction algorithm is run on them.
#
# @param object [OpenTox::Compound, Array, OpenTox::Dataset]
# @return [Hash] for a compound: the prediction plus a :neighbors entry sorted by descending similarity
# @return [Array] for an array: one prediction per compound
# @return [LazarPrediction] for a dataset: a saved prediction dataset
# @raise via bad_request_error for any other kind of input
def predict object

  training_dataset = Dataset.find training_dataset_id
  prediction_feature = Feature.find prediction_feature_id

  # parse data
  compounds = []
  case object.class.to_s
  when "OpenTox::Compound"
    compounds = [object]
  when "Array"
    compounds = object
  when "OpenTox::Dataset"
    compounds = object.compounds
  else
    bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
  end

  # compound id => row numbers in the training dataset; built on first use so
  # that every neighbor lookup is a hash access instead of a scan over all
  # compound ids (assumes ids that are == also agree on eql?/hash)
  training_rows = nil
  training_entries = nil

  # make predictions
  predictions = []
  neighbors = []
  compounds.each do |compound|
    database_activities = training_dataset.values(compound,prediction_feature)
    if database_activities and !database_activities.empty?
      database_activities = database_activities.first if database_activities.size == 1
      predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
      next
    end
    neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
    if training_rows.nil?
      training_rows = {}
      training_dataset.compound_ids.each_with_index { |id, row| (training_rows[id] ||= []) << row }
      training_entries = training_dataset.data_entries
    end
    # add activities
    neighbors.collect! do |n|
      acts = training_rows.fetch(n.first, []).collect { |row| training_entries[row][0] }.compact
      acts.empty? ? nil : n << acts
    end
    neighbors.compact! # remove neighbors without training activities
    predictions << Algorithm.run(prediction_algorithm, neighbors)
  end

  # serialize result
  case object.class.to_s
  when "OpenTox::Compound"
    prediction = predictions.first
    prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
    return prediction
  when "Array"
    return predictions
  when "OpenTox::Dataset"
    # prepare prediction dataset
    prediction_dataset = LazarPrediction.new(
      :title => "Lazar prediction for #{prediction_feature.title}",
      :creator => __FILE__,
      :prediction_feature_id => prediction_feature.id

    )
    confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
    # TODO move into warnings field
    warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
    prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
    prediction_dataset.compounds = compounds
    prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
    prediction_dataset.save_all
    return prediction_dataset
  end

end
114
+
115
# Values of the prediction feature for every row of the training dataset.
#
# @return [Array] the entry in the prediction feature's column, one per data row
def training_activities
  column = training_dataset.feature_ids.index(prediction_feature_id)
  training_dataset.data_entries.collect { |row| row[column] }
end
119
+
120
+ end
121
+
122
+ class LazarClassification < Lazar
123
+ def initialize
124
+ super
125
+ self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
126
+ self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
127
+ self.neighbor_algorithm_parameters = {:min_sim => 0.7}
128
+ end
129
+ end
130
+
131
+ class LazarFminerClassification < LazarClassification
132
+
133
+ def self.create training_dataset
134
+ model = super(training_dataset)
135
+ model.update "_type" => self.to_s # adjust class
136
+ model = self.find model.id # adjust class
137
+ model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
138
+ model.neighbor_algorithm_parameters = {
139
+ :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
140
+ :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
141
+ :min_sim => 0.3
142
+ }
143
+ model.save
144
+ model
145
+ end
146
+ end
147
+
148
+ class LazarRegression < Lazar
149
+
150
+ def initialize
151
+ super
152
+ self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
153
+ self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
154
+ self.neighbor_algorithm_parameters = {:min_sim => 0.7}
155
+ end
156
+
157
+ end
158
+
159
+ class PredictionModel < Lazar
160
+ field :category, type: String
161
+ field :endpoint, type: String
162
+ field :unit, type: String
163
+ field :model_id, type: BSON::ObjectId
164
+ field :crossvalidation_id, type: BSON::ObjectId
165
+ end
166
+
167
+ end
168
+
169
+ end
170
+
data/lib/lazar.rb ADDED
@@ -0,0 +1,69 @@
1
+ require 'rubygems'
2
+ require "bundler/setup"
3
+ require "rest-client"
4
+ require 'yaml'
5
+ require 'json'
6
+ require 'logger'
7
+ require 'mongoid'
8
+ require 'rserve'
9
+ require "nokogiri"
10
+ require "base64"
11
+
12
+
13
+ # Mongo setup
14
+ # TODO retrieve correct environment from Rack/Sinatra
15
+ ENV["MONGOID_ENV"] ||= "development"
16
+ # TODO remove config files, change default via ENV or directly in Mongoid class
17
+ Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
18
+ # TODO get Mongo::Client from Mongoid
19
+ $mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
20
+ # TODO same for GridFS
21
+ $gridfs = $mongo.database.fs
22
+
23
+ # R setup
24
+ R = Rserve::Connection.new
25
+
26
+ # Logger setup
27
+ $logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
28
+ $logger.level = Logger::DEBUG
29
+ Mongo::Logger.logger = $logger
30
+ Mongo::Logger.level = Logger::WARN
31
+ #Mongoid.logger = $logger
32
+
33
+ # Require sub-Repositories
34
+ require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
35
+ require_relative '../libfminer/liblast/last' #
36
+ require_relative '../last-utils/lu.rb'
37
+ require_relative '../openbabel/lib/openbabel'
38
+
39
+ # Fminer environment variables
40
+ ENV['FMINER_SMARTS'] = 'true'
41
+ ENV['FMINER_NO_AROMATIC'] = 'true'
42
+ ENV['FMINER_PVALUES'] = 'true'
43
+ ENV['FMINER_SILENT'] = 'true'
44
+ ENV['FMINER_NR_HITS'] = 'true'
45
+
46
+ # OpenTox classes and includes
47
+ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
48
+
49
# Load the library files.
# Be aware of the require sequence as it affects class/method overwrites.
# ("descriptor.rb" used to be listed a second time after "algorithm.rb"; that
# entry was a no-op because require_relative loads a file only once.)
[
  "overwrite.rb",
  "rest-client-wrapper.rb",
  "error.rb",
  "opentox.rb",
  "feature.rb",
  "compound.rb",
  "dataset.rb",
  "descriptor.rb",
  "algorithm.rb",
  "bbrc.rb",
  "lazar-model.rb",
  "similarity.rb",
  "neighbor.rb",
  "classification.rb",
  "regression.rb",
  "validation.rb",
  "crossvalidation.rb",
].each{ |f| require_relative f }
69
+
data/lib/neighbor.rb ADDED
@@ -0,0 +1,25 @@
1
+ module OpenTox
2
+ module Algorithm
3
# Neighbor search strategies used by lazar models.
class Neighbor

  # Neighbors according to the compound's own fingerprint search.
  #
  # @param compound object responding to #neighbors
  # @param params [Hash] :min_sim is passed on to compound.neighbors
  # @return whatever compound.neighbors returns
  def self.fingerprint_similarity(compound, params = {})
    compound.neighbors(params[:min_sim])
  end

  # Neighbors according to Tanimoto similarity of SMARTS match fingerprints.
  #
  # @param compound query compound
  # @param params [Hash] :feature_dataset_id (dataset with the fingerprints) and :min_sim (exclusive lower bound)
  # @return [Array<Array>] [compound_id, similarity] pairs in dataset order
  def self.fminer_similarity(compound, params)
    feature_dataset = Dataset.find(params[:feature_dataset_id])
    query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)

    # find neighbors
    feature_dataset.data_entries.each_with_index.each_with_object([]) do |(fingerprint, i), found|
      similarity = Algorithm::Similarity.tanimoto(fingerprint, query_fingerprint)
      next unless similarity > params[:min_sim]
      # use compound_ids, instantiation of Compounds is too time consuming
      found << [feature_dataset.compound_ids[i], similarity]
    end
  end
end
24
+ end
25
+ end
data/lib/opentox.rb ADDED
@@ -0,0 +1,22 @@
1
+ module OpenTox
2
+
3
+ # Ruby interface
4
+
5
+ # create default OpenTox classes (defined in opentox-client.rb)
6
+ # provides Mongoid's query and persistence methods
7
+ # http://mongoid.org/en/mongoid/docs/persistence.html
8
+ # http://mongoid.org/en/mongoid/docs/querying.html
9
+ CLASSES.each do |klass|
10
+ c = Class.new do
11
+ include OpenTox
12
+ include Mongoid::Document
13
+ include Mongoid::Timestamps
14
+ store_in collection: klass.downcase.pluralize
15
+ field :title, as: :name, type: String
16
+
17
+ end
18
+ OpenTox.const_set klass,c
19
+ end
20
+
21
+ end
22
+