lazar 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/descriptor.rb ADDED
@@ -0,0 +1,247 @@
1
+ require 'digest/md5'
2
+ ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
3
+ # TODO store descriptors in mongodb
4
+
5
+ module OpenTox
6
+
7
+ module Algorithm
8
+
9
+ # Class for descriptor calculations
10
+ class Descriptor
11
+ include OpenTox
12
+
13
# Directory with the bundled Java helper classes and jars
JAVA_DIR = File.join(File.dirname(__FILE__), "..", "java")
# Last match of the cdk-*jar glob inside JAVA_DIR
CDK_JAR = Dir[File.join(JAVA_DIR, "cdk-*jar")].last
JOELIB_JAR = File.join(JAVA_DIR, "joelib2.jar")
LOG4J_JAR = File.join(JAVA_DIR, "log4j.jar")
JMOL_JAR = File.join(JAVA_DIR, "Jmol.jar")

# OpenBabel descriptors: "Openbabel.<name>" => description, sorted by key.
# Entries named in skipped_openbabel are left out.
skipped_openbabel = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
openbabel_listing = OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n")
openbabel_pairs = openbabel_listing.collect do |entry|
  name, description = entry.split(/\s+/, 2)
  skipped_openbabel.include?(name) ? nil : ["Openbabel."+name, description]
end
OBDESCRIPTORS = Hash[openbabel_pairs.compact.sort { |x, y| x[0] <=> y[0] }]

# CDK descriptors are read from the YAML printed by the CdkDescriptorInfo Java class.
cdk_info = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
cdk_prefix = lambda { |info| "Cdk."+info[:java_class].split('.').last.sub(/Descriptor/,'') }
cdk_pairs = cdk_info.collect { |info| [cdk_prefix.call(info), info[:description]] }
CDKDESCRIPTORS = Hash[cdk_pairs.sort { |x, y| x[0] <=> y[0] }]
# One entry per value name, because a single CDK descriptor may yield several values
CDKDESCRIPTOR_VALUES = cdk_info.collect do |info|
  prefix = cdk_prefix.call(info)
  info[:names].collect { |name| prefix+"."+name }
end.flatten

# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
skipped_joelib = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
# strip Joelib messages from stdout (sed drops everything up to the first '---' line)
joelib_info = YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`)
joelib_pairs = joelib_info.collect do |info|
  name = info[:java_class].sub(/^joelib2.feature.types./,'')
  # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
  skipped_joelib.include?(name) ? nil : ["Joelib."+name, "no description available"]
end
JOELIBDESCRIPTORS = Hash[joelib_pairs.compact.sort { |x, y| x[0] <=> y[0] }]

# All descriptors of the three libraries and the names of all values they produce
DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS).merge(JOELIBDESCRIPTORS)
DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys

require_relative "unique_descriptors.rb"
42
+
43
# Description of an available descriptor.
# The library is taken from the part of the name before the first dot.
# @param descriptor [String] descriptor name, e.g. "Openbabel.logP"
# @return [String, nil] the description, or nil for an unknown library or name
def self.description(descriptor)
  case descriptor.split('.').first
  when "Openbabel" then OBDESCRIPTORS[descriptor]
  # CDK value names carry one extra trailing component that is not part of the descriptor key
  when "Cdk" then CDKDESCRIPTORS[descriptor.split('.')[0..-2].join('.')]
  when "Joelib" then JOELIBDESCRIPTORS[descriptor]
  when "lookup" then "Read feature values from a dataset"
  end
end
58
+
59
# Match an array of smarts features against one or more compounds.
# @param compounds input accepted by parse (compound, array of compounds or dataset)
# @param smarts_features a single Feature or a collection of objects responding to #smarts
# @param count [Boolean] store the number of matches instead of 1 for a hit
# @return the value of serialize for the collected match matrix
def self.smarts_match(compounds, smarts_features, count=false)
  bad_request_error "Compounds for smarts_match are empty" unless compounds
  bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
  parse compounds
  @count = count
  obconversion = OpenBabel::OBConversion.new
  obmol = OpenBabel::OBMol.new
  obconversion.set_in_format('smi')
  smarts_pattern = OpenBabel::OBSmartsPattern.new
  smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
  @smarts = smarts_features.collect(&:smarts)
  # marks the result as a smarts (not physchem) calculation for serialize
  @physchem_descriptors = nil
  @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
  @compounds.each_with_index do |compound, row|
    obconversion.read_string(obmol, compound.smiles)
    @smarts.each_with_index do |smart, column|
      smarts_pattern.init(smart)
      @data_entries[row][column] =
        if !smarts_pattern.match(obmol)
          0
        elsif count
          smarts_pattern.get_map_list.to_a.size
        else
          1
        end
    end
  end
  serialize
end
87
+
88
# Count matches of an array with smarts features.
# Delegates to smarts_match with counting enabled.
# TODO: non-overlapping matches?
def self.smarts_count(compounds, smarts)
  smarts_match(compounds, smarts, true)
end
93
+
94
# Calculate physchem descriptors
# @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
# @param [Array<String>] descriptors names of the form "<Library>.<descriptor>";
#   only the Openbabel, Cdk and Joelib libraries are accepted
# @return the value of serialize for the calculated values
# Raises a bad request error for descriptors of any other library.
def self.physchem(compounds, descriptors=UNIQUEDESCRIPTORS)
  parse compounds
  @data_entries = Array.new(@compounds.size){[]}
  @descriptors = descriptors
  @smarts = nil
  @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features

  # group descriptor names by library, keeping the order in which libraries first appear
  by_library = {}
  @descriptors.each do |d|
    lib, name = d.split(".",2)
    (by_library[lib.downcase.to_sym] ||= []) << name
  end

  # the library name becomes a method name below, so it must be one of the known calculators
  unknown = by_library.keys - [:openbabel, :cdk, :joelib]
  bad_request_error "Unknown descriptor libraries: #{unknown.join(", ")}" unless unknown.empty?

  by_library.each do |lib, names|
    send(lib, names)
  end
  serialize
end
114
+
115
# Compute OpenBabel descriptors for @compounds and append them to
# @data_entries / @physchem_descriptors.
# @param descriptors [Array<String>] OpenBabel descriptor names without the "Openbabel." prefix
def self.openbabel(descriptors)
  $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
  calculators = descriptors.collect { |name| OpenBabel::OBDescriptor.find_type name }
  obmol = OpenBabel::OBMol.new
  obconversion = OpenBabel::OBConversion.new
  obconversion.set_in_format 'smi'
  # columns of previously calculated descriptors come first
  offset = @physchem_descriptors.size
  @compounds.each_with_index do |compound, row|
    obconversion.read_string obmol, compound.smiles
    calculators.each_with_index do |calculator, column|
      @data_entries[row][column + offset] = fix_value(calculator.predict(obmol))
    end
  end
  @physchem_descriptors += descriptors.collect { |name| "Openbabel.#{name}" }
end
130
+
131
# Compute descriptors with one of the bundled Java tools and append the
# results to @data_entries / @physchem_descriptors.
# @param descriptors [Array<String>] descriptor names passed to the Java program
# @param lib [String] "cdk" or "joelib"
# The temporary SDF input and the YAML result file are removed afterwards,
# also when the external command or the result parsing fails.
def self.java_descriptors(descriptors, lib)
  $logger.debug "compute #{descriptors.size} #{lib} descriptors for #{@compounds.size} compounds"
  sdf = sdf_3d
  results = "#{sdf}#{lib}.yaml"
  begin
    # use java system call (rjb blocks within tasks)
    # use Tempfiles to avoid "Argument list too long" error
    case lib
    when "cdk"
      run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
    when "joelib"
      run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
    end
    # columns of previously calculated descriptors come first
    offset = @physchem_descriptors.size
    YAML.load_file(results).each_with_index do |calculation, i|
      # TODO create warnings
      #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
      # CDK Descriptors may calculate multiple values, they are stored in separate features
      @physchem_descriptors += calculation.keys if i == 0
      calculation.keys.each_with_index do |name, j|
        @data_entries[i][j + offset] = fix_value(calculation[name])
      end
    end
  ensure
    FileUtils.rm_f [sdf, results]
  end
end
154
+
155
# Compute CDK descriptors via the Java helper.
def self.cdk(descriptors)
  java_descriptors(descriptors, "cdk")
end

# Compute Joelib descriptors via the Java helper.
def self.joelib(descriptors)
  java_descriptors(descriptors, "joelib")
end
162
+
163
# Unfinished stub: builds one empty row per compound and never reads
# `features` values or `dataset`. Returns whatever `compounds.each` returns.
def self.lookup(compounds, features, dataset)
  parse compounds
  fingerprint = []
  compounds.each do |_compound|
    row = []
    fingerprint << row
    features.each { |_feature| }
  end
end
172
+
173
# Run an external command with stderr merged into stdout, logging every
# output line at debug level.
# @param cmd [String] shell command
# @return [nil]
# Raises RuntimeError when the command exits with a non-zero status.
def self.run_cmd(cmd)
  cmd = "#{cmd} 2>&1"
  $logger.debug "running external cmd: '#{cmd}'"
  IO.popen(cmd) do |io|
    io.each_line { |line| $logger.debug "> #{line.chomp}" }
    # closing the pipe waits for the child and sets $?
    io.close
    raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
  end
end
184
+
185
# Write the SDF of all @compounds into a new file below /tmp.
# @return [String] path of the written file
def self.sdf_3d
  # TODO check if 3d sdfs are stored in GridFS
  sdf = @compounds.inject("") { |buffer, compound| buffer << compound.sdf }
  sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
  File.open(sdf_file,"w+") { |file| file.print sdf }
  sdf_file
end
195
+
196
# Remember the class of the input in @input_class and store the compounds
# to process in @compounds.
# @param compounds an OpenTox::Compound, an Array or an OpenTox::Dataset
# Anything else is reported through bad_request_error.
def self.parse(compounds)
  @input_class = compounds.class.to_s
  if @input_class == "OpenTox::Compound"
    @compounds = [compounds]
  elsif @input_class == "Array"
    @compounds = compounds
  elsif @input_class == "OpenTox::Dataset"
    @compounds = compounds.compounds
  else
    bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
  end
end
209
+
210
# Convert @data_entries into the shape matching the original input:
# a single row for a compound, the full matrix for an array, and a saved
# OpenTox::DescriptorDataset for a dataset. Values are rounded to 5 digits.
# @return [Array, OpenTox::DescriptorDataset, nil]
def self.serialize
  @data_entries.collect! { |row| row.collect { |v| v.round(5) unless v.nil? } }
  case @input_class
  when "OpenTox::Compound"
    @data_entries.first
  when "Array"
    @data_entries
  when "OpenTox::Dataset"
    dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect { |c| c.id })
    if @smarts
      dataset.feature_ids = @smarts.collect { |smart| Smarts.find_or_create_by(:smarts => smart).id }
      # the match matrix was previously never stored for smarts calculations
      dataset.data_entries = @data_entries
      algo = @count ? "count" : "match"
      dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
    elsif @physchem_descriptors
      dataset.feature_ids = @physchem_descriptors.collect { |d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id }
      dataset.data_entries = @data_entries
      dataset.feature_calculation_algorithm = "#{self}.physchem"
      #TODO params?
    end
    dataset.save_all
    dataset
  end
end
234
+
235
# Normalize a raw descriptor value: unwrap one-element arrays, map "NaN",
# NaN and infinite numbers to nil and convert numeric input to Float.
# Non-numeric values are returned unchanged.
def self.fix_value(val)
  val = val.first if val.is_a?(Array) && val.size == 1
  val = nil if val == "NaN"
  return val unless val.numeric?
  number = Float(val)
  (number.nan? || number.infinite?) ? nil : number
end
244
+ private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
245
+ end
246
+ end
247
+ end
data/lib/error.rb ADDED
@@ -0,0 +1,66 @@
1
# Mixin for exceptions that carry an HTTP status code and a shortened
# backtrace, and that log themselves when they are created.
module OpenToxError
  attr_accessor :http_code, :message, :cause

  # @param message [Object, nil] error message; one surrounding double quote
  #   at each end is removed
  def initialize message=nil
    message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes
    super message
    @http_code ||= 500
    @message = message.to_s
    @cause = cut_backtrace(caller)
    report = {
      :http_code => @http_code,
      :message => @message,
      :cause => @cause
    }
    $logger.error("\n"+JSON.pretty_generate(report))
  end

  # Keep only the backtrace lines before the first sinatra/minitest frame.
  # The trace is returned in full when there is no such frame or when it is
  # the very first line; non-array input is returned unchanged.
  def cut_backtrace(trace)
    return trace unless trace.is_a?(Array)
    boundary = trace.find_index { |line| line.match(/sinatra|minitest/) } || trace.size
    last = boundary - 1
    last = trace.size - 1 if last < 0
    trace[0..last]
  end

end
29
+
30
# Mix OpenToxError into every RuntimeError
RuntimeError.send(:include, OpenToxError)
33
+
34
+ # clutters log file with library errors
35
+ #class NoMethodError
36
+ #include OpenToxError
37
+ #end
38
+
39
module OpenTox

  # Base class of the OpenTox errors; stores the HTTP status code
  class Error < RuntimeError
    include OpenToxError

    def initialize(code, message=nil)
      @http_code = code
      super message
    end
  end

  # OpenTox errors: one Error subclass per entry of RestClientWrapper.known_errors
  RestClientWrapper.known_errors.each do |known|
    # create error classes
    error_class = Class.new(Error) do
      define_method(:initialize) { |message=nil| super(known[:code], message) }
    end
    OpenTox.const_set known[:class], error_class

    # define global methods for raising errors, eg. bad_request_error
    # (uri and cause are accepted but not used)
    Object.send(:define_method, known[:method]) do |message,uri=nil,cause=nil|
      raise error_class.new(message)
    end
  end

end
data/lib/feature.rb ADDED
@@ -0,0 +1,97 @@
1
+ module OpenTox
2
+
3
# Basic feature class
# NOTE(review): opentox.rb declares `field :title, as: :name` for the generated
# classes while this declares `field :name, as: :title` - confirm which one is intended.
class Feature
  field :name, :as => :title, :type => String
  field :nominal, :type => Boolean
  field :numeric, :type => Boolean
  field :measured, :type => Boolean
end
10
+
11
# Feature for categorical variables
class NominalFeature < Feature
  # TODO check if accept_values are still needed
  field :accept_values, type: Array

  # @param params [Hash, nil] attributes passed on to the parent initializer
  def initialize params=nil
    super params
    # an explicit receiver is required: a bare `nominal = true` only creates a local variable
    self.nominal = true
  end
end
20
+
21
# Feature for quantitative variables
class NumericFeature < Feature
  # @param params [Hash, nil] attributes passed on to the parent initializer
  def initialize params=nil
    super params
    # an explicit receiver is required: a bare `numeric = true` only creates a local variable
    self.numeric = true
  end
end
28
+
29
# Feature for SMARTS fragments
class Smarts < NominalFeature
  field :smarts, :type => String

  # Find the feature with the given SMARTS string or create it.
  def self.from_smarts(smarts)
    find_or_create_by(:smarts => smarts)
  end
end
36
+
37
# Feature for supervised fragments from Fminer algorithm
class FminerSmarts < Smarts
  field :p_value, :type => Float
  # TODO check if effect is used
  field :effect, :type => String
  field :dataset_id
end
44
+
45
# Feature for database fingerprints
# needs count for efficient retrieval (see compound.rb)
class FingerprintSmarts < Smarts
  field :count, type: Integer

  # All fingerprint features. When fewer or more than 306 are cached or
  # stored, they are (re)created from SMARTS_InteLigand.txt next to this file.
  # @return the collection of OpenTox::FingerprintSmarts features
  def self.fingerprint
    @@fp4 ||= OpenTox::FingerprintSmarts.all
    unless @@fp4.size == 306
      @@fp4 = []
      # OpenBabel FP4 fingerprints
      # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
      # TODO investigate other types of fingerprints (MACCS)
      # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
      # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
      # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
      # Morgan ECFP, FCFP
      # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
      # http://www.rdkit.org/docs/GettingStartedInPython.html
      # Chemfp
      # https://chemfp.readthedocs.org/en/latest/using-tools.html
      # CACTVS/PubChem

      # File.foreach closes the file when done; File.open(...).each left the handle open
      File.foreach(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")) do |l|
        l.strip!
        # skip blank lines and comments
        next if l.empty? or l.match(/^#/)
        name,smarts = l.split(': ')
        @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
      end
    end
    @@fp4
  end
end
77
+
78
# Feature for physico-chemical descriptors
class PhysChemDescriptor < NumericFeature
  field :algorithm, :type => String, :default => "OpenTox::Algorithm::Descriptor.physchem"
  field :parameters, :type => Hash
  field :creator, :type => String
end
84
+
85
# Feature for categorical bioassay results
class NominalBioAssay < NominalFeature
  # TODO: needed? move to dataset?
  field :description, :type => String
end
90
+
91
# Feature for quantitative bioassay results
class NumericBioAssay < NumericFeature
  # TODO: needed? move to dataset?
  field :description, :type => String
end
96
+
97
+ end
data/lib/lazar-model.rb ADDED
@@ -0,0 +1,170 @@
1
+ module OpenTox
2
+
3
+ module Model
4
+
5
+ class Lazar
6
+ include OpenTox
7
+ include Mongoid::Document
8
+ include Mongoid::Timestamps
9
+ store_in collection: "models"
10
+
11
+ field :title, type: String
12
+ field :creator, type: String, default: __FILE__
13
+ # datasets
14
+ field :training_dataset_id, type: BSON::ObjectId
15
+ # algorithms
16
+ field :prediction_algorithm, type: String
17
+ field :neighbor_algorithm, type: String
18
+ field :neighbor_algorithm_parameters, type: Hash
19
+ # prediction feature
20
+ field :prediction_feature_id, type: BSON::ObjectId
21
+
22
+ attr_accessor :prediction_dataset
23
+ attr_accessor :training_dataset
24
+
25
# Create and save a lazar model for a training dataset with exactly one feature.
# A nominal prediction feature gives a classification model, anything else a
# regression model.
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
def self.create(training_dataset)
  unless training_dataset.features.size == 1
    bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}"
  end

  # TODO document convention
  prediction_feature = training_dataset.features.first
  model_class = prediction_feature.nominal ? OpenTox::Model::LazarClassification : OpenTox::Model::LazarRegression
  lazar = model_class.new
  lazar.training_dataset_id = training_dataset.id
  lazar.prediction_feature_id = prediction_feature.id
  lazar.title = prediction_feature.title

  lazar.save
  lazar
end
42
+
43
# Predict the activity of one or more compounds.
# Compounds that have values in the training dataset get those values
# (confidence "measured"); for all others neighbors are searched with the
# neighbor algorithm and passed to the prediction algorithm.
# @param object [OpenTox::Compound, Array, OpenTox::Dataset] query
# @return [Hash] for a compound: the prediction plus :neighbors sorted by descending similarity
# @return [Array<Hash>] for an array: one prediction per compound
# @return [LazarPrediction] for a dataset: a saved prediction dataset
def predict object

  training_data = Dataset.find training_dataset_id
  prediction_feature = Feature.find prediction_feature_id

  # parse data
  input_class = object.class.to_s
  compounds =
    case input_class
    when "OpenTox::Compound" then [object]
    when "Array" then object
    when "OpenTox::Dataset" then object.compounds
    else
      bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
    end

  # make predictions
  predictions = []
  neighbors = []
  compounds.each do |compound|
    database_activities = training_data.values(compound,prediction_feature)
    if database_activities and !database_activities.empty?
      database_activities = database_activities.first if database_activities.size == 1
      predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
      next
    end
    neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
    # add activities
    # TODO: improve efficiency, takes 3 times longer than previous version
    neighbors.collect! do |n|
      rows = training_data.compound_ids.each_index.select{|i| training_data.compound_ids[i] == n.first}
      acts = rows.collect{|row| training_data.data_entries[row][0]}.compact
      acts.empty? ? nil : n << acts
    end
    neighbors.compact! # remove neighbors without training activities
    predictions << Algorithm.run(prediction_algorithm, neighbors)
  end

  # serialize result
  case input_class
  when "OpenTox::Compound"
    prediction = predictions.first
    prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
    prediction
  when "Array"
    predictions
  when "OpenTox::Dataset"
    # prepare prediction dataset
    prediction_dataset = LazarPrediction.new(
      :title => "Lazar prediction for #{prediction_feature.title}",
      :creator => __FILE__,
      :prediction_feature_id => prediction_feature.id
    )
    confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
    # TODO move into warnings field
    warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
    prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
    prediction_dataset.compounds = compounds
    prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
    prediction_dataset.save_all
    prediction_dataset
  end

end
114
+
115
# Values of the prediction feature for every row of the training dataset.
# Uses the training_dataset accessor when it has been set and otherwise loads
# the dataset by training_dataset_id (the accessor alone could be nil).
# @return [Array] one value per data entry row
def training_activities
  dataset = training_dataset || Dataset.find(training_dataset_id)
  column = dataset.feature_ids.index prediction_feature_id
  dataset.data_entries.collect { |row| row[column] }
end
119
+
120
+ end
121
+
122
# Lazar model with the default algorithms for classification
class LazarClassification < Lazar
  def initialize
    super
    {
      :prediction_algorithm => "OpenTox::Algorithm::Classification.weighted_majority_vote",
      :neighbor_algorithm => "OpenTox::Algorithm::Neighbor.fingerprint_similarity",
      :neighbor_algorithm_parameters => {:min_sim => 0.7}
    }.each { |attribute, default| public_send("#{attribute}=", default) }
  end
end
130
+
131
# Classification model whose neighbors are found with Fminer (BBRC) fragments
class LazarFminerClassification < LazarClassification

  # Create the model through the parent class, switch the stored document to
  # this class and configure the fminer neighbor search.
  # @param training_dataset dataset handed to the parent create and to Fminer.bbrc
  # @return the saved model
  def self.create(training_dataset)
    base = super(training_dataset)
    base.update "_type" => self.to_s # adjust class
    fminer_model = self.find base.id # adjust class
    fminer_model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
    fminer_model.neighbor_algorithm_parameters = {
      :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
      :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
      :min_sim => 0.3
    }
    fminer_model.save
    fminer_model
  end
end
147
+
148
# Lazar model with the default algorithms for regression
class LazarRegression < Lazar

  def initialize
    super
    {
      :neighbor_algorithm => "OpenTox::Algorithm::Neighbor.fingerprint_similarity",
      :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average",
      :neighbor_algorithm_parameters => {:min_sim => 0.7}
    }.each { |attribute, default| public_send("#{attribute}=", default) }
  end

end
158
+
159
# Lazar model with additional descriptive fields and references to a model
# and a crossvalidation
class PredictionModel < Lazar
  field :category, :type => String
  field :endpoint, :type => String
  field :unit, :type => String
  field :model_id, :type => BSON::ObjectId
  field :crossvalidation_id, :type => BSON::ObjectId
end
166
+
167
+ end
168
+
169
+ end
170
+
data/lib/lazar.rb ADDED
@@ -0,0 +1,69 @@
1
+ require 'rubygems'
2
+ require "bundler/setup"
3
+ require "rest-client"
4
+ require 'yaml'
5
+ require 'json'
6
+ require 'logger'
7
+ require 'mongoid'
8
+ require 'rserve'
9
+ require "nokogiri"
10
+ require "base64"
11
+
12
+
13
+ # Mongo setup
14
+ # TODO retrieve correct environment from Rack/Sinatra
15
+ ENV["MONGOID_ENV"] ||= "development"
16
+ # TODO remove config files, change default via ENV or directly in Mongoid class
17
+ Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
18
+ # TODO get Mongo::Client from Mongoid
19
+ $mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
20
+ # TODO same for GridFS
21
+ $gridfs = $mongo.database.fs
22
+
23
+ # R setup
24
+ R = Rserve::Connection.new
25
+
26
+ # Logger setup
27
+ $logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
28
+ $logger.level = Logger::DEBUG
29
+ Mongo::Logger.logger = $logger
30
+ Mongo::Logger.level = Logger::WARN
31
+ #Mongoid.logger = $logger
32
+
33
+ # Require sub-Repositories
34
+ require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
35
+ require_relative '../libfminer/liblast/last' #
36
+ require_relative '../last-utils/lu.rb'
37
+ require_relative '../openbabel/lib/openbabel'
38
+
39
# Fminer environment variables: every flag below is switched on
%w[SMARTS NO_AROMATIC PVALUES SILENT NR_HITS].each do |flag|
  ENV["FMINER_#{flag}"] = 'true'
end
45
+
46
+ # OpenTox classes and includes
47
+ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
48
+
49
# Load the library files.
# Be aware of the require sequence as it affects class/method overwrites.
# ("descriptor.rb" used to be listed twice; the second require was a no-op.)
%w[
  overwrite.rb
  rest-client-wrapper.rb
  error.rb
  opentox.rb
  feature.rb
  compound.rb
  dataset.rb
  descriptor.rb
  algorithm.rb
  bbrc.rb
  lazar-model.rb
  similarity.rb
  neighbor.rb
  classification.rb
  regression.rb
  validation.rb
  crossvalidation.rb
].each { |f| require_relative f }
69
+
data/lib/neighbor.rb ADDED
@@ -0,0 +1,25 @@
1
module OpenTox
  module Algorithm
    # Neighbor search strategies used by lazar models
    class Neighbor

      # Neighbors from the compound's own fingerprint search.
      # @param compound object responding to #neighbors
      # @param params [Hash] :min_sim is passed to compound.neighbors
      def self.fingerprint_similarity(compound, params={})
        compound.neighbors(params[:min_sim])
      end

      # Neighbors by tanimoto similarity between the smarts matches of the
      # query compound and the rows of a stored feature dataset.
      # @param compound query compound
      # @param params [Hash] :feature_dataset_id and :min_sim
      # @return [Array<Array>] [compound_id, similarity] pairs with similarity above :min_sim
      def self.fminer_similarity(compound, params)
        feature_dataset = Dataset.find params[:feature_dataset_id]
        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)

        # find neighbors
        feature_dataset.data_entries.each_with_index.each_with_object([]) do |(fingerprint, row), found|
          similarity = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
          next unless similarity > params[:min_sim]
          # use compound_ids, instantiation of Compounds is too time consuming
          found << [feature_dataset.compound_ids[row], similarity]
        end
      end
    end
  end
end
data/lib/opentox.rb ADDED
@@ -0,0 +1,22 @@
1
module OpenTox

  # Ruby interface

  # create default OpenTox classes (defined in opentox-client.rb)
  # provides Mongoid's query and persistence methods
  # http://mongoid.org/en/mongoid/docs/persistence.html
  # http://mongoid.org/en/mongoid/docs/querying.html
  CLASSES.each do |class_name|
    # collection named after the downcased, pluralized class name
    collection_name = class_name.downcase.pluralize
    model = Class.new do
      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
      store_in collection: collection_name
      field :title, as: :name, type: String
    end
    OpenTox.const_set class_name, model
  end

end
22
+