lazar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/descriptor.rb
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
require 'digest/md5'
|
|
2
|
+
ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
|
|
3
|
+
# TODO store descriptors in mongodb
|
|
4
|
+
|
|
5
|
+
module OpenTox
|
|
6
|
+
|
|
7
|
+
module Algorithm
|
|
8
|
+
|
|
9
|
+
# Class for descriptor calculations
|
|
10
|
+
class Descriptor
|
|
11
|
+
include OpenTox
|
|
12
|
+
|
|
13
|
+
# Locations of the bundled Java helper classes and jars
JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
# Dir[] gives no ordering guarantee on older Rubies; sort so that the
# lexicographically last cdk jar is chosen deterministically.
CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].sort.last
JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")

# OpenBabel entries that are left out of the descriptor list
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
# "Openbabel.<name>" => description, sorted by name
OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
  name,description = d.split(/\s+/,2)
  ["Openbabel."+name,description] unless obexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]

# "Cdk.<Class>" name derived from the java class name (first "Descriptor" removed)
cdk_name = lambda { |d| "Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'') }
# NOTE(review): YAML.load is applied to the output of the bundled CdkDescriptorInfo class, not to external input
cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
# "Cdk.<Class>" => description, sorted by name
CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| [cdk_name.call(d), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
# one "Cdk.<Class>.<value name>" entry per value listed for a descriptor
CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix = cdk_name.call(d); d[:names].collect{ |name| prefix+"."+name } }.flatten

# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
# strip Joelib messages from stdout
JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
  # dots are escaped so that only the literal package prefix is removed
  name = d[:java_class].sub(/^joelib2\.feature\.types\./,'')
  # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
  ["Joelib."+name, "no description available"] unless joelibexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]

# all descriptors with their descriptions, and all calculable value names
DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
|
|
40
|
+
|
|
41
|
+
require_relative "unique_descriptors.rb"
|
|
42
|
+
|
|
43
|
+
# Description of available descriptors
# @param [String] descriptor Descriptor name, prefixed with its library ("Openbabel.", "Cdk.", "Joelib." or "lookup")
# @return [String,nil] Description, nil if the library prefix is not recognised
def self.description descriptor
  parts = descriptor.split('.')
  library = parts.first
  if library == "Openbabel"
    OBDESCRIPTORS[descriptor]
  elsif library == "Cdk"
    # descriptions are keyed without the last name component
    CDKDESCRIPTORS[parts[0..-2].join('.')]
  elsif library == "Joelib"
    JOELIBDESCRIPTORS[descriptor]
  elsif library == "lookup"
    "Read feature values from a dataset"
  end
end
|
|
58
|
+
|
|
59
|
+
# Match an array of smarts features
# @param [OpenTox::Compound,Array,OpenTox::Dataset] compounds Input handed to parse
# @param [Feature,Array] smarts_features A single feature or a list of objects responding to #smarts
# @param [Boolean] count Store the number of matches instead of 1 for matching patterns
# @return Result of serialize (its shape depends on the class of compounds)
def self.smarts_match compounds, smarts_features, count=false
  bad_request_error "Compounds for smarts_match are empty" unless compounds
  bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
  parse compounds
  @count = count
  smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
  @smarts = smarts_features.collect{|f| f.smarts}
  @physchem_descriptors = nil
  @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}

  reader = OpenBabel::OBConversion.new
  reader.set_in_format('smi')
  molecule = OpenBabel::OBMol.new
  pattern = OpenBabel::OBSmartsPattern.new

  @compounds.each_with_index do |compound,row|
    reader.read_string(molecule,compound.smiles)
    @smarts.each_with_index do |smarts_string,column|
      pattern.init(smarts_string)
      hits =
        if pattern.match(molecule)
          count ? pattern.get_map_list.to_a.size : 1
        else
          0
        end
      @data_entries[row][column] = hits
    end
  end
  serialize
end
|
|
87
|
+
|
|
88
|
+
# Count matches of an array with smarts features
# (smarts_match with counting switched on)
def self.smarts_count(compounds, smarts)
  # TODO: non-overlapping matches?
  smarts_match(compounds, smarts, true)
end
|
|
93
|
+
|
|
94
|
+
# Calculate physchem descriptors
# @param [OpenTox::Compound,Array,OpenTox::Dataset] compounds Either a compound, an array of compounds or a dataset
# @param [Array<String>] descriptors Descriptor names of the form "<Library>.<descriptor>",
#   where the library is Openbabel, Cdk or Joelib (case insensitive)
# @return Result of serialize (its shape depends on the class of compounds)
def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
  parse compounds
  @data_entries = Array.new(@compounds.size){[]}
  @descriptors = descriptors
  @smarts = nil
  @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
  # group descriptor names by library, keeping the order of first appearance
  by_library = {}
  @descriptors.each do |d|
    lib, descriptor = d.split(".",2)
    lib = lib.to_s.downcase
    # the prefix is used as a method name below, so only the known
    # calculation backends may be dispatched to
    bad_request_error "Unknown descriptor library in '#{d}'." unless ["openbabel","cdk","joelib"].include? lib
    (by_library[lib] ||= []) << descriptor
  end
  by_library.each do |lib,names|
    send(lib, names)
  end
  serialize
end
|
|
114
|
+
|
|
115
|
+
# Calculate OpenBabel descriptors for @compounds and append the values to @data_entries
# @param [Array<String>] descriptors OpenBabel descriptor names without the "Openbabel." prefix
def self.openbabel descriptors
  $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
  calculators = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
  molecule = OpenBabel::OBMol.new
  reader = OpenBabel::OBConversion.new
  reader.set_in_format 'smi'
  # new values go behind the features that are already present
  offset = @physchem_descriptors.size
  @compounds.each_with_index do |compound,row|
    reader.read_string molecule, compound.smiles
    calculators.each.with_index(offset) do |calculator,column|
      @data_entries[row][column] = fix_value(calculator.predict(molecule))
    end
  end
  @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
end
|
|
130
|
+
|
|
131
|
+
# Calculate CDK or Joelib descriptors with the bundled Java programs
# Writes @compounds to a temporary SDF file, runs the Java helper and reads
# its results from "<sdf file><lib>.yaml" (expected to be written by the helper).
# Both temporary files are removed afterwards, also if the calculation fails.
# @param [Array<String>] descriptors Descriptor names without library prefix
# @param [String] lib "cdk" or "joelib"
def self.java_descriptors descriptors, lib
  $logger.debug "compute #{descriptors.size} #{lib} descriptors for #{@compounds.size} compounds"
  sdf = sdf_3d
  results = "#{sdf}#{lib}.yaml"
  begin
    # use java system call (rjb blocks within tasks)
    # use Tempfiles to avoid "Argument list too long" error
    case lib
    when "cdk"
      run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
    when "joelib"
      run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
    end
    last_feature_idx = @physchem_descriptors.size
    YAML.load_file(results).each_with_index do |calculation,i|
      # TODO create warnings
      #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
      # CDK Descriptors may calculate multiple values, they are stored in separate features
      @physchem_descriptors += calculation.keys if i == 0
      calculation.keys.each_with_index do |name,j|
        @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
      end
    end
  ensure
    # rm_f: the result file does not exist if the external command failed
    FileUtils.rm_f [sdf, results]
  end
end
|
|
154
|
+
|
|
155
|
+
# Calculate CDK descriptors (see java_descriptors)
# @param [Array<String>] descriptors Descriptor names without the "Cdk." prefix
def self.cdk(descriptors)
  java_descriptors(descriptors, "cdk")
end
|
|
158
|
+
|
|
159
|
+
# Calculate Joelib descriptors (see java_descriptors)
# @param [Array<String>] descriptors Descriptor names without the "Joelib." prefix
def self.joelib(descriptors)
  java_descriptors(descriptors, "joelib")
end
|
|
162
|
+
|
|
163
|
+
# Placeholder for reading feature values from a dataset.
# NOTE(review): looks unfinished - an empty row is collected per compound,
# nothing is read from dataset and the collected rows are not returned.
def self.lookup compounds, features, dataset
  parse compounds
  fingerprint = []
  compounds.each do |_compound|
    fingerprint.push []
    features.each { |_feature| }
  end
end
|
|
172
|
+
|
|
173
|
+
# Run an external command and copy its output (stdout and stderr) to the debug log
# @param [String] cmd Command line, executed through IO.popen
# @raise [RuntimeError] if the command does not exit successfully
def self.run_cmd cmd
  cmd = "#{cmd} 2>&1"
  $logger.debug "running external cmd: '#{cmd}'"
  # the block form closes the pipe and sets $? when the block returns
  IO.popen(cmd) do |io|
    io.each_line { |line| $logger.debug "> #{line.chomp}" }
  end
  raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.success?
end
|
|
184
|
+
|
|
185
|
+
# Write the SDF of all @compounds into a new file below /tmp
# @return [String] Path of the SDF file
def self.sdf_3d
  # TODO check if 3d sdfs are stored in GridFS
  contents = @compounds.inject("") { |buffer,compound| buffer << compound.sdf }
  path = "/tmp/#{SecureRandom.uuid}.sdf"
  File.open(path,"w+") { |file| file.print contents }
  path
end
|
|
195
|
+
|
|
196
|
+
# Remember the class of the input and store its compounds in @compounds
# @param [OpenTox::Compound,Array,OpenTox::Dataset] compounds
# Any other class is rejected with bad_request_error.
def self.parse compounds
  @input_class = compounds.class.to_s
  if @input_class == "OpenTox::Compound"
    @compounds = [compounds]
  elsif @input_class == "Array"
    @compounds = compounds
  elsif @input_class == "OpenTox::Dataset"
    @compounds = compounds.compounds
  else
    bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
  end
end
|
|
209
|
+
|
|
210
|
+
# Round the calculated values to 5 decimals and return them in a form that
# matches the input class remembered by parse
# @return [Array] one row of values for a single compound,
#   an array of rows for an array of compounds,
#   a saved OpenTox::DescriptorDataset for a dataset
def self.serialize
  @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
  case @input_class
  when "OpenTox::Compound"
    @data_entries.first
  when "Array"
    @data_entries
  when "OpenTox::Dataset"
    dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
    if @smarts
      dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
      algo = @count ? "count" : "match"
      dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
    elsif @physchem_descriptors
      dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
      dataset.feature_calculation_algorithm = "#{self}.physchem"
      #TODO params?
    end
    # the values are stored for smarts results as well as for physchem results
    dataset.data_entries = @data_entries
    dataset.save_all
    dataset
  end
end
|
|
234
|
+
|
|
235
|
+
# Normalise a calculated descriptor value
# Single-element arrays are unwrapped, "NaN" becomes nil and values that
# answer true to numeric? are converted to Float (nil if NaN or infinite).
# Everything else is returned as it is.
def self.fix_value val
  val = val.first if val.is_a?(Array) && val.size == 1
  val = nil if val == "NaN"
  return val unless val.numeric?
  number = Float(val)
  (number.nan? || number.infinite?) ? nil : number
end
|
|
244
|
+
private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
end
|
data/lib/error.rb
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Mixin for exception classes: adds an HTTP status code and a shortened
# backtrace, and logs every error when it is created
module OpenToxError
  attr_accessor :http_code, :message, :cause

  # @param [String,nil] message Error message, one enclosing pair of double quotes is removed
  def initialize message=nil
    message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes
    super message
    @http_code ||= 500 # keep a code that was set before calling super
    @message = message.to_s
    @cause = cut_backtrace(caller)
    report = {
      :http_code => @http_code,
      :message => @message,
      :cause => @cause
    }
    $logger.error("\n"+JSON.pretty_generate(report))
  end

  # Keep only the frames above the first sinatra/minitest frame
  # @param trace Backtrace, anything that is not an Array is returned unchanged
  def cut_backtrace(trace)
    return trace unless trace.is_a?(Array)
    framework_frame = trace.find_index{|line| line.match(/sinatra|minitest/)}
    last_kept = (framework_frame || trace.size) - 1
    # a framework frame at the very top would leave nothing: keep everything
    last_kept = trace.size-1 if last_kept < 0
    trace[0..last_kept]
  end

end
|
|
29
|
+
|
|
30
|
+
# Every RuntimeError carries the OpenToxError behaviour (http_code, logging)
class RuntimeError < StandardError
  include OpenToxError
end
|
|
33
|
+
|
|
34
|
+
# clutters log file with library errors
|
|
35
|
+
#class NoMethodError
|
|
36
|
+
#include OpenToxError
|
|
37
|
+
#end
|
|
38
|
+
|
|
39
|
+
module OpenTox

  # Base class of the OpenTox errors, created with an explicit HTTP status code
  class Error < RuntimeError
    include OpenToxError

    def initialize(code, message=nil)
      @http_code = code
      super message
    end
  end

  # OpenTox errors
  RestClientWrapper.known_errors.each do |error|
    # create error classes, each one passes its own code to Error
    error_class = Class.new(Error) do
      define_method(:initialize) { |message=nil| super(error[:code], message) }
    end
    const_set error[:class], error_class

    # define global methods for raising errors, eg. bad_request_error
    Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
      raise error_class.new(message)
    end
  end

end
|
data/lib/feature.rb
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
module OpenTox

  # Basic feature class
  class Feature
    field :name, as: :title, type: String
    field :nominal, type: Boolean
    field :numeric, type: Boolean
    field :measured, type: Boolean
  end

  # Feature for categorical variables
  class NominalFeature < Feature
    # TODO check if accept_values are still needed
    field :accept_values, type: Array
    # @param [Hash,nil] params Attributes, passed on to the superclass
    def initialize params=nil
      super params
      # explicit receiver: a bare "nominal = true" only creates a local variable
      self.nominal = true
    end
  end

  # Feature for quantitative variables
  class NumericFeature < Feature
    # @param [Hash,nil] params Attributes, passed on to the superclass
    def initialize params=nil
      super params
      # explicit receiver: a bare "numeric = true" only creates a local variable
      self.numeric = true
    end
  end

  # Feature for SMARTS fragments
  class Smarts < NominalFeature
    field :smarts, type: String
    # @param [String] smarts SMARTS string
    # @return Existing or newly created feature for this SMARTS string
    def self.from_smarts smarts
      self.find_or_create_by :smarts => smarts
    end
  end

  # Feature for supervised fragments from Fminer algorithm
  class FminerSmarts < Smarts
    field :p_value, type: Float
    # TODO check if effect is used
    field :effect, type: String
    field :dataset_id
  end

  # Feature for database fingerprints
  # needs count for efficient retrieval (see compound.rb)
  class FingerprintSmarts < Smarts
    field :count, type: Integer
    # Fingerprint features, (re)created from SMARTS_InteLigand.txt unless
    # exactly 306 of them are available
    def self.fingerprint
      @@fp4 ||= OpenTox::FingerprintSmarts.all
      unless @@fp4.size == 306
        @@fp4 = []
        # OpenBabel FP4 fingerprints
        # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
        # TODO investigate other types of fingerprints (MACCS)
        # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
        # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
        # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
        # Morgan ECFP, FCFP
        # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
        # http://www.rdkit.org/docs/GettingStartedInPython.html
        # Chemfp
        # https://chemfp.readthedocs.org/en/latest/using-tools.html
        # CACTVS/PubChem

        # File.foreach closes the file after reading
        File.foreach(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")) do |l|
          l.strip!
          # skip empty lines and comments
          next if l.empty? or l.start_with?("#")
          name,smarts = l.split(': ')
          @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
        end
      end
      @@fp4
    end
  end

  # Feature for physico-chemical descriptors
  class PhysChemDescriptor < NumericFeature
    field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
    field :parameters, type: Hash
    field :creator, type: String
  end

  # Feature for categorical bioassay results
  class NominalBioAssay < NominalFeature
    # TODO: needed? move to dataset?
    field :description, type: String
  end

  # Feature for quantitative bioassay results
  class NumericBioAssay < NumericFeature
    # TODO: needed? move to dataset?
    field :description, type: String
  end

end
|
data/lib/lazar-model.rb
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
module OpenTox
|
|
2
|
+
|
|
3
|
+
module Model
|
|
4
|
+
|
|
5
|
+
# Lazar model, stored in the "models" collection
class Lazar
  include OpenTox
  include Mongoid::Document
  include Mongoid::Timestamps
  store_in collection: "models"

  field :title, type: String
  field :creator, type: String, default: __FILE__
  # datasets
  field :training_dataset_id, type: BSON::ObjectId
  # algorithms
  field :prediction_algorithm, type: String
  field :neighbor_algorithm, type: String
  field :neighbor_algorithm_parameters, type: Hash
  # prediction feature
  field :prediction_feature_id, type: BSON::ObjectId

  attr_accessor :prediction_dataset
  attr_accessor :training_dataset

  # Create a lazar model from a training_dataset and a feature_dataset
  # @param [OpenTox::Dataset] training_dataset
  # @return [OpenTox::Model::Lazar] Regression or classification model
  def self.create training_dataset

    bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1

    # TODO document convention
    prediction_feature = training_dataset.features.first
    # nominal features get a classification model, everything else a regression model
    lazar = prediction_feature.nominal ? OpenTox::Model::LazarClassification.new : OpenTox::Model::LazarRegression.new
    lazar.training_dataset_id = training_dataset.id
    lazar.prediction_feature_id = prediction_feature.id
    lazar.title = prediction_feature.title

    lazar.save
    lazar
  end

  # Predict one or more compounds
  # @param [OpenTox::Compound,Array,OpenTox::Dataset] object
  # @return [Hash] prediction (with :neighbors) for a single compound,
  #   [Array<Hash>] predictions for an array of compounds,
  #   a saved LazarPrediction dataset for a dataset
  def predict object

    training_dataset = Dataset.find training_dataset_id
    prediction_feature = Feature.find prediction_feature_id

    # parse data
    compounds = []
    case object.class.to_s
    when "OpenTox::Compound"
      compounds = [object]
    when "Array"
      compounds = object
    when "OpenTox::Dataset"
      compounds = object.compounds
    else
      bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
    end

    # rows of each training compound, collected once instead of scanning
    # all compound ids again for every neighbor
    training_rows = {}
    training_dataset.compound_ids.each_with_index do |id,row|
      (training_rows[id] ||= []) << row
    end

    # make predictions
    predictions = []
    neighbors = []
    compounds.each do |compound|
      database_activities = training_dataset.values(compound,prediction_feature)
      if database_activities and !database_activities.empty?
        # compound is part of the training data: report the measured value(s)
        database_activities = database_activities.first if database_activities.size == 1
        predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
        next
      end
      neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
      # add activities
      neighbors.collect! do |n|
        rows = training_rows[n.first] || []
        acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
        acts.empty? ? nil : n << acts
      end
      neighbors.compact! # remove neighbors without training activities
      predictions << Algorithm.run(prediction_algorithm, neighbors)
    end

    # serialize result
    case object.class.to_s
    when "OpenTox::Compound"
      prediction = predictions.first
      prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
      return prediction
    when "Array"
      return predictions
    when "OpenTox::Dataset"
      # prepare prediction dataset
      prediction_dataset = LazarPrediction.new(
        :title => "Lazar prediction for #{prediction_feature.title}",
        :creator => __FILE__,
        :prediction_feature_id => prediction_feature.id

      )
      confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
      # TODO move into warnings field
      warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
      prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
      prediction_dataset.compounds = compounds
      prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
      prediction_dataset.save_all
      return prediction_dataset
    end

  end

  # Values of the prediction feature for all training compounds
  # @return [Array] one value per row of the training dataset
  def training_activities
    # the accessor is not set by this class, fall back to the stored id
    dataset = training_dataset || Dataset.find(training_dataset_id)
    i = dataset.feature_ids.index prediction_feature_id
    dataset.data_entries.collect{|de| de[i]}
  end

end
|
|
121
|
+
|
|
122
|
+
# Lazar model for nominal endpoints
class LazarClassification < Lazar
  # Sets the default algorithms for classification models
  def initialize
    super
    {
      :prediction_algorithm => "OpenTox::Algorithm::Classification.weighted_majority_vote",
      :neighbor_algorithm => "OpenTox::Algorithm::Neighbor.fingerprint_similarity",
      :neighbor_algorithm_parameters => {:min_sim => 0.7}
    }.each do |attribute,value|
      public_send "#{attribute}=", value
    end
  end
end
|
|
130
|
+
|
|
131
|
+
# Classification model that finds neighbors with fminer (BBRC) features
class LazarFminerClassification < LazarClassification

  # @param [OpenTox::Dataset] training_dataset
  # @return [OpenTox::Model::LazarFminerClassification] saved model
  def self.create training_dataset
    base_model = super(training_dataset)
    base_model.update "_type" => self.to_s # adjust class
    # reload, so that the returned object is an instance of this class
    self.find(base_model.id).tap do |model|
      model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
      model.neighbor_algorithm_parameters = {
        :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
        :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
        :min_sim => 0.3
      }
      model.save
    end
  end
end
|
|
147
|
+
|
|
148
|
+
# Lazar model for numeric endpoints
class LazarRegression < Lazar

  # Sets the default algorithms for regression models
  def initialize
    super
    {
      :neighbor_algorithm => "OpenTox::Algorithm::Neighbor.fingerprint_similarity",
      :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average",
      :neighbor_algorithm_parameters => {:min_sim => 0.7}
    }.each do |attribute,value|
      public_send "#{attribute}=", value
    end
  end

end
|
|
158
|
+
|
|
159
|
+
# Lazar model with additional fields for category, endpoint, unit and
# references to a model and a crossvalidation
class PredictionModel < Lazar
  field :category, :type => String
  field :endpoint, :type => String
  field :unit, :type => String
  field :model_id, :type => BSON::ObjectId
  field :crossvalidation_id, :type => BSON::ObjectId
end
|
|
166
|
+
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
end
|
|
170
|
+
|
data/lib/lazar.rb
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require "bundler/setup"
|
|
3
|
+
require "rest-client"
|
|
4
|
+
require 'yaml'
|
|
5
|
+
require 'json'
|
|
6
|
+
require 'logger'
|
|
7
|
+
require 'mongoid'
|
|
8
|
+
require 'rserve'
|
|
9
|
+
require "nokogiri"
|
|
10
|
+
require "base64"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Mongo setup
# TODO retrieve correct environment from Rack/Sinatra
ENV["MONGOID_ENV"] ||= "development"
# TODO remove config files, change default via ENV or directly in Mongoid class
Mongoid.load!(File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml')))
# TODO get Mongo::Client from Mongoid
$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
# TODO same for GridFS
$gridfs = $mongo.database.fs

# R setup
R = Rserve::Connection.new

# Logger setup
$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
$logger.level = Logger::DEBUG
Mongo::Logger.logger = $logger
Mongo::Logger.level = Logger::WARN
#Mongoid.logger = $logger

# Require sub-Repositories
require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
require_relative '../libfminer/liblast/last' #
require_relative '../last-utils/lu.rb'
require_relative '../openbabel/lib/openbabel'

# Fminer environment variables
ENV['FMINER_SMARTS'] = 'true'
ENV['FMINER_NO_AROMATIC'] = 'true'
ENV['FMINER_PVALUES'] = 'true'
ENV['FMINER_SILENT'] = 'true'
ENV['FMINER_NR_HITS'] = 'true'

# OpenTox classes and includes
CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules

[ # be aware of the require sequence as it affects class/method overwrites
  "overwrite.rb",
  "rest-client-wrapper.rb",
  "error.rb",
  "opentox.rb",
  "feature.rb",
  "compound.rb",
  "dataset.rb",
  "descriptor.rb", # listed once: a second require_relative of the same file does nothing
  "algorithm.rb",
  "bbrc.rb",
  "lazar-model.rb",
  "similarity.rb",
  "neighbor.rb",
  "classification.rb",
  "regression.rb",
  "validation.rb",
  "crossvalidation.rb",
].each{ |f| require_relative f }
|
|
69
|
+
|
data/lib/neighbor.rb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module OpenTox
|
|
2
|
+
module Algorithm
|
|
3
|
+
# Neighbor search algorithms
class Neighbor

  # Neighbors as determined by the compound itself
  # @param compound Object responding to #neighbors
  # @param [Hash] params :min_sim is handed to compound.neighbors
  def self.fingerprint_similarity(compound, params={})
    compound.neighbors(params[:min_sim])
  end

  # Neighbors with a tanimoto similarity of fminer fingerprints above :min_sim
  # @param compound Query compound
  # @param [Hash] params :feature_dataset_id and :min_sim
  # @return [Array] pairs of [compound id, similarity]
  def self.fminer_similarity(compound, params)
    feature_dataset = Dataset.find params[:feature_dataset_id]
    query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)

    # find neighbors
    feature_dataset.data_entries.each_with_index.collect do |fingerprint,i|
      sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
      # use compound_ids, instantiation of Compounds is too time consuming
      [feature_dataset.compound_ids[i],sim] if sim > params[:min_sim]
    end.compact
  end
end
|
|
24
|
+
end
|
|
25
|
+
end
|
data/lib/opentox.rb
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module OpenTox

  # Ruby interface

  # create default OpenTox classes (defined in opentox-client.rb)
  # provides Mongoid's query and persistence methods
  # http://mongoid.org/en/mongoid/docs/persistence.html
  # http://mongoid.org/en/mongoid/docs/querying.html
  CLASSES.each do |klass|
    # each class is stored in a collection named after its pluralized, downcased name
    const_set(klass, Class.new {
      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
      store_in collection: klass.downcase.pluralize
      field :title, as: :name, type: String
    })
  end

end
|
|
22
|
+
|