lazar 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/descriptor.rb
ADDED
@@ -0,0 +1,247 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
|
3
|
+
# TODO store descriptors in mongodb
|
4
|
+
|
5
|
+
module OpenTox
|
6
|
+
|
7
|
+
module Algorithm
|
8
|
+
|
9
|
+
# Class for descriptor calculations
|
10
|
+
class Descriptor
|
11
|
+
include OpenTox
|
12
|
+
|
13
|
+
# Locations of the bundled Java helper classes and jars (relative to this file).
JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")

# OpenBabel descriptor names that are left out of OBDESCRIPTORS
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
# "Openbabel.<name>" => description, sorted by key
OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
  name,description = d.split(/\s+/,2)
  ["Openbabel."+name,description] unless obexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]

# Descriptor metadata is read from the YAML printed by the CdkDescriptorInfo helper
cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
# "Cdk.<class without 'Descriptor'>" => description, sorted by key
CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
# One entry per value a CDK descriptor reports: "Cdk.<class>.<value name>"
CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten

# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
# strip Joelib messages from stdout
JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
  # dots are escaped so that only the literal package prefix is removed
  name = d[:java_class].sub(/^joelib2\.feature\.types\./,'')
  # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
  ["Joelib."+name, "no description available"] unless joelibexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]

# All descriptor descriptions / all value names across the three libraries
DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
|
40
|
+
|
41
|
+
require_relative "unique_descriptors.rb"
|
42
|
+
|
43
|
+
# Description of available descriptors
# @param [String] descriptor qualified descriptor name; the part before the first
#   dot selects the library ("Openbabel", "Cdk", "Joelib" or "lookup")
# @return [String,nil] description text, nil if the library prefix or name is unknown
def self.description descriptor
  case descriptor.split('.').first
  when "Openbabel"
    OBDESCRIPTORS[descriptor]
  when "Cdk"
    # CDKDESCRIPTORS is keyed without the trailing value name, so drop the last segment
    CDKDESCRIPTORS[descriptor.split('.')[0..-2].join('.')]
  when "Joelib"
    JOELIBDESCRIPTORS[descriptor]
  when "lookup"
    "Read feature values from a dataset"
  end
end
|
58
|
+
|
59
|
+
# Match an array of smarts features
# @param compounds input accepted by parse (compound, array of compounds or dataset)
# @param smarts_features a single Feature or a collection of objects responding to #smarts
# @param [Boolean] count when true store the number of matches, otherwise 1 for a match
# @return the value of serialize (its shape depends on the class of compounds)
def self.smarts_match compounds, smarts_features, count=false
  bad_request_error "Compounds for smarts_match are empty" unless compounds
  bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
  parse compounds
  @count = count
  obconversion = OpenBabel::OBConversion.new
  obmol = OpenBabel::OBMol.new
  obconversion.set_in_format('smi')
  smarts_pattern = OpenBabel::OBSmartsPattern.new
  smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
  @smarts = smarts_features.collect{|f| f.smarts}
  @physchem_descriptors = nil # tells serialize that this is a smarts result
  @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
  @compounds.each_with_index do |compound,c|
    obconversion.read_string(obmol,compound.smiles)
    @smarts.each_with_index do |smart,s|
      smarts_pattern.init(smart)
      @data_entries[c][s] =
        if !smarts_pattern.match(obmol)
          0
        elsif count
          smarts_pattern.get_map_list.to_a.size
        else
          1
        end
    end
  end
  serialize
end
|
87
|
+
|
88
|
+
# Count matches of an array with smarts features
# Delegates to smarts_match with count enabled.
def self.smarts_count compounds, smarts
  # TODO: non-overlapping matches?
  smarts_match compounds,smarts,true
end
|
93
|
+
|
94
|
+
# Calculate physchem descriptors
# @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
# @param [Array<String>] descriptors qualified names "<Lib>.<descriptor>" where Lib is
#   Openbabel, Cdk or Joelib (case-insensitive)
# @return the value of serialize (its shape depends on the class of compounds)
# Calls bad_request_error when a descriptor names any other library.
def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
  parse compounds
  @data_entries = Array.new(@compounds.size){[]}
  @descriptors = descriptors
  @smarts = nil
  @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
  # group descriptor names by library, keeping first-seen library order
  by_lib = {}
  @descriptors.each do |d|
    lib, name = d.split(".",2)
    (by_lib[lib.downcase] ||= []) << name
  end
  # the library name is used as a method name below, so only accept the known calculators
  unknown = by_lib.keys - ["openbabel","cdk","joelib"]
  bad_request_error "Unknown descriptor libraries: #{unknown.join(', ')}" unless unknown.empty?
  by_lib.each do |lib,names|
    send(lib, names)
  end
  serialize
end
|
114
|
+
|
115
|
+
# Compute OpenBabel descriptors for @compounds.
# Values are written into @data_entries behind the columns already present and the
# qualified names ("Openbabel.<name>") are appended to @physchem_descriptors.
# @param [Array<String>] descriptors OpenBabel descriptor names without library prefix
def self.openbabel descriptors
  $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
  obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
  obmol = OpenBabel::OBMol.new
  obconversion = OpenBabel::OBConversion.new
  obconversion.set_in_format 'smi'
  # column offset: features computed by earlier calls come first
  last_feature_idx = @physchem_descriptors.size
  @compounds.each_with_index do |compound,c|
    obconversion.read_string obmol, compound.smiles
    obdescriptors.each_with_index do |descriptor,d|
      @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
    end
  end
  @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
end
|
130
|
+
|
131
|
+
# Compute descriptors with one of the bundled Java helpers (CDK or Joelib).
# Results are read from "<sdf file><lib>.yaml" (presumably written by the Java
# helper — it is not created anywhere in this file) and stored in @data_entries /
# @physchem_descriptors.
# @param [Array<String>] descriptors descriptor names passed to the Java program
# @param [String] lib "cdk" or "joelib"
def self.java_descriptors descriptors, lib
  $logger.debug "compute #{descriptors.size} #{lib} descriptors for #{@compounds.size} compounds"
  sdf = sdf_3d
  results = "#{sdf}#{lib}.yaml"
  begin
    # use java system call (rjb blocks within tasks)
    # use Tempfiles to avoid "Argument list too long" error
    case lib
    when "cdk"
      run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
    when "joelib"
      run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
    end
    # column offset: features computed by earlier calls come first
    last_feature_idx = @physchem_descriptors.size
    YAML.load_file(results).each_with_index do |calculation,i|
      # TODO create warnings
      #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
      # CDK Descriptors may calculate multiple values, they are stored in separate features
      @physchem_descriptors += calculation.keys if i == 0
      calculation.keys.each_with_index do |name,j|
        @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
      end
    end
  ensure
    # remove both temporary files, also when the external command or the YAML parsing failed
    FileUtils.rm_f [sdf, results]
  end
end
|
154
|
+
|
155
|
+
# Compute CDK descriptors (see java_descriptors).
def self.cdk descriptors
  java_descriptors descriptors, "cdk"
end
|
158
|
+
|
159
|
+
# Compute Joelib descriptors (see java_descriptors).
def self.joelib descriptors
  java_descriptors descriptors, "joelib"
end
|
162
|
+
|
163
|
+
# NOTE(review): this looks like an unfinished stub. It parses the input and builds
# one empty fingerprint row per compound, but the inner loop has no body, `dataset`
# is never read and the fingerprint is discarded. The return value is whatever
# `compounds.each` returns.
def self.lookup compounds, features, dataset
  parse compounds
  fingerprint = []
  compounds.each do |compound|
    fingerprint << []
    features.each do |feature|
      # TODO: read the feature value for this compound
    end
  end
end
|
172
|
+
|
173
|
+
# Run an external command, forwarding its output (stdout and stderr) to the debug log.
# @param [String] cmd command line; "2>&1" is appended
# @raise [RuntimeError] if the command exits with a non-zero status
def self.run_cmd cmd
  cmd = "#{cmd} 2>&1"
  $logger.debug "running external cmd: '#{cmd}'"
  IO.popen(cmd) do |io|
    while line = io.gets
      $logger.debug "> #{line.chomp}"
    end
  end
  # the block form of popen closes the pipe and sets $? before returning
  raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
end
|
184
|
+
|
185
|
+
# Write the SDF records of all @compounds into one temporary file.
# @return [String] path of the file ("/tmp/<uuid>.sdf"); this method does not delete it
def self.sdf_3d
  # TODO check if 3d sdfs are stored in GridFS
  sdf = @compounds.inject("") { |all, compound| all << compound.sdf }
  sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
  File.open(sdf_file,"w+"){|f| f.print sdf}
  sdf_file
end
|
195
|
+
|
196
|
+
# Normalise the input into @compounds and remember the kind of input in
# @input_class (read by serialize to decide the shape of the result).
# Matching is done with Class#=== so that subclasses of the three accepted
# classes are handled like their base class.
# @param [OpenTox::Compound,Array,OpenTox::Dataset] compounds
# Calls bad_request_error for any other input.
def self.parse compounds
  case compounds
  when OpenTox::Compound
    @input_class = "OpenTox::Compound"
    @compounds = [compounds]
  when Array
    @input_class = "Array"
    @compounds = compounds
  when OpenTox::Dataset
    @input_class = "OpenTox::Dataset"
    @compounds = compounds.compounds
  else
    @input_class = compounds.class.to_s
    bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
  end
end
|
209
|
+
|
210
|
+
# Convert @data_entries into the result type matching the original input
# (@input_class is set by parse). All non-nil values are rounded to 5 digits.
# @return [Array] the first row for a single compound, all rows for an Array
# @return [OpenTox::DescriptorDataset] a saved dataset for Dataset input
# @return [nil] for any other @input_class
def self.serialize
  @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
  case @input_class
  when "OpenTox::Compound"
    @data_entries.first
  when "Array"
    @data_entries
  when "OpenTox::Dataset"
    dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
    if @smarts
      dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
      # NOTE(review): the original stored data_entries only for physchem results, so smarts
      # datasets were saved without values; this looks like an oversight — confirm
      dataset.data_entries = @data_entries
      algo = @count ? "count" : "match"
      dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
    elsif @physchem_descriptors
      dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
      dataset.data_entries = @data_entries
      dataset.feature_calculation_algorithm = "#{self}.physchem"
      #TODO params?
    end
    dataset.save_all
    dataset
  end
end
|
234
|
+
|
235
|
+
# Normalise a raw descriptor value.
# One-element arrays are unwrapped, "NaN" becomes nil, numeric values are converted
# to Float (NaN and infinite results become nil); everything else is returned as is.
# Relies on #numeric? being available on the value (not defined in this file —
# presumably provided by the project's core extensions).
def self.fix_value val
  val = val.first if val.is_a?(Array) && val.size == 1
  val = nil if val == "NaN"
  if val.numeric?
    val = Float(val)
    val = nil if val.nan? || val.infinite?
  end
  val
end
|
244
|
+
private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
|
245
|
+
end
|
246
|
+
end
|
247
|
+
end
|
data/lib/error.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Mixin for exception classes: adds an HTTP status code, a shortened backtrace
# (stored in `cause`) and logs every error on creation.
module OpenToxError
  attr_accessor :http_code, :message, :cause

  # @param [String,nil] message error message; one leading and one trailing double quote are removed
  def initialize message=nil
    message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes
    super message
    @http_code ||= 500 # keep a code set by the including class before calling super
    @message = message.to_s
    @cause = cut_backtrace(caller)
    $logger.error("\n"+JSON.pretty_generate({
      :http_code => @http_code,
      :message => @message,
      :cause => @cause
    }))
  end

  # Shorten a backtrace to the frames before the first sinatra/minitest frame.
  # The full trace is returned if there is no such frame or if it is the first
  # one; non-Array input is returned unchanged.
  def cut_backtrace(trace)
    return trace unless trace.is_a?(Array)
    cut_index = trace.find_index{|line| line.match?(/sinatra|minitest/)} || trace.size
    cut_index -= 1
    cut_index = trace.size-1 if cut_index < 0
    trace[0..cut_index]
  end

end
|
29
|
+
|
30
|
+
# Reopens the core RuntimeError so that every RuntimeError carries the
# OpenToxError behaviour (http_code, shortened backtrace, logging).
class RuntimeError
  include OpenToxError
end
|
33
|
+
|
34
|
+
# clutters log file with library errors
|
35
|
+
#class NoMethodError
|
36
|
+
#include OpenToxError
|
37
|
+
#end
|
38
|
+
|
39
|
+
module OpenTox

  # Base class for OpenTox errors carrying an explicit HTTP status code.
  class Error < RuntimeError
    include OpenToxError

    # @param code HTTP status code stored in http_code
    # @param [String,nil] message error message
    def initialize(code, message=nil)
      @http_code = code # set before super so that OpenToxError#initialize keeps it
      super message
    end
  end

  # OpenTox errors
  # One error class and one global raising method per entry of
  # RestClientWrapper.known_errors (each entry provides :code, :class and :method).
  RestClientWrapper.known_errors.each do |error|
    # create error classes
    error_class = Class.new Error do
      define_method :initialize do |message=nil|
        super error[:code], message
      end
    end
    OpenTox.const_set error[:class],error_class

    # define global methods for raising errors, eg. bad_request_error
    # (uri and cause are accepted but not used)
    Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
      raise error_class.new(message)
    end
  end

end
|
data/lib/feature.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
module OpenTox

  # Basic feature class
  class Feature
    field :name, as: :title, type: String
    field :nominal, type: Boolean
    field :numeric, type: Boolean
    field :measured, type: Boolean
  end

  # Feature for categorical variables
  class NominalFeature < Feature
    # TODO check if accept_values are still needed
    field :accept_values, type: Array
    def initialize params
      super params
      # explicit receiver: a bare `nominal = true` would only create a local variable
      self.nominal = true
    end
  end

  # Feature for quantitative variables
  class NumericFeature < Feature
    def initialize params
      super params
      # explicit receiver: a bare `numeric = true` would only create a local variable
      self.numeric = true
    end
  end

  # Feature for SMARTS fragments
  class Smarts < NominalFeature
    field :smarts, type: String
    # Find or create the feature for a SMARTS string
    def self.from_smarts smarts
      self.find_or_create_by :smarts => smarts
    end
  end

  # Feature for supervised fragments from Fminer algorithm
  class FminerSmarts < Smarts
    field :p_value, type: Float
    # TODO check if effect is used
    field :effect, type: String
    field :dataset_id
  end

  # Feature for database fingerprints
  # needs count for efficient retrieval (see compound.rb)
  class FingerprintSmarts < Smarts
    field :count, type: Integer

    # Fingerprint features, cached in a class variable.
    # If the stored collection does not hold exactly 306 entries, the features are
    # (re)created from SMARTS_InteLigand.txt, which is read as "<name>: <smarts>" lines.
    def self.fingerprint
      @@fp4 ||= OpenTox::FingerprintSmarts.all
      unless @@fp4.size == 306
        @@fp4 = []
        # OpenBabel FP4 fingerprints
        # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
        # TODO investigate other types of fingerprints (MACCS)
        # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
        # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
        # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
        # Morgan ECFP, FCFP
        # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
        # http://www.rdkit.org/docs/GettingStartedInPython.html
        # Chemfp
        # https://chemfp.readthedocs.org/en/latest/using-tools.html
        # CACTVS/PubChem

        # File.foreach closes the file when done
        File.foreach(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")) do |l|
          l.strip!
          next if l.empty? || l.match(/^#/) # skip blank and comment lines
          name,smarts = l.split(': ')
          @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
        end
      end
      @@fp4
    end
  end

  # Feature for physico-chemical descriptors
  class PhysChemDescriptor < NumericFeature
    field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
    field :parameters, type: Hash
    field :creator, type: String
  end

  # Feature for categorical bioassay results
  class NominalBioAssay < NominalFeature
    # TODO: needed? move to dataset?
    field :description, type: String
  end

  # Feature for quantitative bioassay results
  class NumericBioAssay < NumericFeature
    # TODO: needed? move to dataset?
    field :description, type: String
  end

end
|
data/lib/lazar-model.rb
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
module Model
|
4
|
+
|
5
|
+
# Lazar model stored in the "models" collection.
class Lazar
  include OpenTox
  include Mongoid::Document
  include Mongoid::Timestamps
  store_in collection: "models"

  field :title, type: String
  field :creator, type: String, default: __FILE__
  # datasets
  field :training_dataset_id, type: BSON::ObjectId
  # algorithms
  field :prediction_algorithm, type: String
  field :neighbor_algorithm, type: String
  field :neighbor_algorithm_parameters, type: Hash
  # prediction feature
  field :prediction_feature_id, type: BSON::ObjectId

  attr_accessor :prediction_dataset
  attr_accessor :training_dataset

  # Create a lazar model from a training_dataset and a feature_dataset
  # @param [OpenTox::Dataset] training_dataset dataset with exactly one feature
  # @return [OpenTox::Model::Lazar] Regression or classification model
  def self.create training_dataset

    bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1

    # TODO document convention
    prediction_feature = training_dataset.features.first
    # nominal features get a classification model, everything else a regression model
    lazar = prediction_feature.nominal ? OpenTox::Model::LazarClassification.new : OpenTox::Model::LazarRegression.new
    lazar.training_dataset_id = training_dataset.id
    lazar.prediction_feature_id = prediction_feature.id
    lazar.title = prediction_feature.title

    lazar.save
    lazar
  end

  # Predict the prediction feature for one or more compounds.
  # @param [OpenTox::Compound,Array,OpenTox::Dataset] object compounds to predict
  #   (subclasses are accepted as well)
  # @return [Hash] single prediction including :neighbors for a Compound
  # @return [Array] one prediction per compound for an Array
  # @return [LazarPrediction] saved prediction dataset for a Dataset
  def predict object

    training_dataset = Dataset.find training_dataset_id
    prediction_feature = Feature.find prediction_feature_id

    # parse data
    compounds =
      case object
      when OpenTox::Compound then [object]
      when Array then object
      when OpenTox::Dataset then object.compounds
      else
        bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
      end

    # make predictions
    predictions = []
    neighbors = []
    rows_by_compound_id = nil # compound id => row indices in the training dataset, built on first use
    compounds.each do |compound|
      database_activities = training_dataset.values(compound,prediction_feature)
      if database_activities and !database_activities.empty?
        # compound is part of the training data: report the measured value(s) instead of predicting
        database_activities = database_activities.first if database_activities.size == 1
        predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
        next
      end
      neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
      # add activities
      rows_by_compound_id ||= training_dataset.compound_ids.each_index.group_by{|i| training_dataset.compound_ids[i]}
      neighbors.collect! do |n|
        rows = rows_by_compound_id.fetch(n.first, [])
        acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
        acts.empty? ? nil : n << acts
      end
      neighbors.compact! # remove neighbors without training activities
      predictions << Algorithm.run(prediction_algorithm, neighbors)
    end

    # serialize result
    case object
    when OpenTox::Compound
      prediction = predictions.first
      prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
      return prediction
    when Array
      return predictions
    when OpenTox::Dataset
      # prepare prediction dataset
      prediction_dataset = LazarPrediction.new(
        :title => "Lazar prediction for #{prediction_feature.title}",
        :creator => __FILE__,
        :prediction_feature_id => prediction_feature.id
      )
      confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
      # TODO move into warnings field
      warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
      prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
      prediction_dataset.compounds = compounds
      prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
      prediction_dataset.save_all
      return prediction_dataset
    end

  end

  # Activities of the prediction feature, one entry per row of the training dataset.
  # Uses the training_dataset accessor when it has been assigned, otherwise loads
  # the dataset by training_dataset_id.
  def training_activities
    dataset = training_dataset || Dataset.find(training_dataset_id)
    i = dataset.feature_ids.index prediction_feature_id
    dataset.data_entries.collect{|de| de[i]}
  end

end
|
121
|
+
|
122
|
+
# Lazar model with classification defaults: weighted majority vote on
# fingerprint-similarity neighbors with a minimum similarity of 0.7.
class LazarClassification < Lazar
  def initialize
    super
    self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
    self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
    self.neighbor_algorithm_parameters = {:min_sim => 0.7}
  end
end
|
130
|
+
|
131
|
+
# Classification model whose neighbors are determined with Fminer (BBRC) features.
class LazarFminerClassification < LazarClassification

  # Create the model via Lazar.create, switch the stored record to this class and
  # configure fminer similarity on a BBRC feature dataset mined from the training data.
  # @param [OpenTox::Dataset] training_dataset
  # @return [LazarFminerClassification] the saved model
  def self.create training_dataset
    model = super(training_dataset)
    model.update "_type" => self.to_s # adjust class
    model = self.find model.id # adjust class
    model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
    model.neighbor_algorithm_parameters = {
      :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
      :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
      :min_sim => 0.3
    }
    model.save
    model
  end
end
|
147
|
+
|
148
|
+
# Lazar model with regression defaults: weighted average on
# fingerprint-similarity neighbors with a minimum similarity of 0.7.
class LazarRegression < Lazar

  def initialize
    super
    self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
    self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
    self.neighbor_algorithm_parameters = {:min_sim => 0.7}
  end

end
|
158
|
+
|
159
|
+
# Lazar model with additional descriptive fields and references to a model
# and a crossvalidation.
class PredictionModel < Lazar
  field :category, type: String
  field :endpoint, type: String
  field :unit, type: String
  field :model_id, type: BSON::ObjectId
  field :crossvalidation_id, type: BSON::ObjectId
end
|
166
|
+
|
167
|
+
end
|
168
|
+
|
169
|
+
end
|
170
|
+
|
data/lib/lazar.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require "bundler/setup"
|
3
|
+
require "rest-client"
|
4
|
+
require 'yaml'
|
5
|
+
require 'json'
|
6
|
+
require 'logger'
|
7
|
+
require 'mongoid'
|
8
|
+
require 'rserve'
|
9
|
+
require "nokogiri"
|
10
|
+
require "base64"
|
11
|
+
|
12
|
+
|
13
|
+
# Mongo setup
# TODO retrieve correct environment from Rack/Sinatra
ENV["MONGOID_ENV"] ||= "development"
# TODO remove config files, change default via ENV or directly in Mongoid class
Mongoid.load!(File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml')))
# TODO get Mongo::Client from Mongoid
$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
# TODO same for GridFS
$gridfs = $mongo.database.fs

# R setup
R = Rserve::Connection.new

# Logger setup
$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
$logger.level = Logger::DEBUG
Mongo::Logger.logger = $logger
Mongo::Logger.level = Logger::WARN
#Mongoid.logger = $logger

# Require sub-Repositories
require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
require_relative '../libfminer/liblast/last' #
require_relative '../last-utils/lu.rb'
require_relative '../openbabel/lib/openbabel'

# Fminer environment variables
ENV['FMINER_SMARTS'] = 'true'
ENV['FMINER_NO_AROMATIC'] = 'true'
ENV['FMINER_PVALUES'] = 'true'
ENV['FMINER_SILENT'] = 'true'
ENV['FMINER_NR_HITS'] = 'true'

# OpenTox classes and includes
CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules

[ # be aware of the require sequence as it affects class/method overwrites
  "overwrite.rb",
  "rest-client-wrapper.rb",
  "error.rb",
  "opentox.rb",
  "feature.rb",
  "compound.rb",
  "dataset.rb",
  "descriptor.rb", # was listed a second time after algorithm.rb; require_relative loads a file only once
  "algorithm.rb",
  "bbrc.rb",
  "lazar-model.rb",
  "similarity.rb",
  "neighbor.rb",
  "classification.rb",
  "regression.rb",
  "validation.rb",
  "crossvalidation.rb",
].each{ |f| require_relative f }
|
69
|
+
|
data/lib/neighbor.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module OpenTox
  module Algorithm
    # Neighbor search algorithms
    class Neighbor

      # Neighbors according to the compound's own neighbor search.
      # @param compound object responding to #neighbors
      # @param [Hash] params :min_sim is passed on to compound.neighbors
      def self.fingerprint_similarity compound, params={}
        compound.neighbors params[:min_sim]
      end

      # Neighbors according to Tanimoto similarity of smarts fingerprints.
      # @param compound query compound
      # @param [Hash] params :feature_dataset_id (dataset with the fingerprints) and
      #   :min_sim (only entries with a strictly greater similarity are kept)
      # @return [Array] pairs of [compound id, similarity]
      def self.fminer_similarity compound, params
        feature_dataset = Dataset.find params[:feature_dataset_id]
        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
        neighbors = []

        # find neighbors
        feature_dataset.data_entries.each_with_index do |fingerprint, i|
          sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
          # use compound_ids, instantiation of Compounds is too time consuming
          neighbors << [feature_dataset.compound_ids[i],sim] if sim > params[:min_sim]
        end
        neighbors
      end
    end
  end
end
|
data/lib/opentox.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module OpenTox

  # Ruby interface

  # create default OpenTox classes (defined in opentox-client.rb)
  # provides Mongoid's query and persistence methods
  # http://mongoid.org/en/mongoid/docs/persistence.html
  # http://mongoid.org/en/mongoid/docs/querying.html
  # Each name in CLASSES becomes a Mongoid document class OpenTox::<Name>, stored in
  # the collection named after the downcased, pluralized class name.
  CLASSES.each do |klass|
    document_class = Class.new do
      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
      store_in collection: klass.downcase.pluralize
      field :title, as: :name, type: String
    end
    OpenTox.const_set klass,document_class
  end

end
|
22
|
+
|