lazar 0.0.7 → 0.0.9
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/bbrc.rb
DELETED
@@ -1,165 +0,0 @@
-module OpenTox
-  module Algorithm
-    class Fminer
-      TABLE_OF_ELEMENTS = [
-        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
-
-      #
-      # Run bbrc algorithm on dataset
-      #
-      # @param [OpenTox::Dataset] training dataset
-      # @param [optional] parameters BBRC parameters, accepted parameters are
-      # - min_frequency Minimum frequency (default 5)
-      # - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
-      # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
-      # - min_chisq_significance Significance threshold (between 0 and 1)
-      # - nr_hits Set to "true" to get hit count instead of presence
-      # - get_target Set to "true" to obtain target variable as feature
-      # @return [OpenTox::Dataset] Fminer Dataset
-      def self.bbrc training_dataset, params={}
-
-        time = Time.now
-        bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
-
-        prediction_feature = training_dataset.features.first
-        if params[:min_frequency]
-          minfreq = params[:min_frequency]
-        else
-          per_mil = 5 # value from latest version
-          per_mil = 8 # as suggested below
-          i = training_dataset.feature_ids.index prediction_feature.id
-          nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
-          minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
-          minfreq = 2 unless minfreq > 2
-          minfreq = minfreq.round
-        end
-
-        @bbrc ||= Bbrc::Bbrc.new
-        @bbrc.Reset
-        if prediction_feature.numeric
-          @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
-        else
-          bad_request_error "No accept values for "\
-            "dataset '#{training_dataset.id}' and "\
-            "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
-          value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
-        end
-        @bbrc.SetMinfreq(minfreq)
-        @bbrc.SetType(1) if params[:feature_type] == "paths"
-        @bbrc.SetBackbone(false) if params[:backbone] == "false"
-        @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
-        @bbrc.SetConsoleOut(false)
-
-        params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false
-        feature_dataset = FminerDataset.new(
-          :training_dataset_id => training_dataset.id,
-          :training_algorithm => "#{self.to_s}.bbrc",
-          :training_feature_id => prediction_feature.id ,
-          :training_parameters => {
-            :min_frequency => minfreq,
-            :nr_hits => nr_hits,
-            :backbone => (params[:backbone] == false ? false : true)
-          }
-
-        )
-        feature_dataset.compounds = training_dataset.compounds
-
-        # add data
-        training_dataset.compounds.each_with_index do |compound,i|
-          act = value2act[training_dataset.data_entries[i].first]
-          if act # TODO check if this works
-            @bbrc.AddCompound(compound.smiles,i+1)
-            @bbrc.AddActivity(act,i+1)
-          end
-        end
-        #g_median=@fminer.all_activities.values.to_scale.median
-
-        #task.progress 10
-        #step_width = 80 / @bbrc.GetNoRootNodes().to_f
-
-        $logger.debug "BBRC setup: #{Time.now-time}"
-        time = Time.now
-        ftime = 0
-        itime = 0
-        rtime = 0
-
-        # run @bbrc
-        (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
-          results = @bbrc.MineRoot(j)
-          results.each do |result|
-            rt = Time.now
-            f = YAML.load(result)[0]
-            smarts = f.shift
-            # convert fminer SMARTS representation into a more human readable format
-            smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
-              element = TABLE_OF_ELEMENTS[$1.to_i-1]
-              $2 == "a" ? element.downcase : element
-            end
-            p_value = f.shift
-            f.flatten!
-            compound_idxs = f.collect{|e| e.first.first-1}
-            # majority class
-            effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode
-
-=begin
-            if (!@bbrc.GetRegression)
-              id_arrs = f[2..-1].flatten
-              max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
-              effect = max+1
-            else #regression part
-              id_arrs = f[2]
-              # DV: effect calculation
-              f_arr=Array.new
-              f[2].each do |id|
-                id=id.keys[0] # extract id from hit count hash
-                f_arr.push(@fminer.all_activities[id])
-              end
-              f_median=f_arr.to_scale.median
-              if g_median >= f_median
-                effect = 'activating'
-              else
-                effect = 'deactivating'
-              end
-            end
-=end
-            rtime += Time.now - rt
-
-            ft = Time.now
-            feature = OpenTox::FminerSmarts.find_or_create_by({
-              "smarts" => smarts,
-              "p_value" => p_value.to_f.abs.round(5),
-              "effect" => effect,
-              "dataset_id" => feature_dataset.id
-            })
-            feature_dataset.feature_ids << feature.id
-            ftime += Time.now - ft
-
-            it = Time.now
-            f.each do |id_count_hash|
-              id_count_hash.each do |id,count|
-                nr_hits ? count = count.to_i : count = 1
-                feature_dataset.data_entries[id-1] ||= []
-                feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count
-              end
-            end
-            itime += Time.now - it
-
-          end
-        end
-
-        $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
-        time = Time.now
-
-        feature_dataset.fill_nil_with 0
-
-        $logger.debug "Prepare save: #{Time.now-time}"
-        time = Time.now
-        feature_dataset.save_all
-
-        $logger.debug "Save: #{Time.now-time}"
-        feature_dataset
-
-      end
-    end
-  end
-end
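For context, the removed `Fminer.bbrc` derived its default minimum frequency from the number of labeled compounds when no `:min_frequency` parameter was passed. The sketch below only restates that heuristic (8 occurrences per 1000 labeled compounds, never below 2, rounded); the helper name `default_minfreq` is made up for illustration and is not gem API.

```ruby
# Sketch of the per-mil minimum-frequency default used by the removed
# Fminer.bbrc (helper name is hypothetical, not part of the gem).
def default_minfreq(nr_labeled_compounds, per_mil = 8)
  minfreq = per_mil * nr_labeled_compounds.to_f / 1000.0
  minfreq = 2 unless minfreq > 2
  minfreq.round
end

default_minfreq(85)   # => 2  (small datasets fall back to the floor of 2)
default_minfreq(1200) # => 10
```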
data/lib/descriptor.rb
DELETED
@@ -1,247 +0,0 @@
-require 'digest/md5'
-ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
-# TODO store descriptors in mongodb
-
-module OpenTox
-
-  module Algorithm
-
-    # Class for descriptor calculations
-    class Descriptor
-      include OpenTox
-
-      JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
-      CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
-      JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
-      LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
-      JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
-
-      obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
-      OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
-        name,description = d.split(/\s+/,2)
-        ["Openbabel."+name,description] unless obexclude.include? name
-      end.compact.sort{|a,b| a[0] <=> b[0]}]
-
-      cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
-      CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
-      CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
-
-      # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
-      joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
-      # strip Joelib messages from stdout
-      JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
-        name = d[:java_class].sub(/^joelib2.feature.types./,'')
-        # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
-        ["Joelib."+name, "no description available"] unless joelibexclude.include? name
-      end.compact.sort{|a,b| a[0] <=> b[0]}]
-
-      DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
-      DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
-
-      require_relative "unique_descriptors.rb"
-
-      # Description of available descriptors
-      def self.description descriptor
-        lib = descriptor.split('.').first
-        case lib
-        when "Openbabel"
-          OBDESCRIPTORS[descriptor]
-        when "Cdk"
-          name = descriptor.split('.')[0..-2].join('.')
-          CDKDESCRIPTORS[name]
-        when "Joelib"
-          JOELIBDESCRIPTORS[descriptor]
-        when "lookup"
-          "Read feature values from a dataset"
-        end
-      end
-
-      # Match an array of smarts features
-      def self.smarts_match compounds, smarts_features, count=false
-        bad_request_error "Compounds for smarts_match are empty" unless compounds
-        bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
-        parse compounds
-        @count = count
-        obconversion = OpenBabel::OBConversion.new
-        obmol = OpenBabel::OBMol.new
-        obconversion.set_in_format('smi')
-        smarts_pattern = OpenBabel::OBSmartsPattern.new
-        smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
-        @smarts = smarts_features.collect{|f| f.smarts}
-        @physchem_descriptors = nil
-        @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
-        @compounds.each_with_index do |compound,c|
-          obconversion.read_string(obmol,compound.smiles)
-          @smarts.each_with_index do |smart,s|
-            smarts_pattern.init(smart)
-            if smarts_pattern.match(obmol)
-              count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
-            else
-              value = 0
-            end
-            @data_entries[c][s] = value
-          end
-        end
-        serialize
-      end
-
-      # Count matches of an array with smarts features
-      def self.smarts_count compounds, smarts
-        # TODO: non-overlapping matches?
-        smarts_match compounds,smarts,true
-      end
-
-      # Calculate physchem descriptors
-      # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
-      def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
-        parse compounds
-        @data_entries = Array.new(@compounds.size){[]}
-        @descriptors = descriptors
-        @smarts = nil
-        @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
-        des = {}
-        @descriptors.each do |d|
-          lib, descriptor = d.split(".",2)
-          lib = lib.downcase.to_sym
-          des[lib] ||= []
-          des[lib] << descriptor
-        end
-        des.each do |lib,descriptors|
-          send(lib, descriptors)
-        end
-        serialize
-      end
-
-      def self.openbabel descriptors
-        $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
-        obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
-        obmol = OpenBabel::OBMol.new
-        obconversion = OpenBabel::OBConversion.new
-        obconversion.set_in_format 'smi'
-        last_feature_idx = @physchem_descriptors.size
-        @compounds.each_with_index do |compound,c|
-          obconversion.read_string obmol, compound.smiles
-          obdescriptors.each_with_index do |descriptor,d|
-            @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
-          end
-        end
-        @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
-      end
-
-      def self.java_descriptors descriptors, lib
-        $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds"
-        sdf = sdf_3d
-        # use java system call (rjb blocks within tasks)
-        # use Tempfiles to avoid "Argument list too long" error
-        case lib
-        when "cdk"
-          run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
-        when "joelib"
-          run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
-        end
-        last_feature_idx = @physchem_descriptors.size
-        YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
-          # TODO create warnings
-          #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
-          # CDK Descriptors may calculate multiple values, they are stored in separate features
-          @physchem_descriptors += calculation.keys if i == 0
-          calculation.keys.each_with_index do |name,j|
-            @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
-          end
-        end
-        FileUtils.rm "#{sdf}#{lib}.yaml"
-      end
-
-      def self.cdk descriptors
-        java_descriptors descriptors, "cdk"
-      end
-
-      def self.joelib descriptors
-        java_descriptors descriptors, "joelib"
-      end
-
-      def self.lookup compounds, features, dataset
-        parse compounds
-        fingerprint = []
-        compounds.each do |compound|
-          fingerprint << []
-          features.each do |feature|
-          end
-        end
-      end
-
-      def self.run_cmd cmd
-        cmd = "#{cmd} 2>&1"
-        $logger.debug "running external cmd: '#{cmd}'"
-        p = IO.popen(cmd) do |io|
-          while line = io.gets
-            $logger.debug "> #{line.chomp}"
-          end
-          io.close
-          raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
-        end
-      end
-
-      def self.sdf_3d
-        # TODO check if 3d sdfs are stored in GridFS
-        sdf = ""
-        @compounds.each do |compound|
-          sdf << compound.sdf
-        end
-        sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
-        File.open(sdf_file,"w+"){|f| f.print sdf}
-        sdf_file
-      end
-
-      def self.parse compounds
-        @input_class = compounds.class.to_s
-        case @input_class
-        when "OpenTox::Compound"
-          @compounds = [compounds]
-        when "Array"
-          @compounds = compounds
-        when "OpenTox::Dataset"
-          @compounds = compounds.compounds
-        else
-          bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
-        end
-      end
-
-      def self.serialize
-        @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
-        case @input_class
-        when "OpenTox::Compound"
-          @data_entries.first
-        when "Array"
-          @data_entries
-        when "OpenTox::Dataset"
-          dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
-          if @smarts
-            dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
-            @count ? algo = "count" : algo = "match"
-            dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
-
-          elsif @physchem_descriptors
-            dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
-            dataset.data_entries = @data_entries
-            dataset.feature_calculation_algorithm = "#{self}.physchem"
-            #TODO params?
-          end
-          dataset.save_all
-          dataset
-        end
-      end
-
-      def self.fix_value val
-        val = val.first if val.is_a? Array and val.size == 1
-        val = nil if val == "NaN"
-        if val.numeric?
-          val = Float(val)
-          val = nil if val.nan? or val.infinite?
-        end
-        val
-      end
-      private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
-    end
-  end
-end
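The core of the removed `Descriptor.smarts_match` is a plain OpenBabel SMARTS matching loop. Below is a minimal, self-contained sketch of that loop using the same openbabel Ruby binding calls that appear in the diff above; the function name and its plain-string arguments are illustrative only, not gem API.

```ruby
# Minimal sketch of the SMARTS matching loop from the removed
# Descriptor.smarts_match, using the openbabel Ruby bindings directly.
require 'openbabel'

def smarts_fingerprint(smiles, smarts_list, count = false)
  obconversion = OpenBabel::OBConversion.new
  obconversion.set_in_format('smi')
  obmol = OpenBabel::OBMol.new
  obconversion.read_string(obmol, smiles)
  pattern = OpenBabel::OBSmartsPattern.new
  smarts_list.collect do |smarts|
    pattern.init(smarts)
    if pattern.match(obmol)
      count ? pattern.get_map_list.to_a.size : 1   # hit count or presence
    else
      0
    end
  end
end

smarts_fingerprint("c1ccccc1O", ["c1ccccc1", "[OX2H]"]) # e.g. [1, 1] for phenol
```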
data/lib/neighbor.rb
DELETED
@@ -1,25 +0,0 @@
-module OpenTox
-  module Algorithm
-    class Neighbor
-
-      def self.fingerprint_similarity compound, params={}
-        compound.neighbors params[:min_sim]
-      end
-
-      def self.fminer_similarity compound, params
-        feature_dataset = Dataset.find params[:feature_dataset_id]
-        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
-        neighbors = []
-
-        # find neighbors
-        feature_dataset.data_entries.each_with_index do |fingerprint, i|
-          sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
-          if sim > params[:min_sim]
-            neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
-          end
-        end
-        neighbors
-      end
-    end
-  end
-end
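The removed `Neighbor.fminer_similarity` boils down to a threshold filter over precomputed fingerprints: compute the similarity of every training fingerprint to the query and keep the compound ids whose similarity exceeds `:min_sim`. A self-contained sketch of that pattern follows; the similarity measure is supplied as a block and all names are illustrative.

```ruby
# Sketch of the threshold-based neighbor search from the removed
# Neighbor.fminer_similarity (names are hypothetical, not gem API).
def find_neighbors(query_fingerprint, fingerprints, compound_ids, min_sim)
  neighbors = []
  fingerprints.each_with_index do |fingerprint, i|
    sim = yield(fingerprint, query_fingerprint)
    # keep compound ids instead of instantiating Compound objects (cheaper)
    neighbors << [compound_ids[i], sim] if sim > min_sim
  end
  neighbors
end

# e.g. find_neighbors(query, fps, ids, 0.3) { |a, b| tanimoto(a, b) }
```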
data/lib/similarity.rb
DELETED
@@ -1,58 +0,0 @@
-=begin
-* Name: similarity.rb
-* Description: Similarity algorithms
-* Author: Andreas Maunz <andreas@maunz.de
-* Date: 10/2012
-=end
-
-module OpenTox
-  module Algorithm
-
-    class Similarity
-
-      #TODO weighted tanimoto
-
-      # Tanimoto similarity
-      # @param [Array] a fingerprints of first compound
-      # @param [Array] b fingerprints of second compound
-      # @return [Float] Tanimoto similarity
-      def self.tanimoto(a,b)
-        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
-        #common = 0.0
-        #a.each_with_index do |n,i|
-          #common += 1 if n == b[i]
-        #end
-        #common/a.size
-        # TODO check if calculation speed can be improved
-        common_p_sum = 0.0
-        all_p_sum = 0.0
-        (0...a.size).each { |idx|
-          common_p_sum += [ a[idx], b[idx] ].min
-          all_p_sum += [ a[idx], b[idx] ].max
-        }
-        common_p_sum/all_p_sum
-      end
-
-
-      # Cosine similarity
-      # @param [Array] a fingerprints of first compound
-      # @param [Array] b fingerprints of second compound
-      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
-      def self.cosine(a, b)
-        val = 0.0
-        if a.size>0 and b.size>0
-          if a.size>12 && b.size>12
-            a = a[0..11]
-            b = b[0..11]
-          end
-          a_vec = a.to_gv
-          b_vec = b.to_gv
-          val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
-        end
-        val
-      end
-
-    end
-
-  end
-end
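The removed `Similarity.tanimoto` implements the count-based (min/max) form of the Tanimoto coefficient rather than the set-based one. A standalone restatement with a small worked example, for reference only:

```ruby
# Count-based Tanimoto similarity as in the removed Similarity.tanimoto:
# sum of element-wise minima divided by sum of element-wise maxima.
def tanimoto(a, b)
  raise ArgumentError, "fingerprints must have equal size" unless a.size == b.size
  common_p_sum = 0.0
  all_p_sum = 0.0
  a.each_index do |i|
    common_p_sum += [a[i], b[i]].min
    all_p_sum    += [a[i], b[i]].max
  end
  common_p_sum / all_p_sum
end

tanimoto([1, 0, 1, 1], [1, 1, 0, 1]) # => 0.5  (min sum 2 / max sum 4)
```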
data/mongoid.yml
DELETED
data/test/descriptor-long.rb
DELETED
@@ -1,26 +0,0 @@
-require_relative "setup.rb"
-class DescriptorLongTest < MiniTest::Test
-
-  def test_dataset_all
-    # TODO: improve CDK descriptor calculation speed or add timeout
-    skip "CDK descriptor calculation takes too long for some compounds"
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
-    d = OpenTox::Algorithm::Descriptor.physchem dataset
-    assert_equal dataset.compounds, d.compounds
-    assert_equal 332, d.features.size
-    assert_equal 332, d.data_entries.first.size
-    d.delete
-  end
-
-  def test_dataset_openbabel
-    # TODO: improve CDK descriptor calculation speed or add timeout
-    dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
-    d = Algorithm::Descriptor.physchem dataset, Algorithm::Descriptor::OBDESCRIPTORS.keys
-    assert_equal dataset.compounds, d.compounds
-    size = Algorithm::Descriptor::OBDESCRIPTORS.keys.size
-    assert_equal size, d.features.size
-    assert_equal size, d.data_entries.first.size
-    d.delete
-  end
-
-end
data/test/fminer-long.rb
DELETED
@@ -1,38 +0,0 @@
-require_relative "setup.rb"
-
-class FminerTest < MiniTest::Test
-
-  def test_fminer_multicell
-    #skip "multicell segfaults"
-    # TODO aborts, probably fminer
-    # or OpenBabel segfault
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
-    p feature_dataset.training_parameters
-    assert_equal dataset.compound_ids, feature_dataset.compound_ids
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_isscan
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
-    assert_equal feature_dataset.compounds.size, dataset.compounds.size
-    p feature_dataset.features.size
-    p feature_dataset.training_parameters
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_kazius
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
-    # TODO reactivate default settings
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
-    assert_equal feature_dataset.compounds.size, dataset.compounds.size
-    feature_dataset = Dataset.find feature_dataset.id
-    assert feature_dataset.data_entries.size, dataset.compounds.size
-    dataset.delete
-    feature_dataset.delete
-  end
-
-end
data/test/fminer.rb
DELETED
@@ -1,52 +0,0 @@
-require_relative "setup.rb"
-
-class FminerTest < MiniTest::Test
-
-  def test_fminer_bbrc
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    refute_nil dataset.id
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
-    feature_dataset = Dataset.find feature_dataset.id
-    assert_equal dataset.compounds.size, feature_dataset.compounds.size
-    # TODO: fminer calculates 62 instead of 54 features
-    # it is unclear which commit changed the numbers (occurs with old libraries/mongodb branch too
-    # modification of Compound to use smiles instead of inchis seems to have no effect
-    #assert_equal 54, feature_dataset.features.size
-    #assert_equal "C-C-C=C", feature_dataset.features.first.smarts
-    compounds = feature_dataset.compounds
-    smarts = feature_dataset.features
-    smarts.each do |smart|
-      assert smart.p_value.round(2) >= 0.95
-    end
-    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
-    feature_dataset.data_entries.each_with_index do |fingerprint,i|
-      assert_equal match[i], fingerprint
-    end
-
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_last
-    skip "last features have to be activated"
-    dataset = OpenTox::Dataset.new
-    dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
-    assert_equal dataset.compounds.size, feature_dataset.compounds.size
-    assert_equal 21, feature_dataset.features.size
-    assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
-
-    compounds = feature_dataset.compounds
-    smarts = feature_dataset.features.collect{|f| f.smarts}
-    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
-    compounds.each_with_index do |c,i|
-      smarts.each_with_index do |s,j|
-        assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
-      end
-    end
-
-    dataset.delete
-    feature_dataset.delete
-  end
-
-end
data/test/lazar-fminer.rb
DELETED
@@ -1,50 +0,0 @@
-require_relative "setup.rb"
-
-class LazarFminerTest < MiniTest::Test
-
-  def test_lazar_fminer
-    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
-    feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
-    assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
-    #TODO check fminer features, see fminer.rb
-    #assert_equal 54, feature_dataset.features.size
-    feature_dataset.data_entries.each do |e|
-      assert_equal e.size, feature_dataset.features.size
-    end
-    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
-
-    [ {
-      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
-      :prediction => "false",
-      :confidence => 0.25281385281385277,
-      :nr_neighbors => 11
-    },{
-      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
-      :prediction => "false",
-      :confidence => 0.3639589577089577,
-      :nr_neighbors => 14
-    }, {
-      :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
-      :prediction => "false",
-      :confidence => 0.5555555555555556,
-      :nr_neighbors => 1
-    }].each do |example|
-      prediction = model.predict example[:compound]
-
-      assert_equal example[:prediction], prediction[:value]
-      #assert_equal example[:confidence], prediction[:confidence]
-      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
-    end
-
-    # make a dataset prediction
-    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
-    prediction = model.predict compound_dataset
-    assert_equal compound_dataset.compounds, prediction.compounds
-
-    assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
-    assert_equal "measured", prediction.data_entries[14][1]
-    # cleanup
-    [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete}
-  end
-end