lazar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/algorithm.rb
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module OpenTox

  module Algorithm

    # Generic method to execute algorithms
    # Algorithms should:
    #   - accept a Compound, an Array of Compounds or a Dataset as first argument
    #   - optional parameters as second argument
    #   - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values)
    #
    # The algorithm is addressed as "Fully::Qualified::Class.method"; the part
    # before the first dot is resolved with Object.const_get and the part after
    # it is invoked as a public class method.
    #
    # @param [String] algorithm Name of the form "OpenTox::Algorithm::Klass.method"
    # @param [OpenTox::Compound,Array,OpenTox::Dataset] object Input object
    # @param [Hash] parameters Algorithm parameters (omitted from the call when nil)
    # @return Algorithm result
    def self.run algorithm, object, parameters=nil
      name = algorithm.to_s
      # \A (not ^) so that a multi-line string cannot pass the namespace check
      # on a later line.
      bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless name =~ /\AOpenTox::Algorithm/
      klass,method = name.split('.')
      bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." if method.nil? or method.empty?
      arguments = parameters.nil? ? [object] : [object, parameters]
      # public_send: the name may originate from stored/user data, so never
      # reach private methods through it.
      Object.const_get(klass).public_send(method, *arguments)
    end

  end
end
|
|
21
|
+
|
data/lib/bbrc.rb
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
module OpenTox
  module Algorithm
    # Substructure feature mining on top of the native Bbrc miner.
    class Fminer
      # Element symbols indexed by (atomic number - 1); used to turn the
      # miner's "[#6&a]" style atoms into readable SMARTS.
      TABLE_OF_ELEMENTS = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"].freeze

      #
      # Run bbrc algorithm on dataset
      #
      # @param [OpenTox::Dataset] training dataset (must have exactly one feature)
      # @param [optional] parameters BBRC parameters, accepted parameters are
      #   - min_frequency  Minimum frequency (default: 8 per mil of the labeled compounds, but at least 2)
      #   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
      #   - backbone BBRC classes, pass 'false' (or false) to switch off mining for BBRC representatives. (default "true")
      #   - min_chisq_significance Significance threshold (between 0 and 1)
      #   - nr_hits Set to "true" to get hit count instead of presence
      #   - get_target Set to "true" to obtain target variable as feature
      # @return [OpenTox::Dataset] Fminer Dataset
      def self.bbrc training_dataset, params={}

        time = Time.now
        bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1

        prediction_feature = training_dataset.features.first
        if params[:min_frequency]
          minfreq = params[:min_frequency]
        else
          per_mil = 8 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
          i = training_dataset.feature_ids.index prediction_feature.id
          nr_labeled_cmpds = training_dataset.data_entries.count{|de| !de[i].nil?}
          minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0
          minfreq = 2 unless minfreq > 2
          minfreq = minfreq.round
        end

        # Both the String "false" and the boolean false switch the backbone
        # off, so that the miner setting and the stored training parameter
        # always agree.
        backbone = ![false, "false"].include?(params[:backbone])

        @bbrc ||= Bbrc::Bbrc.new
        @bbrc.Reset
        value2act = nil # stays nil for regression: activities are passed as numbers
        if prediction_feature.numeric
          @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
        else
          bad_request_error "No accept values for "\
                            "dataset '#{training_dataset.id}' and "\
                            "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
          # accept value => position in accept_values
          value2act = Hash[prediction_feature.accept_values.each_with_index.to_a]
        end
        @bbrc.SetMinfreq(minfreq)
        @bbrc.SetType(1) if params[:feature_type] == "paths"
        @bbrc.SetBackbone(false) unless backbone
        @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
        @bbrc.SetConsoleOut(false)

        nr_hits = params[:nr_hits] || false
        feature_dataset = FminerDataset.new(
          :training_dataset_id => training_dataset.id,
          :training_algorithm => "#{self.to_s}.bbrc",
          :training_feature_id => prediction_feature.id ,
          :training_parameters => {
            :min_frequency => minfreq,
            :nr_hits => nr_hits,
            :backbone => backbone
          }
        )
        feature_dataset.compounds = training_dataset.compounds

        # add data (miner ids are 1-based compound positions)
        training_dataset.compounds.each_with_index do |compound,idx|
          value = training_dataset.data_entries[idx].first
          if value2act
            act = value2act[value]
          else
            # NOTE(review): regression activities are handed over as Float;
            # confirm against the Bbrc binding's AddActivity signature.
            act = value.nil? ? nil : value.to_f
          end
          next if act.nil? # unlabeled or unknown value
          @bbrc.AddCompound(compound.smiles,idx+1)
          @bbrc.AddActivity(act,idx+1)
        end
        #g_median=@fminer.all_activities.values.to_scale.median

        #task.progress 10
        #step_width = 80 / @bbrc.GetNoRootNodes().to_f

        $logger.debug "BBRC setup: #{Time.now-time}"
        time = Time.now
        ftime = 0
        itime = 0
        rtime = 0

        # run @bbrc
        (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
          results = @bbrc.MineRoot(j)
          results.each do |result|
            rt = Time.now
            f = YAML.load(result)[0]
            smarts = f.shift
            # convert fminer SMARTS representation into a more human readable format
            smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
              element = TABLE_OF_ELEMENTS[$1.to_i-1]
              $2 == "a" ? element.downcase : element
            end
            p_value = f.shift
            f.flatten!
            compound_idxs = f.collect{|e| e.first.first-1}
            # majority class
            effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode

=begin
            if (!@bbrc.GetRegression)
              id_arrs = f[2..-1].flatten
              max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
              effect = max+1
            else #regression part
              id_arrs = f[2]
              # DV: effect calculation
              f_arr=Array.new
              f[2].each do |id|
                id=id.keys[0] # extract id from hit count hash
                f_arr.push(@fminer.all_activities[id])
              end
              f_median=f_arr.to_scale.median
              if g_median >= f_median
                effect = 'activating'
              else
                effect = 'deactivating'
              end
            end
=end
            rtime += Time.now - rt

            ft = Time.now
            feature = OpenTox::FminerSmarts.find_or_create_by({
              "smarts" => smarts,
              "p_value" => p_value.to_f.abs.round(5),
              "effect" => effect,
              "dataset_id" => feature_dataset.id
            })
            feature_dataset.feature_ids << feature.id
            ftime += Time.now - ft

            it = Time.now
            # column of the feature that was just appended
            column = feature_dataset.feature_ids.size-1
            f.each do |id_count_hash|
              id_count_hash.each do |id,count|
                count = nr_hits ? count.to_i : 1
                feature_dataset.data_entries[id-1] ||= []
                feature_dataset.data_entries[id-1][column] = count
              end
            end
            itime += Time.now - it

          end
        end

        $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
        time = Time.now

        feature_dataset.fill_nil_with 0

        $logger.debug "Prepare save: #{Time.now-time}"
        time = Time.now
        feature_dataset.save_all

        $logger.debug "Save: #{Time.now-time}"
        feature_dataset

      end
    end
  end
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
module OpenTox
|
|
2
|
+
module Algorithm
|
|
3
|
+
|
|
4
|
+
class Classification
|
|
5
|
+
|
|
6
|
+
# Binary classification by similarity-weighted majority vote.
#
# Each neighbor row is read as `[neighbor, similarity, activities]`; the
# similarity is added to the running total of every activity listed for that
# neighbor, and the class with the larger total wins.
#
# @param [Array<Array>] neighbors Rows of `[neighbor, similarity, activities]`
# @return [Hash] `:value` and `:confidence` (absolute weight difference divided
#   by the number of neighbors); both nil plus a `:warning` when there are no neighbors
def self.weighted_majority_vote neighbors
  return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
  weighted_sum = Hash.new(0)
  neighbors.each do |row|
    _neighbor,sim,acts = row
    acts.each { |act| weighted_sum[act] += sim }
  end
  case weighted_sum.size
  when 1
    # all neighbors agree
    return {:value => weighted_sum.keys.first, :confidence => (weighted_sum.values.first/neighbors.size).abs}
  when 2
    first_class, second_class = weighted_sum.keys
    sim_sum = weighted_sum[first_class] - weighted_sum[second_class]
    prediction = sim_sum > 0 ? first_class : second_class
    confidence = (sim_sum/neighbors.size).abs
    return {:value => prediction,:confidence => confidence}
  else
    bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
  end
end
|
|
30
|
+
|
|
31
|
+
# Classification with majority vote from neighbors weighted by similarity
#
# Each neighbor row is read as `[_, weight, activity]`. Distinct activities are
# numbered 1..n in order of first appearance.
#   - one distinct activity: that activity is predicted
#   - two distinct activities: the first counts negative, the second positive;
#     the sign of the weighted sum decides (ties go to the second)
#   - more than two: the weighted mean of the activity numbers is rounded and
#     returned. NOTE(review): this branch returns the activity *number*, not
#     the activity value itself - confirm that callers expect this.
#
# @param [Array<Array>] neighbors Rows of `[_, weight, activity]`
# @param training_dataset Not used by the vote; kept for interface compatibility
# @return [Hash] `:value` and `:confidence` (|sum of signed weights| / number of
#   neighbors); both nil plus a `:warning` when there are no neighbors
def self.fminer_weighted_majority_vote neighbors, training_dataset

  # Without this guard the multinomial branch would divide 0.0 by 0.0 and
  # raise FloatDomainError on round.
  return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?

  neighbor_contribution = 0.0
  confidence_sum = 0.0

  $logger.debug "Weighted Majority Vote Classification."

  values = neighbors.collect{|n| n[2]}.uniq
  binary = values.size == 2
  neighbors.each do |neighbor|
    neighbor_weight = neighbor[1]
    activity = values.index(neighbor[2]) + 1 # map values to integers >= 1
    neighbor_contribution += activity * neighbor_weight
    if binary && activity == 1 # AM: provide compat to binary classification: 1=>false 2=>true
      confidence_sum -= neighbor_weight
    else
      confidence_sum += neighbor_weight
    end
  end

  prediction =
    case values.size
    when 1 then values[0] # all neighbors have the same value
    when 2 then confidence_sum >= 0.0 ? values[1] : values[0]
    else (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
    end

  {:value => prediction, :confidence => (confidence_sum/neighbors.size).abs}
end
|
|
73
|
+
|
|
74
|
+
# Local support vector classification from neighbors
#
# Activities are prefixed with "Val" before they are handed to `local_svm_prop`
# (so that R treats them as class labels) and the prefix is stripped from the
# returned prediction again.
#
# @param [Hash] params Keys `:activities, :sims, :min_train_performance` are required,
#   `:props` (pair of neighbor and query properties) is optional
# @return [Hash] `:value` (prediction or nil) and `:confidence`; the confidence
#   is 0.0 whenever no prediction could be made
def self.local_svm_classification(params)

  confidence = 0.0
  prediction = nil

  $logger.debug "Local SVM."
  if params[:activities].size>0
    props = nil # params[:props].nil? signals non-prop setting
    if params[:props]
      n_prop = params[:props][0].collect.to_a
      q_prop = params[:props][1].collect.to_a
      props = [ n_prop, q_prop ]
    end
    activities = params[:activities].collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
    prediction = local_svm_prop( props, activities, params[:min_train_performance])
    prediction = prediction.sub(/Val/,"") if prediction # Convert back
    # A failed prediction keeps confidence 0.0; previously that value was
    # unconditionally overwritten by get_confidence.
    confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) unless prediction.nil?
  end
  {:value => prediction, :confidence => confidence}

end
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
data/lib/compound.rb
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# TODO: check
|
|
2
|
+
# *** Open Babel Error in ParseFile
|
|
3
|
+
# Could not find contribution data file.
|
|
4
|
+
|
|
5
|
+
CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
|
|
6
|
+
|
|
7
|
+
module OpenTox
|
|
8
|
+
|
|
9
|
+
class Compound
|
|
10
|
+
include OpenTox
|
|
11
|
+
|
|
12
|
+
field :inchi, type: String
|
|
13
|
+
field :smiles, type: String
|
|
14
|
+
field :inchikey, type: String
|
|
15
|
+
field :names, type: Array
|
|
16
|
+
field :warning, type: String
|
|
17
|
+
field :cid, type: String
|
|
18
|
+
field :chemblid, type: String
|
|
19
|
+
field :png_id, type: BSON::ObjectId
|
|
20
|
+
field :svg_id, type: BSON::ObjectId
|
|
21
|
+
field :sdf_id, type: BSON::ObjectId
|
|
22
|
+
field :fp4, type: Array
|
|
23
|
+
field :fp4_size, type: Integer
|
|
24
|
+
|
|
25
|
+
# Replacement for the standard Mongoid finder: the record is looked up or
# initialized, gets its fingerprint computed if it has none yet, and is saved.
#
# The fingerprint (`fp4`) is the list of ids of all FingerprintSmarts patterns
# with at least one match in the compound; `fp4_size` is the length of that list.
#
# @param [Hash] params Attributes passed to find_or_initialize_by
# @return The saved compound
def self.find_or_create_by params
  compound = self.find_or_initialize_by params
  if !compound.fp4 or compound.fp4.empty?
    compound.fp4_size = 0
    compound.fp4 = []
    patterns = FingerprintSmarts.fingerprint
    hit_counts = Algorithm::Descriptor.smarts_match(compound, patterns)
    hit_counts.each_with_index do |hits,position|
      next unless hits > 0
      compound.fp4 << patterns[position].id
      compound.fp4_size += 1
    end
  end
  compound.save
  compound
end
|
|
42
|
+
|
|
43
|
+
# Create a compound from smiles string
#
# The input is canonicalized first. If the conversion yields an empty string a
# compound that only carries a warning (quoting the original input) is
# returned instead.
# @example
#   compound = OpenTox::Compound.from_smiles("c1ccccc1")
# @param [String] smiles Smiles string
# @return [OpenTox::Compound] Compound
def self.from_smiles smiles
  # Keep the caller's string untouched so that a failure can be reported with
  # the input that actually failed (it used to be overwritten by the empty
  # conversion result).
  canonical = obconversion(smiles,"smi","can")
  if canonical.empty?
    Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
  else
    # NOTE(review): the second canonicalization looks redundant; it is kept so
    # that stored SMILES stay identical to those written by earlier versions.
    Compound.find_or_create_by :smiles => obconversion(canonical,"smi","can")
  end
end
|
|
56
|
+
|
|
57
|
+
# Create a compound from inchi string
#
# The InChI is converted to canonical SMILES; compound lookup/creation uses
# both identifiers. An empty conversion result produces a compound that only
# carries a warning.
# @param inchi [String] InChI string
# @return [OpenTox::Compound] Compound
def self.from_inchi inchi
  # Conversion goes through the OpenBabel bindings; see
  # http://sourceforge.net/p/openbabel/bugs/957/ for the InChI bug that a
  # shell-based workaround was once considered for.
  canonical = obconversion(inchi,"inchi","can")
  unless canonical.empty?
    return Compound.find_or_create_by(:smiles => canonical, :inchi => inchi)
  end
  Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
end
|
|
72
|
+
|
|
73
|
+
# Create a compound from sdf string
#
# Only the canonical SMILES derived from the SDF is used; the SDF itself is
# not stored because it might be 2D.
# @param sdf [String] SDF string
# @return [OpenTox::Compound] Compound
def self.from_sdf sdf
  canonical_smiles = obconversion(sdf,"sdf","can")
  Compound.from_smiles(canonical_smiles)
end
|
|
80
|
+
|
|
81
|
+
# Create a compound from name. Relies on an external service for name lookups.
#
# The escaped name is appended to CACTUS_URI and the service's "smiles"
# representation is fetched and passed to from_smiles.
# @example
#   compound = OpenTox::Compound.from_name("Benzene")
# @param name [String] can be also an InChI/InChiKey, CAS number, etc
# @return [OpenTox::Compound] Compound
def self.from_name name
  # URI.escape was removed in Ruby 3.0; the default parser's #escape performs
  # the same escaping and is available on old and new Rubies alike.
  lookup_uri = File.join(CACTUS_URI, URI::DEFAULT_PARSER.escape(name), "smiles")
  Compound.from_smiles RestClientWrapper.get(lookup_uri)
end
|
|
89
|
+
|
|
90
|
+
# Get InChI
#
# Returns the stored value if there is one. Otherwise the InChI is derived
# from the SMILES and stored, unless the conversion result is empty.
# @return [String] InChI string
def inchi
  stored = self["inchi"]
  return stored if stored

  converted = obconversion(smiles,"smi","inchi")
  update(:inchi => converted.chomp) unless converted.empty?
  self["inchi"]
end
|
|
101
|
+
|
|
102
|
+
# Get InChIKey
#
# Derived from the SMILES and stored on first access; the stored value is
# returned afterwards.
# @return [String] InChIKey string
def inchikey
  if !self["inchikey"]
    key = obconversion(smiles,"smi","inchikey")
    update(:inchikey => key)
  end
  self["inchikey"]
end
|
|
108
|
+
|
|
109
|
+
# Get (canonical) smiles
#
# Returns the stored SMILES. If none is stored yet it is derived from the
# stored InChI (when present) and saved; previously the missing SMILES itself
# (nil) was fed to the converter, which can never produce a result.
# @return [String, nil] Smiles string, nil if it is neither stored nor derivable
def smiles
  unless self["smiles"]
    source_inchi = self["inchi"]
    if source_inchi
      converted = obconversion(source_inchi,"inchi","can")
      # do not persist an empty (failed) conversion
      update(:smiles => converted) unless converted.empty?
    end
  end
  self["smiles"]
end
|
|
115
|
+
|
|
116
|
+
# Get sdf
#
# On first access the SDF is generated from the SMILES, written to GridFS as
# "<id>.sdf" and its GridFS id is stored in `sdf_id`. The data is always read
# back from GridFS.
# @return [String] SDF string
def sdf
  if self.sdf_id.nil?
    molfile = obconversion(smiles,"smi","sdf")
    grid_file = Mongo::Grid::File.new(molfile, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
    update(:sdf_id => $gridfs.insert_one(grid_file))
  end
  stored = $gridfs.find_one(_id: self.sdf_id)
  stored.data
end
|
|
127
|
+
|
|
128
|
+
# Get SVG image
#
# On first access the image is generated from the SMILES, written to GridFS as
# "<id>.svg" and its GridFS id is stored in `svg_id`. The data is always read
# back from GridFS.
# @return [image/svg] Image data
def svg
  if self.svg_id.nil?
    svg = obconversion(smiles,"smi","svg")
    file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
    # The id must go into svg_id (the field read below); it used to be written
    # to :image_id, so the lookup never found the file.
    update(:svg_id => $gridfs.insert_one(file))
  end
  $gridfs.find_one(_id: self.svg_id).data

end
|
|
139
|
+
|
|
140
|
+
# Get png image
#
# On first access the image is generated from the SMILES, Base64-encoded,
# written to GridFS as "<id>.png" and its GridFS id is stored in `png_id`.
# The stored data is Base64-decoded on every read.
# @example
#   image = compound.png
# @return [image/png] Image data
def png
  if self.png_id.nil?
    raw_image = obconversion(smiles,"smi","_png2")
    encoded = Base64.encode64(raw_image)
    grid_file = Mongo::Grid::File.new(encoded, :filename => "#{id}.png", :content_type => "image/png")
    update(:png_id => $gridfs.insert_one(grid_file))
  end
  stored = $gridfs.find_one(_id: self.png_id)
  Base64.decode64(stored.data)
end
|
|
153
|
+
|
|
154
|
+
# Get all known compound names. Relies on an external service for name lookups.
#
# On first access the names are requested from "<CACTUS_URI><inchi>/names",
# split at newlines and stored; the stored list is returned afterwards.
# @example
#   names = compound.names
# @return [Array<String>] Compound names
def names
  if !self["names"]
    listing = RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names")
    update(:names => listing.split("\n"))
  end
  self["names"]
end
|
|
162
|
+
|
|
163
|
+
# PubChem Compound Identifier (CID).
#
# On first access the InChI is posted to the PubChem PUG REST service and the
# stripped response is stored; the stored value is returned afterwards.
# @return [String] PubChem Compound Identifier (CID), derived via REST call to PubChem
def cid
  pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
  if !self["cid"]
    endpoint = File.join(pug_uri, "compound", "inchi", "cids", "TXT")
    response = RestClientWrapper.post(endpoint,{:inchi => inchi})
    update(:cid => response.strip)
  end
  self["cid"]
end
|
|
169
|
+
|
|
170
|
+
# ChEMBL database compound id.
#
# On first access the ChEMBL web service is queried by SMILES and the
# "chemblId" of the first entry under "compounds" in the JSON response is
# stored; the stored value is returned afterwards.
# See https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
# @return [String] ChEMBL database compound id, derived via REST call to ChEMBL
def chemblid
  uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
  if !self["chemblid"]
    response = JSON.parse(RestClientWrapper.get(uri))
    first_hit = response["compounds"].first
    update(:chemblid => first_hit["chemblId"])
  end
  self["chemblid"]
end
|
|
177
|
+
|
|
178
|
+
# Find similar compounds with a MongoDB aggregation over the "compounds"
# collection (approach from
# http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb).
#
# For every other compound the pipeline computes
#   tanimoto = common / (own fp4 size + other fp4_size - common)
# where `common` is the size of the intersection of both fp4 arrays, keeps the
# compounds at or above the threshold and sorts them by descending similarity.
#
# @param [Numeric] threshold Minimum Tanimoto similarity
# @return [Array<Array>] `[compound id, tanimoto]` pairs, most similar first
def neighbors threshold=0.7
  query_size = fp4.size
  # Pre-filtering by fingerprint size / required bits (as in the blog post) is
  # not done: it would require keeping feature counts up to date on compound
  # additions and deletions.
  exclude_self = {'$match' => {'_id' => {'$ne' => self.id}}}
  shared_bits = {'$size' => {'$setIntersection' => ['$fp4', fp4]}}
  union_size = {'$subtract' => [{'$add' => [query_size, '$fp4_size']}, '$$common']}
  similarity = {'$project' => {
    'tanimoto' => {'$let' => {
      'vars' => {'common' => shared_bits},
      'in' => {'$divide' => ['$$common', union_size]}
    }},
    '_id' => 1
  }}
  above_threshold = {'$match' => {'tanimoto' => {'$gte' => threshold}}}
  most_similar_first = {'$sort' => {'tanimoto' => -1}}

  pipeline = [exclude_self, similarity, above_threshold, most_similar_first]
  $mongo["compounds"].aggregate(pipeline).map { |row| [row["_id"], row["tanimoto"]] }
end
|
|
202
|
+
=begin
|
|
203
|
+
=end
|
|
204
|
+
|
|
205
|
+
private
|
|
206
|
+
|
|
207
|
+
# Convert a chemical identifier between formats with the OpenBabel bindings.
#
# - output formats matching smi/can/inchi: all whitespace is removed from the result
# - sdf: hydrogens are added and coordinates are built with OBBuilder; if the
#   result contains ".nan"-like coordinates a 2D fallback is attempted and nil
#   is returned when that fails as well
# - any other format: the raw conversion result
#
# Nothing is written to stdout (the former debug `p`/`print` calls in the SDF
# branch were removed).
#
# @param [String] identifier Input structure
# @param [String] input_format OpenBabel input format id
# @param [String] output_format OpenBabel output format id
# @param [String, nil] option Optional OpenBabel output option
# @return [String, nil] Converted structure
def self.obconversion(identifier,input_format,output_format,option=nil)
  converter = OpenBabel::OBConversion.new
  converter.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
  obmol = OpenBabel::OBMol.new
  converter.set_in_and_out_formats input_format, output_format
  converter.read_string obmol, identifier
  case output_format
  when /smi|can|inchi/
    converter.write_string(obmol).gsub(/\s/,'').chomp
  when /sdf/
    # has no effect
    #converter.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
    # segfaults with openbabel git master
    #OpenBabel::OBOp.find_type("Gen3D").do(obmol)

    # TODO: find disconnected structures
    # strip_salts
    # separate
    obmol.add_hydrogens
    builder = OpenBabel::OBBuilder.new
    builder.build(obmol)

    sdf = converter.write_string(obmol)
    if sdf.match(/.nan/)

      # TODO: fix or eliminate 2d generation
      $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
      converter.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
      #OpenBabel::OBOp.find_type("Gen2D").do(obmol)
      sdf = converter.write_string(obmol)
      if sdf.match(/.nan/)
        $logger.warn "2D generation failed for compound #{identifier}"
        sdf = nil
      end
    end
    sdf
  else
    converter.write_string(obmol)
  end
end
|
|
249
|
+
|
|
250
|
+
# Instance-level shortcut for the class-level converter.
# The option is forwarded unchanged; it used to be reset to nil in the call
# (`option=nil`), which silently dropped it.
def obconversion(identifier,input_format,output_format,option=nil)
  self.class.obconversion(identifier,input_format,output_format,option)
end
|
|
253
|
+
end
|
|
254
|
+
end
|