lazar 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/algorithm.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module OpenTox

  module Algorithm

    # Generic method to execute algorithms.
    #
    # Algorithms should:
    #   - accept a Compound, an Array of Compounds or a Dataset as first argument
    #   - accept optional parameters as second argument
    #   - return an object corresponding to the input type as result
    #     (e.g. Compound -> value, Array of Compounds -> Array of values,
    #     Dataset -> Dataset with values)
    #
    # The algorithm is given as a "Class.method" string, e.g.
    # "OpenTox::Algorithm::Fminer.bbrc". The class part is resolved with
    # Object.const_get and the method part is invoked on it.
    #
    # @param [String] algorithm "Class.method" string; the class name has to start with OpenTox::Algorithm
    # @param [OpenTox::Compound,Array,OpenTox::Dataset] object Input object
    # @param [Hash] parameters Algorithm parameters (omitted from the call when nil)
    # @return Algorithm result
    def self.run algorithm, object, parameters=nil
      klass, method = algorithm.to_s.split('.')
      # \A anchors at the start of the whole string (^ would also match after a newline);
      # a missing method part is rejected here instead of failing later in the dispatch
      unless klass.to_s =~ /\AOpenTox::Algorithm/ and method and !method.empty?
        bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm."
      end
      receiver = Object.const_get(klass)
      # public_send: never dispatch to private methods (Kernel#system, #exit, ...)
      # that every class inherits
      if parameters.nil?
        receiver.public_send(method, object)
      else
        receiver.public_send(method, object, parameters)
      end
    end

  end
end
|
21
|
+
|
data/lib/bbrc.rb
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
module OpenTox
  module Algorithm
    class Fminer
      # Element symbols ordered by atomic number (index 0 is hydrogen).
      # Used to translate the "[#<atomic number>&<a|A>]" atoms of fminer SMARTS.
      TABLE_OF_ELEMENTS = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"].freeze

      #
      # Run bbrc algorithm on dataset
      #
      # Mines substructure features with the Bbrc library and stores them,
      # together with a compound x feature matrix, in a new FminerDataset.
      #
      # @param [OpenTox::Dataset] training_dataset training dataset with exactly one (prediction) feature
      # @param [optional] params BBRC parameters, accepted parameters are
      #   - min_frequency  Minimum frequency (default: 8 per mil of the labeled compounds, but at least 2)
      #   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
      #   - backbone BBRC classes, pass 'false' (String or boolean) to switch off mining for BBRC representatives. (default "true")
      #   - min_chisq_significance Significance threshold (between 0 and 1)
      #   - nr_hits Set to "true" to get hit count instead of presence
      #   - get_target Set to "true" to obtain target variable as feature (NOTE(review): not evaluated anywhere in this method)
      # @return [OpenTox::Dataset] Fminer Dataset
      def self.bbrc training_dataset, params={}

        time = Time.now
        bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1

        prediction_feature = training_dataset.features.first
        if params[:min_frequency]
          minfreq = params[:min_frequency]
        else
          per_mil = 8 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
          i = training_dataset.feature_ids.index prediction_feature.id
          nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
          minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0
          minfreq = 2 unless minfreq > 2
          minfreq = minfreq.round
        end

        # Both the String "false" and the boolean false switch the backbone off, so that
        # the miner setting and the recorded training parameters always agree.
        backbone = params[:backbone].to_s != "false"
        nr_hits = params[:nr_hits] || false

        @bbrc ||= Bbrc::Bbrc.new
        @bbrc.Reset
        value2act = nil # stays nil for regression datasets
        if prediction_feature.numeric
          @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
        else
          bad_request_error "No accept values for "\
            "dataset '#{training_dataset.id}' and "\
            "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
          # map each accepted value to its position in accept_values
          value2act = Hash[prediction_feature.accept_values.each_with_index.to_a]
        end
        @bbrc.SetMinfreq(minfreq)
        @bbrc.SetType(1) if params[:feature_type] == "paths"
        @bbrc.SetBackbone(false) unless backbone
        @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
        @bbrc.SetConsoleOut(false)

        feature_dataset = FminerDataset.new(
          :training_dataset_id => training_dataset.id,
          :training_algorithm => "#{self.to_s}.bbrc",
          :training_feature_id => prediction_feature.id ,
          :training_parameters => {
            :min_frequency => minfreq,
            :nr_hits => nr_hits,
            :backbone => backbone
          }

        )
        feature_dataset.compounds = training_dataset.compounds

        # add data (fminer compound ids are 1-based)
        training_dataset.compounds.each_with_index do |compound,i|
          value = training_dataset.data_entries[i].first
          # Classification: index of the value within accept_values.
          # Regression: the measured value itself.
          # NOTE(review): assumes Bbrc#AddActivity takes the numeric activity in regression mode - confirm
          act = value2act ? value2act[value] : (value && value.to_f)
          if act # TODO check if this works
            @bbrc.AddCompound(compound.smiles,i+1)
            @bbrc.AddActivity(act,i+1)
          end
        end
        #g_median=@fminer.all_activities.values.to_scale.median

        #task.progress 10
        #step_width = 80 / @bbrc.GetNoRootNodes().to_f

        $logger.debug "BBRC setup: #{Time.now-time}"
        time = Time.now
        # accumulated timings for the debug log
        ftime = 0 # find/create features
        itime = 0 # iterate over hits
        rtime = 0 # read/parse results

        # run @bbrc
        (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
          results = @bbrc.MineRoot(j)
          results.each do |result|
            rt = Time.now
            # each result is a YAML document; its first element is
            # [smarts, p_value, hits...] (hits are {compound id => count} hashes after flattening)
            f = YAML.load(result)[0]
            smarts = f.shift
            # convert fminer SMARTS representation into a more human readable format
            smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
              element = TABLE_OF_ELEMENTS[$1.to_i-1]
              $2 == "a" ? element.downcase : element # "a" marks aromatic atoms
            end
            p_value = f.shift
            f.flatten!
            compound_idxs = f.collect{|e| e.first.first-1}
            # majority class
            effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode

=begin
            if (!@bbrc.GetRegression)
              id_arrs = f[2..-1].flatten
              max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
              effect = max+1
            else #regression part
              id_arrs = f[2]
              # DV: effect calculation
              f_arr=Array.new
              f[2].each do |id|
                id=id.keys[0] # extract id from hit count hash
                f_arr.push(@fminer.all_activities[id])
              end
              f_median=f_arr.to_scale.median
              if g_median >= f_median
                effect = 'activating'
              else
                effect = 'deactivating'
              end
            end
=end
            rtime += Time.now - rt

            ft = Time.now
            feature = OpenTox::FminerSmarts.find_or_create_by({
              "smarts" => smarts,
              "p_value" => p_value.to_f.abs.round(5),
              "effect" => effect,
              "dataset_id" => feature_dataset.id
            })
            feature_dataset.feature_ids << feature.id
            ftime += Time.now - ft

            it = Time.now
            f.each do |id_count_hash|
              id_count_hash.each do |id,count|
                # store the hit count only when requested, otherwise mere presence
                count = nr_hits ? count.to_i : 1
                feature_dataset.data_entries[id-1] ||= []
                feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count
              end
            end
            itime += Time.now - it

          end
        end

        $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
        time = Time.now

        feature_dataset.fill_nil_with 0

        $logger.debug "Prepare save: #{Time.now-time}"
        time = Time.now
        feature_dataset.save_all

        $logger.debug "Save: #{Time.now-time}"
        feature_dataset

      end
    end
  end
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module OpenTox
  module Algorithm

    class Classification

      # Similarity weighted majority vote for at most two classes.
      #
      # Every neighbor adds its similarity to the sum of each of its activities.
      # The class with the larger sum is predicted; the confidence is the absolute
      # difference of the sums divided by the number of neighbors.
      #
      # @param [Array] neighbors Rows of the form [neighbor, similarity, activities],
      #   where activities is an Array of class values
      # @return [Hash] :value and :confidence (and :warning without neighbors)
      def self.weighted_majority_vote neighbors
        return {:value => nil,:confidence => nil,:warning => "Could not find similar compounds."} if neighbors.empty?
        weighted_sum = {}
        neighbors.each do |row|
          _, sim, acts = row
          acts.each do |act|
            weighted_sum[act] ||= 0
            weighted_sum[act] += sim
          end
        end
        classes = weighted_sum.keys
        case classes.size
        when 1
          # abs applies to the quotient, as in the two-class case
          return {:value => classes.first, :confidence => (weighted_sum.values.first/neighbors.size).abs}
        when 2
          sim_sum = weighted_sum[classes[0]] - weighted_sum[classes[1]]
          # a tie goes to the second class
          prediction = sim_sum > 0 ? classes[0] : classes[1]
          confidence = (sim_sum/neighbors.size).abs
          return {:value => prediction,:confidence => confidence}
        else
          bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
        end
      end

      # Classification with majority vote from neighbors weighted by similarity
      #
      # Class values are mapped to integers (1, 2, ...) in order of first occurrence
      # among the neighbors.
      #
      # @param [Array] neighbors Rows of the form [neighbor, similarity, activity]
      # @param [OpenTox::Dataset] training_dataset Currently unused, kept for interface compatibility
      # @return [Hash] :value and :confidence (and :warning without neighbors)
      def self.fminer_weighted_majority_vote neighbors, training_dataset

        # without neighbors the multinomial branch would divide 0.0 by 0.0
        return {:value => nil,:confidence => nil,:warning => "Could not find similar compounds."} if neighbors.empty?

        neighbor_contribution = 0.0
        confidence_sum = 0.0

        $logger.debug "Weighted Majority Vote Classification."

        values = neighbors.collect{|n| n[2]}.uniq
        neighbors.each do |neighbor|
          neighbor_weight = neighbor[1]
          activity = values.index(neighbor[2]) + 1 # map values to integers >= 1
          neighbor_contribution += activity * neighbor_weight
          if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
            case activity
            when 1
              confidence_sum -= neighbor_weight
            when 2
              confidence_sum += neighbor_weight
            end
          else
            confidence_sum += neighbor_weight
          end
        end

        prediction =
          case values.size
          when 2
            confidence_sum >= 0.0 ? values[1] : values[0]
          when 1 # all neighbors have the same value
            values[0]
          else
            # AM: new multinomial prediction
            # NOTE(review): this yields the rounded integer code, not the entry of values - confirm intended
            (neighbor_contribution/confidence_sum).round
          end

        confidence = (confidence_sum/neighbors.size).abs
        {:value => prediction, :confidence => confidence}
      end

      # Local support vector classification from neighbors
      #
      # Activities are prefixed with "Val" before they are handed to local_svm_prop
      # (so that R treats them as classes) and the prefix is stripped from the result.
      #
      # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
      # @return [Hash] :value and :confidence
      def self.local_svm_classification(params)

        confidence = 0.0
        prediction = nil

        $logger.debug "Local SVM."
        if params[:activities].size>0
          props = nil # params[:props].nil? signals non-prop setting
          if params[:props]
            n_prop = params[:props][0].to_a
            q_prop = params[:props][1].to_a
            props = [ n_prop, q_prop ]
          end
          activities = params[:activities].to_a.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
          # local_svm_prop and get_confidence are not defined in this file
          prediction = local_svm_prop( props, activities, params[:min_train_performance])
          prediction = prediction.sub(/Val/,"") if prediction # Convert back
          #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
          # NOTE(review): the original reset the confidence to 0.0 for a nil prediction and then
          # unconditionally overwrote it with get_confidence; the overwrite is kept - confirm intended
          confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
        end
        {:value => prediction, :confidence => confidence}

      end

    end

  end
end
|
107
|
+
|
data/lib/compound.rb
ADDED
@@ -0,0 +1,254 @@
|
|
1
|
+
# TODO: check
|
2
|
+
# *** Open Babel Error in ParseFile
|
3
|
+
# Could not find contribution data file.
|
4
|
+
|
5
|
+
CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
|
6
|
+
|
7
|
+
module OpenTox

  # Chemical compound stored in the database.
  #
  # Identifiers (InChI, InChIKey, names, CID, ChEMBL id) and renderings (SDF, SVG, PNG)
  # are calculated or looked up on first access and cached in the document / in GridFS.
  class Compound
    include OpenTox

    field :inchi, type: String
    field :smiles, type: String
    field :inchikey, type: String
    field :names, type: Array
    field :warning, type: String
    field :cid, type: String
    field :chemblid, type: String
    field :png_id, type: BSON::ObjectId
    field :svg_id, type: BSON::ObjectId
    field :sdf_id, type: BSON::ObjectId
    field :fp4, type: Array       # ids of the matching fingerprint SMARTS
    field :fp4_size, type: Integer # number of entries in fp4

    # Overwrites standard Mongoid method to create fingerprints before database insertion
    # @param [Hash] params Attributes to find or initialize the compound with
    # @return [OpenTox::Compound] Saved compound
    def self.find_or_create_by params
      compound = self.find_or_initialize_by params
      unless compound.fp4 and !compound.fp4.empty?
        compound.fp4_size = 0
        compound.fp4 = []
        fingerprint = FingerprintSmarts.fingerprint
        Algorithm::Descriptor.smarts_match(compound, fingerprint).each_with_index do |m,i|
          if m > 0
            compound.fp4 << fingerprint[i].id
            compound.fp4_size += 1
          end
        end
      end
      compound.save
      compound
    end

    # Create a compound from smiles string
    # @example
    #   compound = OpenTox::Compound.from_smiles("c1ccccc1")
    # @param [String] smiles Smiles string
    # @return [OpenTox::Compound] Compound (with a warning instead of a structure if parsing failed)
    def self.from_smiles smiles
      # keep the input untouched, it is needed for the warning message
      canonical = obconversion(smiles,"smi","can")
      if canonical.empty?
        Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
      else
        Compound.find_or_create_by :smiles => canonical
      end
    end

    # Create a compound from inchi string
    # @param inchi [String] InChI string
    # @return [OpenTox::Compound] Compound (with a warning instead of a structure if parsing failed)
    def self.from_inchi inchi
      # Temporary workaround for OpenBabels Inchi bug
      # http://sourceforge.net/p/openbabel/bugs/957/
      # bug has not been fixed in latest git/development version
      #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
      smiles = obconversion(inchi,"inchi","can")
      if smiles.empty?
        Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
      else
        Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
      end
    end

    # Create a compound from sdf string
    # @param sdf [String] SDF string
    # @return [OpenTox::Compound] Compound
    def self.from_sdf sdf
      # do not store sdf because it might be 2D
      Compound.from_smiles obconversion(sdf,"sdf","can")
    end

    # Create a compound from name. Relies on an external service for name lookups.
    # @example
    #   compound = OpenTox::Compound.from_name("Benzene")
    # @param name [String] can be also an InChI/InChiKey, CAS number, etc
    # @return [OpenTox::Compound] Compound
    def self.from_name name
      Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles"))
    end

    # Get InChI (calculated from smiles and stored on first access)
    # @return [String] InChI string
    def inchi
      unless self["inchi"]

        result = obconversion(smiles,"smi","inchi")
        #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
        update(:inchi => result.chomp) unless result.empty?
      end
      self["inchi"]
    end

    # Get InChIKey (calculated from smiles and stored on first access)
    # @return [String] InChIKey string
    def inchikey
      update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"]
      self["inchikey"]
    end

    # Get (canonical) smiles
    #
    # A missing smiles cannot be canonicalized from itself (the original code handed
    # nil to OpenBabel), so it is derived from a stored InChI when there is one.
    # @return [String, nil] Smiles string, nil if neither smiles nor InChI are stored
    def smiles
      if self["smiles"].nil? and self["inchi"]
        converted = obconversion(self["inchi"],"inchi","can")
        update(:smiles => converted) unless converted.empty?
      end
      self["smiles"]
    end

    # Get sdf (generated and stored in GridFS on first access)
    # @return [String] SDF string
    def sdf
      if self.sdf_id.nil?
        structure = obconversion(smiles,"smi","sdf")
        file = Mongo::Grid::File.new(structure, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
        update :sdf_id => $gridfs.insert_one(file)
      end
      $gridfs.find_one(_id: self.sdf_id).data
    end

    # Get SVG image (generated and stored in GridFS on first access)
    # @return [image/svg] Image data
    def svg
      if self.svg_id.nil?
        image = obconversion(smiles,"smi","svg")
        file = Mongo::Grid::File.new(image, :filename => "#{id}.svg", :content_type => "image/svg")
        # has to be stored as svg_id (not image_id), this is the field read below
        update(:svg_id => $gridfs.insert_one(file))
      end
      $gridfs.find_one(_id: self.svg_id).data

    end

    # Get png image (stored Base64 encoded in GridFS on first access)
    # @example
    #   image = compound.png
    # @return [image/png] Image data
    def png
      if self.png_id.nil?
        image = obconversion(smiles,"smi","_png2")
        file = Mongo::Grid::File.new(Base64.encode64(image), :filename => "#{id}.png", :content_type => "image/png")
        update(:png_id => $gridfs.insert_one(file))
      end
      Base64.decode64($gridfs.find_one(_id: self.png_id).data)

    end

    # Get all known compound names. Relies on an external service for name lookups.
    # @example
    #   names = compound.names
    # @return [Array<String>] Compound names
    def names
      update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"]
      self["names"]
    end

    # @return [String] PubChem Compound Identifier (CID), derived via restcall to pubchem
    def cid
      unless self["cid"]
        pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
        update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip)
      end
      self["cid"]
    end

    # @return [String] ChEMBL database compound id, derived via restcall to chembl
    def chemblid
      unless self["chemblid"]
        # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
        uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
        update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"])
      end
      self["chemblid"]
    end

    # Find similar compounds in the database.
    #
    # The Tanimoto similarity of the fp4 fingerprints is calculated within a
    # MongoDB aggregation: common / (own size + other size - common).
    #
    # @param [Numeric] threshold Minimum Tanimoto similarity
    # @return [Array] [compound id, tanimoto] pairs sorted by decreasing similarity
    def neighbors threshold=0.7
      # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
      qn = fp4.size
      #qmin = qn * threshold
      #qmax = qn / threshold
      #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
      #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
      aggregate = [
        #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
        {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
        {'$project' => {
          'tanimoto' => {'$let' => {
            'vars' => {'common' => {'$size' => {'$setIntersection' => ['$fp4', fp4]}}},
            'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$fp4_size']}, '$$common']}]}
          }},
          '_id' => 1
        }},
        {'$match' =>  {'tanimoto' => {'$gte' => threshold}}},
        {'$sort' => {'tanimoto' => -1}}
      ]

      $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }

    end

    private

    # Convert between chemical formats with OpenBabel.
    #
    # Note: `private` above does not apply to singleton methods, this method
    # is publicly callable as Compound.obconversion.
    #
    # @param [String] identifier Structure in input_format
    # @param [String] input_format OpenBabel input format (e.g. "smi", "inchi", "sdf")
    # @param [String] output_format OpenBabel output format (e.g. "can", "inchi", "inchikey", "sdf", "svg")
    # @param [String] option Optional OpenBabel output option
    # @return [String, nil] Converted structure; whitespace is removed from smiles/inchi type
    #   results, nil is returned if SDF coordinate generation failed
    def self.obconversion(identifier,input_format,output_format,option=nil)
      obconversion = OpenBabel::OBConversion.new
      obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
      obmol = OpenBabel::OBMol.new
      obconversion.set_in_and_out_formats input_format, output_format
      obconversion.read_string obmol, identifier
      case output_format
      when /smi|can|inchi/ # also matches "inchikey"
        obconversion.write_string(obmol).gsub(/\s/,'').chomp
      when /sdf/
        # has no effect
        #obconversion.add_option("gen3D", OpenBabel::OBConversion::GENOPTIONS)
        # segfaults with openbabel git master
        #OpenBabel::OBOp.find_type("Gen3D").do(obmol)

        # TODO: find disconnected structures
        # strip_salts
        # separate
        obmol.add_hydrogens
        builder = OpenBabel::OBBuilder.new
        builder.build(obmol)

        sdf = obconversion.write_string(obmol)
        # "nan" coordinates indicate that the structure generation failed
        if sdf.match(/.nan/)

          # TODO: fix or eliminate 2d generation
          $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
          obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
          #OpenBabel::OBOp.find_type("Gen2D").do(obmol)
          sdf = obconversion.write_string(obmol)
          if sdf.match(/.nan/)
            $logger.warn "2D generation failed for compound #{identifier}"
            sdf = nil
          end
        end
        sdf
      else
        obconversion.write_string(obmol)
      end
    end

    # Instance level shortcut for Compound.obconversion
    def obconversion(identifier,input_format,output_format,option=nil)
      # forward the option (the original call always reset it to nil)
      self.class.obconversion(identifier,input_format,output_format,option)
    end
  end
end
|