lazar 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/physchem.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
# Feature for physico-chemical descriptors
|
4
|
+
class PhysChem < NumericFeature
|
5
|
+
|
6
|
+
field :library, type: String
|
7
|
+
field :descriptor, type: String
|
8
|
+
field :description, type: String
|
9
|
+
|
10
|
+
JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
|
11
|
+
CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
|
12
|
+
JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
|
13
|
+
LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
|
14
|
+
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
|
15
|
+
|
16
|
+
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
|
17
|
+
OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
|
18
|
+
name,description = d.split(/\s+/,2)
|
19
|
+
["Openbabel."+name,description] unless obexclude.include? name
|
20
|
+
end.compact.sort{|a,b| a[0] <=> b[0]}]
|
21
|
+
|
22
|
+
cdkdescriptors = {}
|
23
|
+
CDK_DESCRIPTIONS = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
|
24
|
+
CDK_DESCRIPTIONS.each do |d|
|
25
|
+
prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
|
26
|
+
d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
|
27
|
+
end
|
28
|
+
CDKDESCRIPTORS = cdkdescriptors
|
29
|
+
|
30
|
+
# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
|
31
|
+
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
|
32
|
+
# strip Joelib messages from stdout
|
33
|
+
JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
|
34
|
+
name = d[:java_class].sub(/^joelib2.feature.types./,'')
|
35
|
+
["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
|
36
|
+
end.compact.sort{|a,b| a[0] <=> b[0]}]
|
37
|
+
|
38
|
+
DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
|
39
|
+
|
40
|
+
require_relative "unique_descriptors.rb"
|
41
|
+
|
42
|
+
# Find or create one PhysChem feature per entry of a descriptor table.
#
# @param desc [Hash{String => String}] maps "Library.descriptor" names to
#   their descriptions (defaults to DESCRIPTORS)
# @return [Array] the find_or_create_by result for every entry, in the
#   table's iteration order
def self.descriptors desc=DESCRIPTORS
  desc.collect do |name,description|
    # Dedicated locals: the previous code assigned the descriptor suffix to
    # `desc`, overwriting the method argument while it was being iterated.
    library_name, descriptor_name = name.split('.',2)
    self.find_or_create_by(:name => name, :library => library_name, :descriptor => descriptor_name, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
  end
end
|
48
|
+
|
49
|
+
# Find or create the PhysChem features listed in UNIQUEDESCRIPTORS.
#
# A "Cdk.<Class>" entry is expanded into one feature per value name that
# CDK_DESCRIPTIONS reports for that descriptor class ("Cdk.<Class>.<name>");
# every other entry yields exactly one feature.
#
# @return [Array] the find_or_create_by results, in UNIQUEDESCRIPTORS order
def self.unique_descriptors
  UNIQUEDESCRIPTORS.flat_map do |entry|
    library_name, descriptor_name = entry.split('.',2)
    feature_names =
      if library_name == "Cdk"
        # first CDK class whose short name (without "Descriptor") matches
        cdk_info = CDK_DESCRIPTIONS.find do |info|
          descriptor_name == info[:java_class].split('.').last.sub('Descriptor','')
        end
        cdk_info[:names].collect { |value_name| "#{entry}.#{value_name}" }
      else
        [entry]
      end
    feature_names.collect do |feature_name|
      self.find_or_create_by(:name => feature_name, :library => library_name, :descriptor => descriptor_name, :description => DESCRIPTORS[feature_name], :measured => false, :calculated => true, :numeric => true, :nominal => false)
    end
  end
end
|
66
|
+
|
67
|
+
# Features for every entry of OBDESCRIPTORS.
# @return [Array] result of descriptors for the OpenBabel table
def self.openbabel_descriptors
  descriptors(OBDESCRIPTORS)
end
|
70
|
+
|
71
|
+
# Features for every entry of CDKDESCRIPTORS.
# @return [Array] result of descriptors for the CDK table
def self.cdk_descriptors
  descriptors(CDKDESCRIPTORS)
end
|
74
|
+
|
75
|
+
# Features for every entry of JOELIBDESCRIPTORS.
# @return [Array] result of descriptors for the JOELib table
def self.joelib_descriptors
  descriptors(JOELIBDESCRIPTORS)
end
|
78
|
+
|
79
|
+
# Calculate this descriptor for a compound.
#
# Dispatches to the backend method named after the lower-cased `library`
# field (openbabel, cdk or joelib) and picks this feature's value out of the
# hash the backend returns.
#
# @param compound the compound handed on to the backend method
# @return the entry stored under this feature's name in the backend result
# @raise [NoMethodError] if `library` does not name a public backend method
def calculate compound
  # public_send instead of send: `library` is a stored field, so it must not
  # be able to reach private helpers of this class.
  result = public_send library.downcase, descriptor, compound
  result[self.name]
end
|
83
|
+
|
84
|
+
# OpenBabel backend for #calculate.
#
# Reads the compound's SMILES into an OBMol and evaluates the named
# OBDescriptor on it.
#
# @param descriptor [String] OpenBabel descriptor id given to find_type
# @param compound [#smiles] compound to evaluate
# @return [Hash] single entry "<Library>.<descriptor>" => value cleaned by fix_value
def openbabel descriptor, compound
  calculator = OpenBabel::OBDescriptor.find_type descriptor
  molecule = OpenBabel::OBMol.new
  smiles_reader = OpenBabel::OBConversion.new
  smiles_reader.set_in_format 'smi'
  smiles_reader.read_string molecule, compound.smiles
  feature_name = "#{library.capitalize}.#{descriptor}"
  feature_value = fix_value(calculator.predict(molecule))
  {feature_name => feature_value}
end
|
92
|
+
|
93
|
+
# CDK backend for #calculate: forwards to java_descriptor with the "cdk" library key.
def cdk descriptor, compound
  java_descriptor("cdk", descriptor, compound)
end
|
96
|
+
|
97
|
+
# JOELib backend for #calculate: forwards to java_descriptor with the "joelib" library key.
def joelib descriptor, compound
  java_descriptor("joelib", descriptor, compound)
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
# Run one of the bundled Java descriptor programs on a compound.
#
# The compound's SDF is written to a scratch file under /tmp, the Java
# program is run on it, and the first entry of the YAML file found at
# "<sdf path><lib>.yaml" afterwards is returned. Both scratch files are
# removed before returning, also when an error is raised.
#
# @param lib [String] "cdk" or "joelib"
# @param descriptor [String] descriptor name(s) for the Java program;
#   whitespace separates several names, as it did on the former shell command line
# @param compound [#sdf] compound whose SDF is written to the scratch file
# @return the first entry of the YAML result file
# @raise [ArgumentError] if lib is neither "cdk" nor "joelib"
def java_descriptor lib, descriptor, compound

  sdf_3d = "/tmp/#{SecureRandom.uuid}.sdf"
  yaml_file = "#{sdf_3d}#{lib}.yaml"

  # use java system call (rjb blocks within tasks)
  # Argument vector instead of a shell string: nothing in the descriptor
  # name is interpreted by a shell.
  command = case lib
  when "cdk"
    ["java", "-classpath", "#{CDK_JAR}:#{JAVA_DIR}", "CdkDescriptors", sdf_3d]
  when "joelib"
    ["java", "-classpath", "#{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR}", "JoelibDescriptors", sdf_3d]
  else
    # previously this fell through and failed later on the missing YAML file
    raise ArgumentError, "Unknown descriptor library: #{lib.inspect}"
  end
  command += descriptor.to_s.split

  begin
    # use Tempfiles to avoid "Argument list too long" error
    File.open(sdf_3d,"w+"){|f| f.print compound.sdf}
    # stdout is captured and discarded, as with the former backtick call
    IO.popen(command){|io| io.read}
    YAML.load_file(yaml_file).first
  ensure
    # the scratch files used to accumulate in /tmp
    [sdf_3d, yaml_file].each{|path| File.delete(path) if File.exist?(path)}
  end
end
|
120
|
+
|
121
|
+
# Normalise a raw descriptor value.
#
# * a one-element Array is unwrapped
# * nil and the string "NaN" become nil
# * anything Kernel#Float accepts becomes a Float; NaN and +/-Infinity become nil
# * every other value is returned unchanged
#
# NOTE(review): the previous version tested `val.numeric?`, which is not a
# core Ruby method and is not defined in this file; this version assumes it
# meant "convertible by Float()" — confirm against its definition.
#
# @param val raw value from a descriptor library
# @return [Float, nil, Object] cleaned value
def fix_value val
  val = val.first if val.is_a?(Array) && val.size == 1
  return nil if val.nil? || val == "NaN"
  begin
    number = Float(val)
  rescue ArgumentError, TypeError
    # not numeric: hand the value back untouched
    return val
  end
  (number.nan? || number.infinite?) ? nil : number
end
|
130
|
+
|
131
|
+
end
|
132
|
+
|
133
|
+
end
|
data/lib/regression.rb
CHANGED
@@ -1,223 +1,151 @@
|
|
1
|
-
# TODO install R packages kernlab, caret, doMC, class, e1071
|
2
|
-
|
3
|
-
|
4
|
-
# log transform activities (create new dataset)
|
5
|
-
# scale, normalize features, might not be necessary
|
6
|
-
# http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
|
7
|
-
# http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
|
8
|
-
# zero-order correlation and the semi-partial correlation
|
9
|
-
# seems to be necessary for svm
|
10
|
-
# http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
|
11
|
-
# http://stackoverflow.com/questions/15436367/svm-scaling-input-values
|
12
|
-
# use lasso or elastic net??
|
13
|
-
# select relevant features
|
14
|
-
# remove features with a single value
|
15
|
-
# remove correlated features
|
16
|
-
# remove features not correlated with endpoint
|
17
1
|
module OpenTox
|
18
2
|
module Algorithm
|
19
3
|
|
20
4
|
class Regression
|
21
5
|
|
22
|
-
def self.
|
6
|
+
def self.local_weighted_average compound, params
|
23
7
|
weighted_sum = 0.0
|
24
8
|
sim_sum = 0.0
|
9
|
+
neighbors = params[:neighbors]
|
25
10
|
neighbors.each do |row|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
11
|
+
sim = row["tanimoto"]
|
12
|
+
if row["features"][params[:prediction_feature_id].to_s]
|
13
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
14
|
+
weighted_sum += sim*Math.log10(act)
|
15
|
+
sim_sum += sim
|
16
|
+
end
|
30
17
|
end
|
31
18
|
end
|
32
|
-
confidence = sim_sum/neighbors.size.to_f
|
33
19
|
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
|
34
|
-
{:value => prediction
|
20
|
+
{:value => prediction}
|
35
21
|
end
|
36
22
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
=
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
23
|
+
# TODO explicit neighbors, also for physchem
|
24
|
+
def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
|
25
|
+
neighbors = params[:neighbors]
|
26
|
+
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
|
27
|
+
activities = []
|
28
|
+
fingerprints = {}
|
29
|
+
weights = []
|
30
|
+
fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
|
31
|
+
|
32
|
+
neighbors.each_with_index do |row,i|
|
33
|
+
neighbor = Compound.find row["_id"]
|
34
|
+
fingerprint = neighbor.fingerprint
|
35
|
+
if row["features"][params[:prediction_feature_id].to_s]
|
36
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
37
|
+
activities << Math.log10(act)
|
38
|
+
weights << row["tanimoto"]
|
39
|
+
fingerprint_ids.each_with_index do |id,j|
|
40
|
+
fingerprints[id] ||= []
|
41
|
+
fingerprints[id] << fingerprint.include?(id)
|
42
|
+
end
|
43
|
+
end
|
53
44
|
end
|
54
45
|
end
|
55
|
-
=end
|
56
|
-
confidence = sim_sum/neighbors.size.to_f
|
57
|
-
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
|
58
|
-
{:value => prediction,:confidence => confidence}
|
59
|
-
end
|
60
46
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
47
|
+
variables = []
|
48
|
+
data_frame = [activities]
|
49
|
+
fingerprints.each do |k,v|
|
50
|
+
unless v.uniq.size == 1
|
51
|
+
data_frame << v.collect{|m| m ? "T" : "F"}
|
52
|
+
variables << k
|
53
|
+
end
|
54
|
+
end
|
65
55
|
|
66
|
-
|
67
|
-
|
56
|
+
if variables.empty?
|
57
|
+
result = local_weighted_average(compound, params)
|
58
|
+
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
|
59
|
+
return result
|
68
60
|
|
69
|
-
$logger.debug "Local SVM."
|
70
|
-
props = neighbors.collect{|row| row[3] }
|
71
|
-
neighbors.shift
|
72
|
-
activities = neighbors.collect{|n| n[2]}
|
73
|
-
prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
74
|
-
prediction = nil if (!prediction.nil? && prediction.infinite?)
|
75
|
-
$logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."
|
76
|
-
if prediction
|
77
|
-
confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities})
|
78
61
|
else
|
79
|
-
|
62
|
+
compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
|
63
|
+
prediction = r_model_prediction method, data_frame, variables, weights, compound_features
|
64
|
+
if prediction.nil? or prediction[:value].nil?
|
65
|
+
prediction = local_weighted_average(compound, params)
|
66
|
+
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
|
67
|
+
return prediction
|
68
|
+
else
|
69
|
+
prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
|
70
|
+
prediction[:value] = 10**prediction[:value]
|
71
|
+
prediction[:rmse] = 10**prediction[:rmse]
|
72
|
+
prediction
|
73
|
+
end
|
80
74
|
end
|
81
|
-
|
82
|
-
|
75
|
+
|
83
76
|
end
|
84
77
|
|
78
|
+
def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
|
79
|
+
|
80
|
+
neighbors = params[:neighbors]
|
81
|
+
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
|
82
|
+
return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
|
83
|
+
|
84
|
+
activities = []
|
85
|
+
weights = []
|
86
|
+
physchem = {}
|
87
|
+
|
88
|
+
neighbors.each_with_index do |row,i|
|
89
|
+
neighbor = Compound.find row["_id"]
|
90
|
+
if row["features"][params[:prediction_feature_id].to_s]
|
91
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
92
|
+
activities << Math.log10(act)
|
93
|
+
weights << row["tanimoto"] # TODO cosine ?
|
94
|
+
neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
|
95
|
+
physchem[pid] ||= []
|
96
|
+
physchem[pid] << v
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
85
101
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
# @param [Array] activities, activities for neighbors.
|
91
|
-
# @param [Float] min_train_performance, parameter to control censoring
|
92
|
-
# @return [Numeric] A prediction value.
|
93
|
-
def self.local_svm_prop(props, activities, min_train_performance)
|
102
|
+
# remove properties with a single value
|
103
|
+
physchem.each do |pid,v|
|
104
|
+
physchem.delete(pid) if v.uniq.size <= 1
|
105
|
+
end
|
94
106
|
|
95
|
-
|
96
|
-
|
97
|
-
|
107
|
+
if physchem.empty?
|
108
|
+
result = local_weighted_average(compound, params)
|
109
|
+
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
|
110
|
+
return result
|
98
111
|
|
99
|
-
prediction = nil
|
100
|
-
if activities.uniq.size == 1
|
101
|
-
prediction = activities[0]
|
102
112
|
else
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
113
|
+
data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
|
114
|
+
prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
|
115
|
+
if prediction.nil?
|
116
|
+
prediction = local_weighted_average(compound, params)
|
117
|
+
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
|
118
|
+
return prediction
|
119
|
+
else
|
120
|
+
prediction[:value] = 10**prediction[:value]
|
121
|
+
prediction
|
111
122
|
end
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
rs << "set.seed(1)"
|
116
|
-
$logger.debug "Loading R packages: #{Time.now-t}"
|
117
|
-
t = Time.now
|
118
|
-
p n_prop
|
119
|
-
begin
|
120
|
-
|
121
|
-
# set data
|
122
|
-
rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
|
123
|
-
rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
|
124
|
-
rs << "n_prop_x_size <- c(#{n_prop.size})"
|
125
|
-
rs << "n_prop_y_size <- c(#{n_prop[0].size})"
|
126
|
-
rs << "y <- c(#{activities.join(',')})"
|
127
|
-
rs << "q_prop <- c(#{q_prop.join(',')})"
|
128
|
-
rs << "y = matrix(y)"
|
129
|
-
rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
|
130
|
-
rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
|
131
|
-
|
132
|
-
$logger.debug "Setting R data: #{Time.now-t}"
|
133
|
-
t = Time.now
|
134
|
-
# prepare data
|
135
|
-
rs << "
|
136
|
-
weights=NULL
|
137
|
-
if (!(class(y) == 'numeric')) {
|
138
|
-
y = factor(y)
|
139
|
-
weights=unlist(as.list(prop.table(table(y))))
|
140
|
-
weights=(weights-1)^2
|
141
|
-
}
|
142
|
-
"
|
143
|
-
|
144
|
-
rs << "
|
145
|
-
rem = nearZeroVar(prop_matrix)
|
146
|
-
if (length(rem) > 0) {
|
147
|
-
prop_matrix = prop_matrix[,-rem,drop=F]
|
148
|
-
q_prop = q_prop[,-rem,drop=F]
|
149
|
-
}
|
150
|
-
rem = findCorrelation(cor(prop_matrix))
|
151
|
-
if (length(rem) > 0) {
|
152
|
-
prop_matrix = prop_matrix[,-rem,drop=F]
|
153
|
-
q_prop = q_prop[,-rem,drop=F]
|
154
|
-
}
|
155
|
-
"
|
156
|
-
|
157
|
-
#p @r.eval("y").to_ruby
|
158
|
-
#p "weights"
|
159
|
-
#p @r.eval("weights").to_ruby
|
160
|
-
$logger.debug "Preparing R data: #{Time.now-t}"
|
161
|
-
t = Time.now
|
162
|
-
# model + support vectors
|
163
|
-
#train_success = @r.eval <<-EOR
|
164
|
-
rs << '
|
165
|
-
model = train(prop_matrix,y,
|
166
|
-
method="svmRadial",
|
167
|
-
preProcess=c("center", "scale"),
|
168
|
-
class.weights=weights,
|
169
|
-
trControl=trainControl(method="LGOCV",number=10),
|
170
|
-
tuneLength=8
|
171
|
-
)
|
172
|
-
perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
|
173
|
-
'
|
174
|
-
File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
|
175
|
-
p rs.join("\n")
|
176
|
-
p `Rscript /tmp/r.r`
|
177
|
-
=begin
|
178
|
-
@r.void_eval <<-EOR
|
179
|
-
model = train(prop_matrix,y,
|
180
|
-
method="svmRadial",
|
181
|
-
#preProcess=c("center", "scale"),
|
182
|
-
#class.weights=weights,
|
183
|
-
#trControl=trainControl(method="LGOCV",number=10),
|
184
|
-
#tuneLength=8
|
185
|
-
)
|
186
|
-
perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
|
187
|
-
EOR
|
188
|
-
=end
|
189
|
-
|
190
|
-
$logger.debug "Creating R SVM model: #{Time.now-t}"
|
191
|
-
t = Time.now
|
192
|
-
if train_success
|
193
|
-
# prediction
|
194
|
-
@r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
|
195
|
-
#@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
|
196
|
-
@r.eval "if (class(y)!='numeric') p = as.character(p)"
|
197
|
-
prediction = @r.p
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
198
126
|
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
$logger.debug "#{e.class}: #{e.message}"
|
210
|
-
$logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
211
|
-
ensure
|
212
|
-
#puts @r.inspect
|
213
|
-
#TODO: broken pipe
|
214
|
-
#@r.quit # free R
|
215
|
-
end
|
127
|
+
def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
|
128
|
+
R.assign "weights", training_weights
|
129
|
+
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
|
130
|
+
R.eval "data <- #{r_data_frame}"
|
131
|
+
R.assign "features", training_features
|
132
|
+
R.eval "names(data) <- append(c('activities'),features)" #
|
133
|
+
begin
|
134
|
+
R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
|
135
|
+
rescue
|
136
|
+
return nil
|
216
137
|
end
|
217
|
-
|
138
|
+
R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
|
139
|
+
R.eval "names(fingerprint) <- features"
|
140
|
+
R.eval "prediction <- predict(model,fingerprint)"
|
141
|
+
{
|
142
|
+
:value => R.eval("prediction").to_f,
|
143
|
+
:rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
|
144
|
+
:r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
|
145
|
+
}
|
218
146
|
end
|
219
|
-
end
|
220
147
|
|
148
|
+
end
|
221
149
|
end
|
222
150
|
end
|
223
151
|
|
data/lib/rest-client-wrapper.rb
CHANGED
@@ -26,15 +26,14 @@ module OpenTox
|
|
26
26
|
define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|
|
27
27
|
|
28
28
|
# check input
|
29
|
-
bad_request_error "Headers are not a hash: #{headers.inspect}
|
29
|
+
bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash)
|
30
30
|
headers[:subjectid] ||= @@subjectid
|
31
|
-
bad_request_error "Invalid URI: '#{uri}'"
|
32
|
-
#resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri)
|
31
|
+
bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri
|
33
32
|
# make sure that no header parameters are set in the payload
|
34
33
|
[:accept,:content_type,:subjectid].each do |header|
|
35
34
|
if defined? $aa || URI(uri).host == URI($aa[:uri]).host
|
36
35
|
else
|
37
|
-
bad_request_error "#{header} should be submitted in the headers
|
36
|
+
bad_request_error "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header]
|
38
37
|
end
|
39
38
|
end
|
40
39
|
|
@@ -72,7 +71,7 @@ module OpenTox
|
|
72
71
|
msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
|
73
72
|
cause = nil
|
74
73
|
end
|
75
|
-
Object.method(error[:method]).call msg, uri, cause # call error method
|
74
|
+
Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method
|
76
75
|
else
|
77
76
|
response
|
78
77
|
end
|
data/lib/unique_descriptors.rb
CHANGED
@@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [
|
|
12
12
|
"Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
|
13
13
|
"Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
|
14
14
|
"Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
|
15
|
-
"
|
15
|
+
#"Openbabel.L5", #Lipinski Rule of Five # TODO Openbabel.L5 returns nil, investigate!!!
|
16
16
|
"Openbabel.logP", #octanol/water partition coefficient
|
17
17
|
"Openbabel.MP", #Melting point
|
18
18
|
"Openbabel.MR", #molar refractivity
|
@@ -24,7 +24,7 @@ UNIQUEDESCRIPTORS = [
|
|
24
24
|
"Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and
|
25
25
|
"Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
|
26
26
|
"Cdk.AcidicGroupCount", #Returns the number of acidic groups.
|
27
|
-
"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
|
27
|
+
#"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
|
28
28
|
#"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
|
29
29
|
#"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
|
30
30
|
#"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
|
@@ -56,7 +56,7 @@ UNIQUEDESCRIPTORS = [
|
|
56
56
|
"Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
|
57
57
|
"Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
|
58
58
|
"Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
|
59
|
-
"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
|
59
|
+
#"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
|
60
60
|
"Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration.
|
61
61
|
"Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
|
62
62
|
"Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
|
@@ -75,7 +75,7 @@ UNIQUEDESCRIPTORS = [
|
|
75
75
|
"Joelib.count.NumberOfP", #no description available
|
76
76
|
"Joelib.count.NumberOfO", #no description available
|
77
77
|
"Joelib.count.NumberOfN", #no description available
|
78
|
-
#"
|
78
|
+
#"Joelib.count.AromaticBonds", #no description available
|
79
79
|
"Joelib.count.NumberOfI", #no description available
|
80
80
|
"Joelib.count.NumberOfF", #no description available
|
81
81
|
"Joelib.count.NumberOfC", #no description available
|
@@ -91,7 +91,7 @@ UNIQUEDESCRIPTORS = [
|
|
91
91
|
"Joelib.GeometricalShapeCoefficient", #no description available
|
92
92
|
#"Joelib.MolecularWeight", #no description available
|
93
93
|
"Joelib.FractionRotatableBonds", #no description available
|
94
|
-
#"
|
94
|
+
#"Joelib.count.HBD2", #no description available
|
95
95
|
#"Joelib.count.HBD1", #no description available
|
96
96
|
"Joelib.LogP", #no description available
|
97
97
|
"Joelib.GraphShapeCoefficient", #no description available
|
@@ -116,5 +116,4 @@ UNIQUEDESCRIPTORS = [
|
|
116
116
|
"Joelib.count.SOGroups", #no description available
|
117
117
|
"Joelib.TopologicalDiameter", #no description available
|
118
118
|
"Joelib.count.NumberOfHal", #no description available
|
119
|
-
|
120
|
-
].sort
|
119
|
+
]
|