lazar 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/physchem.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
# Feature for physico-chemical descriptors
|
4
|
+
class PhysChem < NumericFeature
|
5
|
+
|
6
|
+
field :library, type: String
|
7
|
+
field :descriptor, type: String
|
8
|
+
field :description, type: String
|
9
|
+
|
10
|
+
JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
|
11
|
+
CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
|
12
|
+
JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
|
13
|
+
LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
|
14
|
+
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
|
15
|
+
|
16
|
+
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
|
17
|
+
OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
|
18
|
+
name,description = d.split(/\s+/,2)
|
19
|
+
["Openbabel."+name,description] unless obexclude.include? name
|
20
|
+
end.compact.sort{|a,b| a[0] <=> b[0]}]
|
21
|
+
|
22
|
+
cdkdescriptors = {}
|
23
|
+
CDK_DESCRIPTIONS = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
|
24
|
+
CDK_DESCRIPTIONS.each do |d|
|
25
|
+
prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
|
26
|
+
d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
|
27
|
+
end
|
28
|
+
CDKDESCRIPTORS = cdkdescriptors
|
29
|
+
|
30
|
+
# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
|
31
|
+
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
|
32
|
+
# strip Joelib messages from stdout
|
33
|
+
JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
|
34
|
+
name = d[:java_class].sub(/^joelib2.feature.types./,'')
|
35
|
+
["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
|
36
|
+
end.compact.sort{|a,b| a[0] <=> b[0]}]
|
37
|
+
|
38
|
+
DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
|
39
|
+
|
40
|
+
require_relative "unique_descriptors.rb"
|
41
|
+
|
42
|
+
def self.descriptors desc=DESCRIPTORS
|
43
|
+
desc.collect do |name,description|
|
44
|
+
lib,desc = name.split('.',2)
|
45
|
+
self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.unique_descriptors
|
50
|
+
udesc = []
|
51
|
+
UNIQUEDESCRIPTORS.each do |name|
|
52
|
+
lib,desc = name.split('.',2)
|
53
|
+
if lib == "Cdk"
|
54
|
+
CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
|
55
|
+
dname = "#{name}.#{n}"
|
56
|
+
description = DESCRIPTORS[dname]
|
57
|
+
udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
|
58
|
+
end
|
59
|
+
else
|
60
|
+
description = DESCRIPTORS[name]
|
61
|
+
udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
udesc
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.openbabel_descriptors
|
68
|
+
descriptors OBDESCRIPTORS
|
69
|
+
end
|
70
|
+
|
71
|
+
def self.cdk_descriptors
|
72
|
+
descriptors CDKDESCRIPTORS
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.joelib_descriptors
|
76
|
+
descriptors JOELIBDESCRIPTORS
|
77
|
+
end
|
78
|
+
|
79
|
+
def calculate compound
|
80
|
+
result = send library.downcase,descriptor,compound
|
81
|
+
result[self.name]
|
82
|
+
end
|
83
|
+
|
84
|
+
def openbabel descriptor, compound
|
85
|
+
obdescriptor = OpenBabel::OBDescriptor.find_type descriptor
|
86
|
+
obmol = OpenBabel::OBMol.new
|
87
|
+
obconversion = OpenBabel::OBConversion.new
|
88
|
+
obconversion.set_in_format 'smi'
|
89
|
+
obconversion.read_string obmol, compound.smiles
|
90
|
+
{"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))}
|
91
|
+
end
|
92
|
+
|
93
|
+
def cdk descriptor, compound
|
94
|
+
java_descriptor "cdk", descriptor, compound
|
95
|
+
end
|
96
|
+
|
97
|
+
def joelib descriptor, compound
|
98
|
+
java_descriptor "joelib", descriptor, compound
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
def java_descriptor lib, descriptor, compound
|
104
|
+
|
105
|
+
sdf_3d = "/tmp/#{SecureRandom.uuid}.sdf"
|
106
|
+
File.open(sdf_3d,"w+"){|f| f.print compound.sdf}
|
107
|
+
|
108
|
+
# use java system call (rjb blocks within tasks)
|
109
|
+
# use Tempfiles to avoid "Argument list too long" error
|
110
|
+
case lib
|
111
|
+
when "cdk"
|
112
|
+
`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf_3d} #{descriptor}`
|
113
|
+
when "joelib"
|
114
|
+
`java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf_3d} #{descriptor}`
|
115
|
+
end
|
116
|
+
result = YAML.load_file("#{sdf_3d}#{lib}.yaml").first
|
117
|
+
result.keys.each{|k| result[k] = result.delete(k)}
|
118
|
+
result
|
119
|
+
end
|
120
|
+
|
121
|
+
def fix_value val
|
122
|
+
val = val.first if val.is_a? Array and val.size == 1
|
123
|
+
val = nil if val == "NaN"
|
124
|
+
if val.numeric?
|
125
|
+
val = Float(val)
|
126
|
+
val = nil if val.nan? or val.infinite?
|
127
|
+
end
|
128
|
+
val
|
129
|
+
end
|
130
|
+
|
131
|
+
end
|
132
|
+
|
133
|
+
end
|
data/lib/regression.rb
CHANGED
@@ -1,223 +1,151 @@
|
|
1
|
-
# TODO install R packages kernlab, caret, doMC, class, e1071
|
2
|
-
|
3
|
-
|
4
|
-
# log transform activities (create new dataset)
|
5
|
-
# scale, normalize features, might not be necessary
|
6
|
-
# http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
|
7
|
-
# http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
|
8
|
-
# zero-order correlation and the semi-partial correlation
|
9
|
-
# seems to be necessary for svm
|
10
|
-
# http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
|
11
|
-
# http://stackoverflow.com/questions/15436367/svm-scaling-input-values
|
12
|
-
# use lasso or elastic net??
|
13
|
-
# select relevant features
|
14
|
-
# remove features with a single value
|
15
|
-
# remove correlated features
|
16
|
-
# remove features not correlated with endpoint
|
17
1
|
module OpenTox
|
18
2
|
module Algorithm
|
19
3
|
|
20
4
|
class Regression
|
21
5
|
|
22
|
-
def self.
|
6
|
+
def self.local_weighted_average compound, params
|
23
7
|
weighted_sum = 0.0
|
24
8
|
sim_sum = 0.0
|
9
|
+
neighbors = params[:neighbors]
|
25
10
|
neighbors.each do |row|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
11
|
+
sim = row["tanimoto"]
|
12
|
+
if row["features"][params[:prediction_feature_id].to_s]
|
13
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
14
|
+
weighted_sum += sim*Math.log10(act)
|
15
|
+
sim_sum += sim
|
16
|
+
end
|
30
17
|
end
|
31
18
|
end
|
32
|
-
confidence = sim_sum/neighbors.size.to_f
|
33
19
|
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
|
34
|
-
{:value => prediction
|
20
|
+
{:value => prediction}
|
35
21
|
end
|
36
22
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
=
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
23
|
+
# TODO explicit neighbors, also for physchem
|
24
|
+
def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
|
25
|
+
neighbors = params[:neighbors]
|
26
|
+
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
|
27
|
+
activities = []
|
28
|
+
fingerprints = {}
|
29
|
+
weights = []
|
30
|
+
fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
|
31
|
+
|
32
|
+
neighbors.each_with_index do |row,i|
|
33
|
+
neighbor = Compound.find row["_id"]
|
34
|
+
fingerprint = neighbor.fingerprint
|
35
|
+
if row["features"][params[:prediction_feature_id].to_s]
|
36
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
37
|
+
activities << Math.log10(act)
|
38
|
+
weights << row["tanimoto"]
|
39
|
+
fingerprint_ids.each_with_index do |id,j|
|
40
|
+
fingerprints[id] ||= []
|
41
|
+
fingerprints[id] << fingerprint.include?(id)
|
42
|
+
end
|
43
|
+
end
|
53
44
|
end
|
54
45
|
end
|
55
|
-
=end
|
56
|
-
confidence = sim_sum/neighbors.size.to_f
|
57
|
-
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
|
58
|
-
{:value => prediction,:confidence => confidence}
|
59
|
-
end
|
60
46
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
47
|
+
variables = []
|
48
|
+
data_frame = [activities]
|
49
|
+
fingerprints.each do |k,v|
|
50
|
+
unless v.uniq.size == 1
|
51
|
+
data_frame << v.collect{|m| m ? "T" : "F"}
|
52
|
+
variables << k
|
53
|
+
end
|
54
|
+
end
|
65
55
|
|
66
|
-
|
67
|
-
|
56
|
+
if variables.empty?
|
57
|
+
result = local_weighted_average(compound, params)
|
58
|
+
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
|
59
|
+
return result
|
68
60
|
|
69
|
-
$logger.debug "Local SVM."
|
70
|
-
props = neighbors.collect{|row| row[3] }
|
71
|
-
neighbors.shift
|
72
|
-
activities = neighbors.collect{|n| n[2]}
|
73
|
-
prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
74
|
-
prediction = nil if (!prediction.nil? && prediction.infinite?)
|
75
|
-
$logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."
|
76
|
-
if prediction
|
77
|
-
confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities})
|
78
61
|
else
|
79
|
-
|
62
|
+
compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
|
63
|
+
prediction = r_model_prediction method, data_frame, variables, weights, compound_features
|
64
|
+
if prediction.nil? or prediction[:value].nil?
|
65
|
+
prediction = local_weighted_average(compound, params)
|
66
|
+
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
|
67
|
+
return prediction
|
68
|
+
else
|
69
|
+
prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
|
70
|
+
prediction[:value] = 10**prediction[:value]
|
71
|
+
prediction[:rmse] = 10**prediction[:rmse]
|
72
|
+
prediction
|
73
|
+
end
|
80
74
|
end
|
81
|
-
|
82
|
-
|
75
|
+
|
83
76
|
end
|
84
77
|
|
78
|
+
def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
|
79
|
+
|
80
|
+
neighbors = params[:neighbors]
|
81
|
+
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
|
82
|
+
return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
|
83
|
+
|
84
|
+
activities = []
|
85
|
+
weights = []
|
86
|
+
physchem = {}
|
87
|
+
|
88
|
+
neighbors.each_with_index do |row,i|
|
89
|
+
neighbor = Compound.find row["_id"]
|
90
|
+
if row["features"][params[:prediction_feature_id].to_s]
|
91
|
+
row["features"][params[:prediction_feature_id].to_s].each do |act|
|
92
|
+
activities << Math.log10(act)
|
93
|
+
weights << row["tanimoto"] # TODO cosine ?
|
94
|
+
neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
|
95
|
+
physchem[pid] ||= []
|
96
|
+
physchem[pid] << v
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
85
101
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
# @param [Array] activities, activities for neighbors.
|
91
|
-
# @param [Float] min_train_performance, parameter to control censoring
|
92
|
-
# @return [Numeric] A prediction value.
|
93
|
-
def self.local_svm_prop(props, activities, min_train_performance)
|
102
|
+
# remove properties with a single value
|
103
|
+
physchem.each do |pid,v|
|
104
|
+
physchem.delete(pid) if v.uniq.size <= 1
|
105
|
+
end
|
94
106
|
|
95
|
-
|
96
|
-
|
97
|
-
|
107
|
+
if physchem.empty?
|
108
|
+
result = local_weighted_average(compound, params)
|
109
|
+
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
|
110
|
+
return result
|
98
111
|
|
99
|
-
prediction = nil
|
100
|
-
if activities.uniq.size == 1
|
101
|
-
prediction = activities[0]
|
102
112
|
else
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
113
|
+
data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
|
114
|
+
prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
|
115
|
+
if prediction.nil?
|
116
|
+
prediction = local_weighted_average(compound, params)
|
117
|
+
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
|
118
|
+
return prediction
|
119
|
+
else
|
120
|
+
prediction[:value] = 10**prediction[:value]
|
121
|
+
prediction
|
111
122
|
end
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
rs << "set.seed(1)"
|
116
|
-
$logger.debug "Loading R packages: #{Time.now-t}"
|
117
|
-
t = Time.now
|
118
|
-
p n_prop
|
119
|
-
begin
|
120
|
-
|
121
|
-
# set data
|
122
|
-
rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
|
123
|
-
rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
|
124
|
-
rs << "n_prop_x_size <- c(#{n_prop.size})"
|
125
|
-
rs << "n_prop_y_size <- c(#{n_prop[0].size})"
|
126
|
-
rs << "y <- c(#{activities.join(',')})"
|
127
|
-
rs << "q_prop <- c(#{q_prop.join(',')})"
|
128
|
-
rs << "y = matrix(y)"
|
129
|
-
rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
|
130
|
-
rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
|
131
|
-
|
132
|
-
$logger.debug "Setting R data: #{Time.now-t}"
|
133
|
-
t = Time.now
|
134
|
-
# prepare data
|
135
|
-
rs << "
|
136
|
-
weights=NULL
|
137
|
-
if (!(class(y) == 'numeric')) {
|
138
|
-
y = factor(y)
|
139
|
-
weights=unlist(as.list(prop.table(table(y))))
|
140
|
-
weights=(weights-1)^2
|
141
|
-
}
|
142
|
-
"
|
143
|
-
|
144
|
-
rs << "
|
145
|
-
rem = nearZeroVar(prop_matrix)
|
146
|
-
if (length(rem) > 0) {
|
147
|
-
prop_matrix = prop_matrix[,-rem,drop=F]
|
148
|
-
q_prop = q_prop[,-rem,drop=F]
|
149
|
-
}
|
150
|
-
rem = findCorrelation(cor(prop_matrix))
|
151
|
-
if (length(rem) > 0) {
|
152
|
-
prop_matrix = prop_matrix[,-rem,drop=F]
|
153
|
-
q_prop = q_prop[,-rem,drop=F]
|
154
|
-
}
|
155
|
-
"
|
156
|
-
|
157
|
-
#p @r.eval("y").to_ruby
|
158
|
-
#p "weights"
|
159
|
-
#p @r.eval("weights").to_ruby
|
160
|
-
$logger.debug "Preparing R data: #{Time.now-t}"
|
161
|
-
t = Time.now
|
162
|
-
# model + support vectors
|
163
|
-
#train_success = @r.eval <<-EOR
|
164
|
-
rs << '
|
165
|
-
model = train(prop_matrix,y,
|
166
|
-
method="svmRadial",
|
167
|
-
preProcess=c("center", "scale"),
|
168
|
-
class.weights=weights,
|
169
|
-
trControl=trainControl(method="LGOCV",number=10),
|
170
|
-
tuneLength=8
|
171
|
-
)
|
172
|
-
perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
|
173
|
-
'
|
174
|
-
File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
|
175
|
-
p rs.join("\n")
|
176
|
-
p `Rscript /tmp/r.r`
|
177
|
-
=begin
|
178
|
-
@r.void_eval <<-EOR
|
179
|
-
model = train(prop_matrix,y,
|
180
|
-
method="svmRadial",
|
181
|
-
#preProcess=c("center", "scale"),
|
182
|
-
#class.weights=weights,
|
183
|
-
#trControl=trainControl(method="LGOCV",number=10),
|
184
|
-
#tuneLength=8
|
185
|
-
)
|
186
|
-
perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
|
187
|
-
EOR
|
188
|
-
=end
|
189
|
-
|
190
|
-
$logger.debug "Creating R SVM model: #{Time.now-t}"
|
191
|
-
t = Time.now
|
192
|
-
if train_success
|
193
|
-
# prediction
|
194
|
-
@r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
|
195
|
-
#@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
|
196
|
-
@r.eval "if (class(y)!='numeric') p = as.character(p)"
|
197
|
-
prediction = @r.p
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
198
126
|
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
$logger.debug "#{e.class}: #{e.message}"
|
210
|
-
$logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
211
|
-
ensure
|
212
|
-
#puts @r.inspect
|
213
|
-
#TODO: broken pipe
|
214
|
-
#@r.quit # free R
|
215
|
-
end
|
127
|
+
def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
|
128
|
+
R.assign "weights", training_weights
|
129
|
+
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
|
130
|
+
R.eval "data <- #{r_data_frame}"
|
131
|
+
R.assign "features", training_features
|
132
|
+
R.eval "names(data) <- append(c('activities'),features)" #
|
133
|
+
begin
|
134
|
+
R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
|
135
|
+
rescue
|
136
|
+
return nil
|
216
137
|
end
|
217
|
-
|
138
|
+
R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
|
139
|
+
R.eval "names(fingerprint) <- features"
|
140
|
+
R.eval "prediction <- predict(model,fingerprint)"
|
141
|
+
{
|
142
|
+
:value => R.eval("prediction").to_f,
|
143
|
+
:rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
|
144
|
+
:r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
|
145
|
+
}
|
218
146
|
end
|
219
|
-
end
|
220
147
|
|
148
|
+
end
|
221
149
|
end
|
222
150
|
end
|
223
151
|
|
data/lib/rest-client-wrapper.rb
CHANGED
@@ -26,15 +26,14 @@ module OpenTox
|
|
26
26
|
define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|
|
27
27
|
|
28
28
|
# check input
|
29
|
-
bad_request_error "Headers are not a hash: #{headers.inspect}
|
29
|
+
bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash)
|
30
30
|
headers[:subjectid] ||= @@subjectid
|
31
|
-
bad_request_error "Invalid URI: '#{uri}'"
|
32
|
-
#resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri)
|
31
|
+
bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri
|
33
32
|
# make sure that no header parameters are set in the payload
|
34
33
|
[:accept,:content_type,:subjectid].each do |header|
|
35
34
|
if defined? $aa || URI(uri).host == URI($aa[:uri]).host
|
36
35
|
else
|
37
|
-
bad_request_error "#{header} should be submitted in the headers
|
36
|
+
bad_request_error "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header]
|
38
37
|
end
|
39
38
|
end
|
40
39
|
|
@@ -72,7 +71,7 @@ module OpenTox
|
|
72
71
|
msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
|
73
72
|
cause = nil
|
74
73
|
end
|
75
|
-
Object.method(error[:method]).call msg, uri, cause # call error method
|
74
|
+
Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method
|
76
75
|
else
|
77
76
|
response
|
78
77
|
end
|
data/lib/unique_descriptors.rb
CHANGED
@@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [
|
|
12
12
|
"Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
|
13
13
|
"Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
|
14
14
|
"Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
|
15
|
-
"
|
15
|
+
#"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
|
16
16
|
"Openbabel.logP", #octanol/water partition coefficient
|
17
17
|
"Openbabel.MP", #Melting point
|
18
18
|
"Openbabel.MR", #molar refractivity
|
@@ -24,7 +24,7 @@ UNIQUEDESCRIPTORS = [
|
|
24
24
|
"Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and
|
25
25
|
"Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
|
26
26
|
"Cdk.AcidicGroupCount", #Returns the number of acidic groups.
|
27
|
-
"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
|
27
|
+
#"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
|
28
28
|
#"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
|
29
29
|
#"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
|
30
30
|
#"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
|
@@ -56,7 +56,7 @@ UNIQUEDESCRIPTORS = [
|
|
56
56
|
"Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
|
57
57
|
"Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
|
58
58
|
"Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
|
59
|
-
"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
|
59
|
+
#"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
|
60
60
|
"Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration.
|
61
61
|
"Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
|
62
62
|
"Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
|
@@ -75,7 +75,7 @@ UNIQUEDESCRIPTORS = [
|
|
75
75
|
"Joelib.count.NumberOfP", #no description available
|
76
76
|
"Joelib.count.NumberOfO", #no description available
|
77
77
|
"Joelib.count.NumberOfN", #no description available
|
78
|
-
#"
|
78
|
+
#"Joelib.count.AromaticBonds", #no description available
|
79
79
|
"Joelib.count.NumberOfI", #no description available
|
80
80
|
"Joelib.count.NumberOfF", #no description available
|
81
81
|
"Joelib.count.NumberOfC", #no description available
|
@@ -91,7 +91,7 @@ UNIQUEDESCRIPTORS = [
|
|
91
91
|
"Joelib.GeometricalShapeCoefficient", #no description available
|
92
92
|
#"Joelib.MolecularWeight", #no description available
|
93
93
|
"Joelib.FractionRotatableBonds", #no description available
|
94
|
-
#"
|
94
|
+
#"Joelib.count.HBD2", #no description available
|
95
95
|
#"Joelib.count.HBD1", #no description available
|
96
96
|
"Joelib.LogP", #no description available
|
97
97
|
"Joelib.GraphShapeCoefficient", #no description available
|
@@ -116,5 +116,4 @@ UNIQUEDESCRIPTORS = [
|
|
116
116
|
"Joelib.count.SOGroups", #no description available
|
117
117
|
"Joelib.TopologicalDiameter", #no description available
|
118
118
|
"Joelib.count.NumberOfHal", #no description available
|
119
|
-
|
120
|
-
].sort
|
119
|
+
]
|