lazar 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/overwrite.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
require "base64"
|
2
|
+
class Object
  # An object is blank if it is +nil+, +false+, empty, or a string that
  # contains only whitespace.
  # For example, "", " ", +nil+, [], and {} are all blank.
  #
  # @return [Boolean] true if the receiver is blank
  def blank?
    # String#empty? is false for " ", so whitespace-only strings need
    # their own check to honour the documented contract.
    return strip.empty? if is_a?(String)
    respond_to?(:empty?) ? empty? : !self
  end

  # Reports whether the receiver can be converted by Kernel#Float.
  #
  # @return [Boolean] true if Float(self) succeeds, false if it raises
  def numeric?
    Float(self)
    true
  rescue StandardError
    # Float raises ArgumentError/TypeError (and e.g. RangeError for Complex)
    # on values it cannot convert.
    false
  end
end
|
13
|
+
|
14
|
+
class Numeric
  # Expresses the receiver as a percentage of +n+.
  # Both values are converted with +to_f+ before dividing.
  #
  # @param n the reference value (anything responding to +to_f+)
  # @return [Float] receiver / n * 100
  def percent_of(n)
    numerator = to_f
    denominator = n.to_f
    numerator / denominator * 100.0
  end
end
|
19
|
+
|
20
|
+
module Enumerable
  # @return [Array] the distinct elements that occur more than once,
  #   in order of first appearance
  def duplicates
    counts = Hash.new(0)
    each { |item| counts[item] += 1 }
    counts.reject { |_item, count| count == 1 }.keys
  end

  # Most frequent element of the enumerable.
  # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array
  #
  # @return [Object, nil] the element occurring most often (the first such
  #   element on ties), or nil for an empty enumerable
  def mode
    groups = group_by { |item| item }.values
    # max_by returns nil on an empty list; guard so we do not call #first on nil.
    return nil if groups.empty?
    groups.max_by(&:size).first
  end
end
|
34
|
+
|
35
|
+
class String
  # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
  def underscore
    gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
      gsub(/([a-z\d])([A-Z])/, '\1_\2').
      tr("-", "_").
      downcase
  end

  # Convert strings to boolean values.
  # Accepts (case-insensitively, ignoring surrounding whitespace)
  # true/t/yes/y/1 and false/f/no/n/0. Any other value is reported through
  # bad_request_error (defined outside this file section).
  #
  # @return [TrueClass,FalseClass] true or false
  def to_boolean
    candidate = strip
    # Anchor the whole string: an end-only anchor would accept any word that
    # merely ends in one of the tokens (e.g. "forget" -> true, "piano" -> false).
    return true if candidate =~ /\A(true|t|yes|y|1)\z/i
    return false if candidate =~ /\A(false|f|no|n|0)\z/i
    bad_request_error "invalid value for Boolean: \"#{self}\""
  end
end
|
54
|
+
|
55
|
+
class File
  # Determines the MIME type of the file by running the external `file`
  # command (`file -ib`).
  #
  # The command is started without a shell (argument-array form of
  # IO.popen), so paths containing quotes or other shell metacharacters are
  # passed through verbatim; "--" keeps a path starting with "-" from being
  # read as an option.
  #
  # @return [String] mime_type including charset, as printed by `file -ib`
  def mime_type
    IO.popen(["file", "-ib", "--", path], &:read).chomp
  end
end
|
61
|
+
|
62
|
+
class Array

  # Adds up the sizes of the elements of an array (typically an array of arrays).
  # Elements that do not respond to +size+ are reported through
  # internal_server_error.
  # @return [Integer] Sum of size of array elements
  def sum_size
    reduce(0) do |total, element|
      next total + element.size if element.respond_to?('size')
      internal_server_error "No size available: #{element.inspect}"
    end
  end

  # For symbolic features
  # @return [Boolean] Whether the array has just one unique value.
  def zero_variance?
    distinct = uniq
    distinct.length == 1
  end

end
|
85
|
+
|
86
|
+
module URI
|
87
|
+
|
88
|
+
def self.ssl? uri
|
89
|
+
URI.parse(uri).instance_of? URI::HTTPS
|
90
|
+
end
|
91
|
+
|
92
|
+
# @return [Boolean] checks if resource exists by making a HEAD-request
|
93
|
+
def self.accessible?(uri)
|
94
|
+
parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : ""))
|
95
|
+
http_code = URI.task?(uri) ? 600 : 400
|
96
|
+
http = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
|
97
|
+
unless (URI.ssl? uri) == true
|
98
|
+
http = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
|
99
|
+
request = Net::HTTP::Head.new(parsed_uri.request_uri)
|
100
|
+
http.request(request).code.to_i < http_code
|
101
|
+
else
|
102
|
+
http = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
|
103
|
+
http.use_ssl = true
|
104
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
105
|
+
request = Net::HTTP::Head.new(parsed_uri.request_uri)
|
106
|
+
http.request(request).code.to_i < http_code
|
107
|
+
end
|
108
|
+
rescue
|
109
|
+
false
|
110
|
+
end
|
111
|
+
|
112
|
+
def self.valid? uri
|
113
|
+
u = URI.parse(uri)
|
114
|
+
u.scheme!=nil and u.host!=nil
|
115
|
+
rescue URI::InvalidURIError
|
116
|
+
false
|
117
|
+
end
|
118
|
+
|
119
|
+
end
|
data/lib/regression.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
# TODO install R packages kernlab, caret, doMC, class, e1071
|
2
|
+
|
3
|
+
|
4
|
+
# log transform activities (create new dataset)
|
5
|
+
# scale, normalize features, might not be necessary
|
6
|
+
# http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
|
7
|
+
# http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
|
8
|
+
# zero-order correlation and the semi-partial correlation
|
9
|
+
# seems to be necessary for svm
|
10
|
+
# http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
|
11
|
+
# http://stackoverflow.com/questions/15436367/svm-scaling-input-values
|
12
|
+
# use lasso or elastic net??
|
13
|
+
# select relevant features
|
14
|
+
# remove features with a single value
|
15
|
+
# remove correlated features
|
16
|
+
# remove features not correlated with endpoint
|
17
|
+
module OpenTox
|
18
|
+
module Algorithm
|
19
|
+
|
20
|
+
class Regression
|
21
|
+
|
22
|
+
# Similarity-weighted average of the log10-transformed neighbor activities.
#
# @param [Array] neighbors rows of the form [neighbor, similarity, activities];
#   only the similarity and the activities are read here
# @return [Hash] :value is 10**(weighted mean of the log10 activities), or nil
#   when the similarities sum to zero; :confidence is the similarity sum
#   divided by the number of neighbor rows (0.0 for an empty list)
def self.weighted_average(neighbors)
  weighted_sum = 0.0
  sim_sum = 0.0
  neighbors.each do |_neighbor, sim, acts|
    acts.each do |act|
      weighted_sum += sim * Math.log10(act)
      sim_sum += sim
    end
  end
  # Avoid 0.0/0 (NaN) when there are no neighbors at all.
  confidence = neighbors.empty? ? 0.0 : sim_sum / neighbors.size.to_f
  prediction = sim_sum == 0 ? nil : 10**(weighted_sum / sim_sum)
  {:value => prediction, :confidence => confidence}
end
|
36
|
+
|
37
|
+
# Local support vector regression from neighbors.
#
# The first row of +neighbors+ is only used for its properties (row[3]);
# it is passed to local_svm_prop as the first props entry, which that
# method treats as the query. All following rows supply similarity (row[1]),
# activity (row[2]) and properties (row[3]).
#
# @param [Array] neighbors rows as described above; the array is not modified
# @param [Hash] params `:min_train_performance` is passed on to local_svm_prop
# @return [Array] [prediction, confidence]; both are nil if no (finite)
#   prediction could be made
def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
  $logger.debug "Local SVM."
  props = neighbors.collect { |row| row[3] }
  # Work on a copy without the first row instead of shifting the caller's array.
  training = neighbors.drop(1)
  activities = training.collect { |row| row[2] }
  prediction = self.local_svm_prop(props, activities, params[:min_train_performance])
  # Only Floats know infinite?; other prediction values are passed through.
  prediction = nil if prediction.respond_to?(:infinite?) && prediction.infinite?
  $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."

  confidence = 0.0
  if prediction
    confidence = get_confidence({:sims => training.collect { |row| row[1] }, :activities => activities})
  elsif prediction.nil?
    confidence = nil
  end
  [prediction, confidence]
end
|
60
|
+
|
61
|
+
|
62
|
+
# Local support vector prediction from neighbors.
# Uses propositionalized setting.
# Not to be called directly (use local_svm_regression or local_svm_classification).
#
# If all activities are identical that value is returned directly. Otherwise
# an R script (caret `train` with method "svmRadial") is evaluated in one
# Rserve session and the prediction for the query is read back from it.
# Training and prediction must share the session: the prediction step refers
# to the R variables `model`, `q_prop` and `perf` created by the script.
#
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
# @param [Array] activities, activities for neighbors.
# @param [Float] min_train_performance, parameter to control censoring
# @return [Numeric, String, nil] A prediction value, or nil if the model could
#   not be built, performed below min_train_performance, or predicted NA.
def self.local_svm_prop(props, activities, min_train_performance)

  $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
  n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
  q_prop = props[0] # is an Array.

  # Nothing to learn if every neighbor has the same activity.
  return activities[0] if activities.uniq.size == 1

  prediction = nil
  # One connection per call: per the original author's note, a global R
  # instance leads to socket errors after a large number of requests.
  r = Rserve::Connection.new
  begin
    t = Time.now
    rs = ["caret", "doMC", "class"].collect do |lib|
      "suppressPackageStartupMessages(library('#{lib}'))"
    end
    rs << "registerDoMC()" # switch on parallel processing
    rs << "set.seed(1)"

    # set data
    rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
    rs << "n_prop_x_size <- c(#{n_prop.size})"
    rs << "n_prop_y_size <- c(#{n_prop[0].size})"
    rs << "y <- c(#{activities.join(',')})"
    rs << "q_prop <- c(#{q_prop.join(',')})"
    rs << "y = matrix(y)"
    rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
    rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"

    # prepare data
    # NOTE(review): y was just turned into a matrix, so class(y) may never
    # be 'numeric' here and the factor/weights branch may always run.
    # Confirm the intended behaviour for regression endpoints.
    rs << <<-'EOR'
      weights=NULL
      if (!(class(y) == 'numeric')) {
        y = factor(y)
        weights=unlist(as.list(prop.table(table(y))))
        weights=(weights-1)^2
      }
    EOR

    # remove near-zero-variance and correlated features
    rs << <<-'EOR'
      rem = nearZeroVar(prop_matrix)
      if (length(rem) > 0) {
        prop_matrix = prop_matrix[,-rem,drop=F]
        q_prop = q_prop[,-rem,drop=F]
      }
      rem = findCorrelation(cor(prop_matrix))
      if (length(rem) > 0) {
        prop_matrix = prop_matrix[,-rem,drop=F]
        q_prop = q_prop[,-rem,drop=F]
      }
    EOR

    # model + support vectors
    rs << <<-'EOR'
      model = train(prop_matrix,y,
        method="svmRadial",
        preProcess=c("center", "scale"),
        class.weights=weights,
        trControl=trainControl(method="LGOCV",number=10),
        tuneLength=8
      )
      perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
    EOR

    # Train inside the Rserve session so that `model` and `perf` are
    # available to the prediction step below.
    train_success = begin
      r.void_eval rs.join("\n")
      true
    rescue StandardError => e
      $logger.debug "#{e.class}: #{e.message}"
      false
    end
    $logger.debug "Creating R SVM model: #{Time.now-t}"
    t = Time.now

    if train_success
      # prediction
      r.void_eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
      r.void_eval "if (class(y)!='numeric') p = as.character(p)"
      # NOTE(review): assumes REXP#to_ruby yields a scalar for these
      # length-1 R vectors - confirm against the rserve-client version in use.
      prediction = r.eval("p").to_ruby
      perf = r.eval("perf").to_ruby

      # censoring
      unreliable = perf.nil? ||
                   (perf.respond_to?(:nan?) && perf.nan?) ||
                   perf < min_train_performance.to_f
      prediction = nil if unreliable
      prediction = nil if prediction.is_a?(String) && prediction =~ /NA/
      $logger.debug "Performance: '#{sprintf("%.2f", perf)}'" unless perf.nil?
    else
      $logger.debug "Model creation failed."
      prediction = nil
    end
    $logger.debug "R Prediction: #{Time.now-t}"
  rescue StandardError => e
    # Best effort: a failing R session results in "no prediction".
    $logger.debug "#{e.class}: #{e.message}"
    $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
    prediction = nil
  ensure
    # Release the R session; a failure while closing must not mask the result.
    begin
      r.close
    rescue StandardError => e
      $logger.debug "#{e.class}: #{e.message}"
    end
  end
  prediction
end
|
195
|
+
end
|
196
|
+
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module OpenTox

  # Thin wrapper around RestClient::Request that adds input checks and
  # translates HTTP error responses into calls of the matching error method
  # (see known_errors).
  class RestClientWrapper

    attr_accessor :request, :response

    # Subject id added as default :subjectid header to every request.
    @@subjectid = nil

    def self.subjectid=(subjectid)
      @@subjectid = subjectid
    end

    def self.subjectid
      @@subjectid
    end

    # REST methods (head, get, post, put, delete)
    # Calls the error method matching the response code if the call fails
    # Does not wait for task to finish and returns task uri
    # @param [String] destination URI
    # @param [optional,Hash|String] Payload data posted to the service
    # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl
    # @return [RestClient::Response] REST call response
    [:head,:get,:post,:put,:delete].each do |method|

      define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|

        # check input
        bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers.nil? || headers.is_a?(Hash)
        # nil headers are explicitly allowed above; work on a copy so the
        # caller's hash is neither extended nor pruned.
        headers = headers ? headers.dup : {}
        headers[:subjectid] ||= @@subjectid
        bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri

        # make sure that no header parameters are set in the payload,
        # except for requests to the host configured in $aa[:uri].
        # (`defined? $aa || ...` parsed as `defined?($aa || ...)` and was
        # always truthy, so this check never ran.)
        to_aa_host = defined?($aa) && $aa && URI(uri).host == URI($aa[:uri]).host
        if !to_aa_host && payload.is_a?(Hash)
          [:accept,:content_type,:subjectid].each do |header|
            bad_request_error "#{header} should be submitted in the headers", uri if payload[header]
          end
        end

        # create request
        args = {
          :method => method,
          :url => uri,
          :timeout => 1800,
          :payload => payload
        }
        verify_ssl = headers[:verify_ssl]
        args[:verify_ssl] = 0 if verify_ssl.nil? || (verify_ssl.respond_to?(:empty?) && verify_ssl.empty?)
        headers.delete_if { |_key, value| value.nil? } # remove keys with empty values, as this can cause problems
        args[:headers] = headers

        $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post"

        @request = RestClient::Request.new(args)
        # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader)
        @response = @request.execute do |response, request, result|
          if [301, 302, 307].include?(response.code) && request.method == :get
            response.follow_redirection(request, result)
          elsif response.code >= 400 && !URI.task?(uri)
            error = known_errors.find { |e| e[:code] == response.code }
            begin # errors are returned as error reports in json, try to parse
              content = JSON.parse(response)
              msg = content["message"].to_s
              cause = content["errorCause"].to_s
              raise "empty error report" if msg.size==0 && cause.size==0 # parsing failed
            rescue StandardError # parsing error failed, use complete content as message
              msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
              cause = nil
            end
            # Status codes without an entry in known_errors fall back to
            # internal_server_error instead of failing on nil[:method].
            error_method = error ? error[:method] : :internal_server_error
            Object.method(error_method).call msg, uri, cause # call error method
          else
            response
          end
        end
      end
    end

    # Derives error method and class names from RestClient::STATUSES,
    # e.g. 404 "Not Found" -> :not_found_error / "NotFoundError".
    # @return [Array] of hashes with error code, method and class
    def self.known_errors
      failures = RestClient::STATUSES.select { |code, _reason| code >= 400 }
      failures.map do |code, reason|
        method = reason.underscore.gsub(/ |'/, '_')
        method += "_error" unless method.match(/_error$/)
        klass = method.split("_").collect { |s| s.capitalize }.join("")
        {:code => code, :method => method.to_sym, :class => klass}
      end
    end

  end
end
|
data/lib/similarity.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
=begin
|
2
|
+
* Name: similarity.rb
|
3
|
+
* Description: Similarity algorithms
|
4
|
+
* Author: Andreas Maunz <andreas@maunz.de>
|
5
|
+
* Date: 10/2012
|
6
|
+
=end
|
7
|
+
|
8
|
+
module OpenTox
  module Algorithm

    # Similarity measures between fingerprint vectors.
    class Similarity

      #TODO weighted tanimoto

      # Tanimoto similarity, computed as sum of element-wise minima divided
      # by sum of element-wise maxima.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Tanimoto similarity; 0.0 if the maxima sum to zero
      #   (e.g. empty or all-zero fingerprints), matching what cosine
      #   returns for empty input
      def self.tanimoto(a,b)
        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
        # TODO check if calculation speed can be improved
        common_p_sum = 0.0
        all_p_sum = 0.0
        a.zip(b).each do |pair|
          common_p_sum += pair.min
          all_p_sum += pair.max
        end
        # 0.0/0.0 would be NaN
        return 0.0 if all_p_sum.zero?
        common_p_sum/all_p_sum
      end


      # Cosine similarity
      # If both fingerprints have more than 12 elements only the first 12
      # elements of each are compared.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b;
      #   0.0 if either vector is empty or has zero length (norm)
      def self.cosine(a, b)
        return 0.0 unless a.size>0 && b.size>0
        if a.size>12 && b.size>12
          a = a[0..11]
          b = b[0..11]
        end
        # to_gv is not defined in this file; presumably the GSL vector
        # conversion - verify against the project's requires.
        a_vec = a.to_gv
        b_vec = b.to_gv
        norm_product = a_vec.norm * b_vec.norm
        # avoid dividing by zero for zero-length vectors
        return 0.0 if norm_product.zero?
        a_vec.dot(b_vec) / norm_product
      end

    end

  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# set of non redundant descriptors, faster algorithms are preferred
|
2
|
+
# TODO:
|
3
|
+
# select logP algorithm
|
4
|
+
# select l5 algorithm
|
5
|
+
# use smarts matcher for atom counts
|
6
|
+
# check correlations
|
7
|
+
# Sorted, frozen list of the descriptor names in use. Entries that are
# commented out are kept for reference only.
UNIQUEDESCRIPTORS = [
  "Openbabel.abonds", #Number of aromatic bonds
  "Openbabel.atoms", #Number of atoms
  "Openbabel.bonds", #Number of bonds
  "Openbabel.dbonds", #Number of double bonds
  "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
  "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
  "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
  "Openbabel.L5", #Lipinski Rule of Five
  "Openbabel.logP", #octanol/water partition coefficient
  "Openbabel.MP", #Melting point
  "Openbabel.MR", #molar refractivity
  "Openbabel.MW", #Molecular Weight filter
  "Openbabel.nF", #Number of Fluorine Atoms
  "Openbabel.sbonds", #Number of single bonds
  "Openbabel.tbonds", #Number of triple bonds
  "Openbabel.TPSA", #topological polar surface area
  "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen.
  "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
  "Cdk.AcidicGroupCount", #Returns the number of acidic groups.
  "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
  #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
  #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
  #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
  "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges
  "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight
  "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability
  "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al.
  "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
  "Cdk.BasicGroupCount", #Returns the number of basic groups.
  #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order.
  "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information
  "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization
  "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
  "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
  "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
  "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
  "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information.
  "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework
  "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06}
  "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule.
  #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
  #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
  "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
  "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
  "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
  "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
  "Cdk.LargestChain", #Returns the number of atoms in the largest chain
  "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain
  "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
  "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
  "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
  "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms.
  "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Also calculates the radius of gyration.
  "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
  "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described by Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
  "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule.
  #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five.
  #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions.
  "Cdk.VABC", #Describes the volume of a molecule.
  "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule.
  "Cdk.WHIM", #Holistic descriptors described by Todeschini et al.
  #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
  "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
  "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number.
  "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP.
  "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms.
  "Joelib.count.NumberOfS", #no description available
  "Joelib.count.NumberOfP", #no description available
  "Joelib.count.NumberOfO", #no description available
  "Joelib.count.NumberOfN", #no description available
  #"Joelib.count.AromaticBonds", #no description available
  "Joelib.count.NumberOfI", #no description available
  "Joelib.count.NumberOfF", #no description available
  "Joelib.count.NumberOfC", #no description available
  "Joelib.count.NumberOfB", #no description available
  "Joelib.count.HydrophobicGroups", #no description available
  #"Joelib.KierShape3", #no description available
  #"Joelib.KierShape2", #no description available
  #"Joelib.KierShape1", #no description available
  #"Joelib.count.AcidicGroups", #no description available
  "Joelib.count.AliphaticOHGroups", #no description available
  #"Joelib.count.NumberOfAtoms", #no description available
  "Joelib.TopologicalRadius", #no description available
  "Joelib.GeometricalShapeCoefficient", #no description available
  #"Joelib.MolecularWeight", #no description available
  "Joelib.FractionRotatableBonds", #no description available
  #"Joelib.count.HBD2", #no description available
  #"Joelib.count.HBD1", #no description available
  "Joelib.LogP", #no description available
  "Joelib.GraphShapeCoefficient", #no description available
  "Joelib.count.BasicGroups", #no description available
  #"Joelib.count.RotatableBonds", #no description available
  "Joelib.count.HeavyBonds", #no description available
  "Joelib.PolarSurfaceArea", #no description available
  #"Joelib.ZagrebIndex1", #no description available
  "Joelib.GeometricalRadius", #no description available
  "Joelib.count.SO2Groups", #no description available
  "Joelib.count.AromaticOHGroups", #no description available
  "Joelib.GeometricalDiameter", #no description available
  #"Joelib.MolarRefractivity", #no description available
  "Joelib.count.NumberOfCl", #no description available
  "Joelib.count.OSOGroups", #no description available
  "Joelib.count.NumberOfBr", #no description available
  "Joelib.count.NO2Groups", #no description available
  "Joelib.count.HeteroCycles", #no description available
  #"Joelib.count.HBA2", #no description available
  #"Joelib.count.HBA1", #no description available
  #"Joelib.count.NumberOfBonds", #no description available
  "Joelib.count.SOGroups", #no description available
  "Joelib.TopologicalDiameter", #no description available
  "Joelib.count.NumberOfHal", #no description available

].sort.freeze # frozen so the shared constant cannot be modified by accident
|