lazar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
data/lib/overwrite.rb
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
require "base64"
|
|
2
|
+
class Object
  # An object is blank if it's false, empty, or a whitespace string.
  # For example, "", " ", +nil+, [], and {} are all blank.
  #
  # @return [Boolean] true if the receiver is considered blank
  def blank?
    # A String of only whitespace is not #empty?, so it needs its own check
    # to honour the documented contract above.
    return strip.empty? if is_a?(String)
    respond_to?(:empty?) ? empty? : !self
  end

  # Checks whether the receiver can be converted by Kernel#Float.
  #
  # @return [Boolean] true if Float(self) succeeds, false otherwise
  def numeric?
    Float(self)
    true
  rescue ArgumentError, TypeError
    # Float raises ArgumentError for malformed strings and TypeError for
    # objects (such as nil) that cannot be converted at all.
    false
  end
end
|
|
13
|
+
|
|
14
|
+
class Numeric
  # Expresses the receiver as a percentage of +n+.
  #
  # @param n [#to_f] the value that represents 100 percent
  # @return [Float] receiver divided by n, multiplied by 100
  def percent_of(n)
    ratio = to_f / n.to_f
    ratio * 100.0
  end
end
|
|
19
|
+
|
|
20
|
+
module Enumerable
  # @return [Array] only the duplicates of an enumerable, each listed once,
  #   in order of first occurrence
  def duplicates
    counts = Hash.new(0)
    each { |item| counts[item] += 1 }
    counts.select { |_item, count| count > 1 }.keys
  end

  # Most frequent element; on a tie the element that occurred first wins.
  # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array
  #
  # @return [Object, nil] the most common element, nil for an empty enumerable
  def mode
    largest_group = group_by { |item| item }.values.max_by(&:size)
    # max_by yields nil when there are no groups; avoid calling #first on nil.
    largest_group && largest_group.first
  end
end
|
|
34
|
+
|
|
35
|
+
class String
  # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
  def underscore
    self.gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
      gsub(/([a-z\d])([A-Z])/,'\1_\2').
      tr("-", "_").
      downcase
  end

  # Convert strings to boolean values.
  #
  # Accepts (case-insensitively, ignoring surrounding whitespace)
  # true/t/yes/y/1 and false/f/no/n/0. The whole string has to match, so
  # words that merely end in one of these tokens (e.g. "not", "unknown")
  # are rejected instead of being silently mapped to a boolean.
  #
  # Any other value is reported through bad_request_error.
  # @return [TrueClass,FalseClass] true or false
  def to_boolean
    normalized = strip
    return true if normalized =~ /\A(true|t|yes|y|1)\z/i
    return false if normalized =~ /\A(false|f|no|n|0)\z/i
    bad_request_error "invalid value for Boolean: \"#{self}\""
  end

end
|
|
54
|
+
|
|
55
|
+
class File
  # Determines the mime type by running the external `file -ib` command on
  # the path of this file.
  #
  # The command is started without a shell (argument-array form of
  # IO.popen), so quotes or other shell metacharacters in the path cannot
  # break or inject into the command line. "--" keeps a path starting with
  # "-" from being read as an option.
  #
  # @return [String] mime_type including charset, as printed by `file`
  def mime_type
    IO.popen(["file", "-ib", "--", path], &:read).chomp
  end
end
|
|
61
|
+
|
|
62
|
+
class Array

  # Adds up the sizes of the elements of this array (typically an array of
  # arrays). An element that does not respond to +size+ is reported through
  # internal_server_error.
  # @return [Integer] Sum of size of array elements
  def sum_size
    inject(0) do |total, element|
      if element.respond_to?('size')
        total + element.size
      else
        internal_server_error "No size available: #{element.inspect}"
      end
    end
  end

  # For symbolic features
  # @return [Boolean] Whether the array has just one unique value.
  def zero_variance?
    distinct_values = uniq
    distinct_values.size == 1
  end

end
|
|
85
|
+
|
|
86
|
+
module URI

  # @param [String] uri URI to inspect
  # @return [Boolean] true if the URI parses to an https URI
  def self.ssl? uri
    URI.parse(uri).instance_of? URI::HTTPS
  end

  # Checks if a resource exists by making a HEAD-request.
  #
  # The subjectid stored in OpenTox::RestClientWrapper (if any) is passed as
  # a query parameter. Task URIs (URI.task?) count as accessible for every
  # status code below 600, all other URIs for codes below 400.
  # Any error during parsing or the request results in false.
  #
  # @param [String] uri URI of the resource
  # @return [Boolean] whether the HEAD request returned an acceptable status
  def self.accessible?(uri)
    target = uri
    subjectid = OpenTox::RestClientWrapper.subjectid
    if subjectid
      # Append with "&" when the URI already carries a query string.
      separator = uri.include?("?") ? "&" : "?"
      target = "#{uri}#{separator}subjectid=#{CGI.escape subjectid}"
    end
    parsed_uri = URI.parse(target)
    http_code = URI.task?(uri) ? 600 : 400
    http = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    if URI.ssl? uri
      http.use_ssl = true
      # NOTE(review): certificate verification is disabled here, as in the
      # original code. Confirm whether this is still required.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    request = Net::HTTP::Head.new(parsed_uri.request_uri)
    http.request(request).code.to_i < http_code
  rescue
    false
  end

  # @param [String] uri URI to validate
  # @return [Boolean] true if the URI can be parsed and has scheme and host
  def self.valid? uri
    u = URI.parse(uri)
    !u.scheme.nil? && !u.host.nil?
  rescue URI::InvalidURIError
    false
  end

end
|
data/lib/regression.rb
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# TODO install R packages kernlab, caret, doMC, class, e1071
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# log transform activities (create new dataset)
|
|
5
|
+
# scale, normalize features, might not be necessary
|
|
6
|
+
# http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
|
|
7
|
+
# http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
|
|
8
|
+
# zero-order correlation and the semi-partial correlation
|
|
9
|
+
# seems to be necessary for svm
|
|
10
|
+
# http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
|
|
11
|
+
# http://stackoverflow.com/questions/15436367/svm-scaling-input-values
|
|
12
|
+
# use lasso or elastic net??
|
|
13
|
+
# select relevant features
|
|
14
|
+
# remove features with a single value
|
|
15
|
+
# remove correlated features
|
|
16
|
+
# remove features not correlated with endpoint
|
|
17
|
+
module OpenTox
|
|
18
|
+
module Algorithm
|
|
19
|
+
|
|
20
|
+
class Regression
|
|
21
|
+
|
|
22
|
+
# Similarity-weighted average of the log10-transformed neighbor activities.
#
# Each row of +neighbors+ is read as [neighbor, similarity, activities];
# only similarity and activities are used. The prediction is transformed
# back with 10**x.
#
# @param [Array<Array>] neighbors rows of [neighbor, similarity, activities]
# @return [Hash] :value is the prediction (nil if the similarities sum to
#   zero), :confidence is the similarity sum divided by the number of
#   neighbors (0.0 for an empty neighbor list instead of NaN)
def self.weighted_average neighbors
  weighted_sum = 0.0
  sim_sum = 0.0
  neighbors.each do |_neighbor, sim, acts|
    acts.each do |act|
      weighted_sum += sim*Math.log10(act)
      sim_sum += sim
    end
  end
  # 0.0/0 would be NaN, so an empty neighbor list gets zero confidence.
  confidence = neighbors.empty? ? 0.0 : sim_sum/neighbors.size.to_f
  prediction = sim_sum == 0 ? nil : 10**(weighted_sum/sim_sum)
  {:value => prediction,:confidence => confidence}
end
|
|
36
|
+
|
|
37
|
+
# Local support vector regression from neighbors.
#
# Index 3 of every row is collected as the property matrix. The first row
# is then removed from +neighbors+ (the passed array is modified), and
# indexes 2 and 1 of the remaining rows are used as activities and
# similarities.
# NOTE(review): the first row presumably describes the query compound,
# since local_svm_prop treats props[0] as the query - confirm with callers.
#
# @param [Array<Array>] neighbors rows, see above
# @param [Hash] params only `:min_train_performance` is read
# @return [Array] [prediction, confidence]; both are nil if no finite
#   prediction could be made
def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
  $logger.debug "Local SVM."

  props = neighbors.collect { |row| row[3] }
  neighbors.shift
  activities = neighbors.collect { |row| row[2] }

  prediction = self.local_svm_prop(props, activities, params[:min_train_performance])
  # infinite predictions are treated as missing
  prediction = nil unless prediction.nil? || !prediction.infinite?
  $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."

  confidence =
    if prediction
      get_confidence({:sims => neighbors.collect { |row| row[1] }, :activities => activities})
    elsif prediction.nil?
      nil
    else
      0.0
    end

  [prediction, confidence]
end
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Local support vector prediction from neighbors.
# Uses propositionalized setting.
# Not to be called directly (use local_svm_regression or local_svm_classification).
#
# If all activities are identical, that activity is returned without
# involving R. Otherwise an R script (caret, svmRadial) is assembled and
# evaluated in one Rserve session, so that the trained model, the prediction
# and the training performance all live in the same R process.
# Errors raised while talking to R are logged and result in nil.
#
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
# @param [Array] activities, activities for neighbors.
# @param [Float] min_train_performance, parameter to control censoring
# @return [Numeric, String, nil] A prediction value, nil if the model could
#   not be built or its training performance is below min_train_performance
def self.local_svm_prop(props, activities, min_train_performance)

  $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
  n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
  q_prop = props[0] # is an Array.

  # Only one distinct activity: nothing to learn.
  return activities[0] if activities.uniq.size == 1

  prediction = nil
  t = Time.now
  @r = Rserve::Connection.new # global R instance leads to Socket errors after a large number of requests
  begin
    rs = []
    ["caret", "doMC", "class"].each do |lib|
      rs << "suppressPackageStartupMessages(library('#{lib}'))"
    end
    rs << "registerDoMC()" # switch on parallel processing
    rs << "set.seed(1)"

    # set data
    rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
    rs << "n_prop_x_size <- c(#{n_prop.size})"
    rs << "n_prop_y_size <- c(#{n_prop[0].size})"
    rs << "y <- c(#{activities.join(',')})"
    rs << "q_prop <- c(#{q_prop.join(',')})"
    rs << "y = matrix(y)"
    rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
    rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"

    # prepare data
    rs << "
      weights=NULL
      if (!(class(y) == 'numeric')) {
        y = factor(y)
        weights=unlist(as.list(prop.table(table(y))))
        weights=(weights-1)^2
      }
    "

    rs << "
      rem = nearZeroVar(prop_matrix)
      if (length(rem) > 0) {
        prop_matrix = prop_matrix[,-rem,drop=F]
        q_prop = q_prop[,-rem,drop=F]
      }
      rem = findCorrelation(cor(prop_matrix))
      if (length(rem) > 0) {
        prop_matrix = prop_matrix[,-rem,drop=F]
        q_prop = q_prop[,-rem,drop=F]
      }
    "

    # model + support vectors
    rs << '
      model = train(prop_matrix,y,
        method="svmRadial",
        preProcess=c("center", "scale"),
        class.weights=weights,
        trControl=trainControl(method="LGOCV",number=10),
        tuneLength=8
      )
      perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
    '

    # NOTE(review): assumes void_eval raises when R reports an error - confirm
    # against the Rserve client in use.
    @r.void_eval rs.join("\n")
    $logger.debug "Creating R SVM model: #{Time.now-t}"
    t = Time.now

    # prediction
    @r.void_eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
    @r.void_eval "if (class(y)!='numeric') p = as.character(p)"
    prediction = @r.eval("p").to_ruby
    perf = @r.eval("perf").to_ruby

    # censoring
    prediction = nil if perf.nil? || perf.nan? || perf < min_train_performance.to_f
    prediction = nil if prediction.is_a?(String) && prediction =~ /NA/
    $logger.debug "Performance: '#{perf.nil? ? 'NA' : sprintf("%.2f", perf)}'"
    $logger.debug "R Prediction: #{Time.now-t}"
  rescue StandardError => e
    $logger.debug "Model creation failed."
    $logger.debug "#{e.class}: #{e.message}"
    $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
    # never hand out a prediction that has not passed censoring
    prediction = nil
  end
  # TODO: free the R connection (an earlier @r.quit attempt ended in a broken pipe)
  prediction
end
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
module OpenTox

  # Thin wrapper around RestClient that adds the subjectid header, input
  # checks and translation of HTTP error codes into OpenTox error calls.
  class RestClientWrapper

    attr_accessor :request, :response

    @@subjectid = nil

    # @param [String, nil] subjectid default subjectid sent with every request
    def self.subjectid=(subjectid)
      @@subjectid = subjectid
    end

    # @return [String, nil] default subjectid sent with every request
    def self.subjectid
      @@subjectid
    end

    # REST methods
    # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502)
    # Does not wait for task to finish and returns task uri
    # @param [String] destination URI
    # @param [optional,Hash|String] Payload data posted to the service
    # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl
    # @return [RestClient::Response] REST call response
    [:head,:get,:post,:put,:delete].each do |method|

      define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|

        # check input
        bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers.nil? or headers.is_a?(Hash)
        headers ||= {} # nil headers are accepted above, so they must not crash below
        headers[:subjectid] ||= @@subjectid
        bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri
        #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri)

        # make sure that no header parameters are set in the payload;
        # requests to the host configured in $aa[:uri] are exempt.
        # (The former `defined? $aa || ...` was parsed as
        # `defined?($aa || ...)`, which is always truthy, so this check never ran.)
        aa_request = !$aa.nil? && !$aa[:uri].nil? && URI(uri).host == URI($aa[:uri]).host
        unless aa_request
          [:accept,:content_type,:subjectid].each do |header|
            bad_request_error "#{header} should be submitted in the headers", uri if payload.is_a?(Hash) and payload[header]
          end
        end

        # create request
        args={}
        args[:method] = method
        args[:url] = uri
        verify_ssl = headers[:verify_ssl]
        # disable verification unless a non-empty :verify_ssl value was given;
        # values without #empty? (true, Integer) count as given
        args[:verify_ssl] = 0 if verify_ssl.nil? || (verify_ssl.respond_to?(:empty?) && verify_ssl.empty?)
        args[:timeout] = 1800
        args[:payload] = payload
        headers.delete_if{ |_k,v| v.nil? } #remove keys with empty values, as this can cause problems
        args[:headers] = headers

        $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post"

        @request = RestClient::Request.new(args)
        # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader)
        @response = @request.execute do |response, request, result|
          if [301, 302, 307].include? response.code and request.method == :get
            response.follow_redirection(request, result)
          elsif response.code >= 400 and !URI.task?(uri)
            #TODO add (sanitized) request parameters to error-report
            error = known_errors.find{|e| e[:code] == response.code}
            # status codes missing from RestClient::STATUSES are reported as internal server errors
            error_method = error ? error[:method] : :internal_server_error
            begin # errors are returned as error reports in json, try to parse
              # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
              content = JSON.parse(response)
              msg = content["message"].to_s
              cause = content["errorCause"].to_s
              raise if msg.size==0 && cause.size==0 # parsing failed
            rescue # parsing error failed, use complete content as message
              msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
              cause = nil
            end
            Object.method(error_method).call msg, uri, cause # call error method
          else
            response
          end
        end
      end
    end

    # Derives error method and class names from RestClient::STATUSES for
    # all status codes >= 400.
    #@return [Array] of hashes with error code, method and class
    def self.known_errors
      errors = []
      RestClient::STATUSES.each do |code,description|
        next if code < 400
        method = description.underscore.gsub(/ |'/,'_')
        method += "_error" unless method.match(/_error$/)
        klass = method.split("_").collect{|s| s.capitalize}.join("")
        errors << {:code => code, :method => method.to_sym, :class => klass}
      end
      errors
    end

  end
end
|
data/lib/similarity.rb
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
* Name: similarity.rb
|
|
3
|
+
* Description: Similarity algorithms
|
|
4
|
+
* Author: Andreas Maunz <andreas@maunz.de>
|
|
5
|
+
* Date: 10/2012
|
|
6
|
+
=end
|
|
7
|
+
|
|
8
|
+
module OpenTox
  module Algorithm

    class Similarity

      #TODO weighted tanimoto

      # Tanimoto similarity
      #
      # Computed as the sum of the element-wise minima divided by the sum of
      # the element-wise maxima. Fingerprints of different size are reported
      # through bad_request_error.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Tanimoto similarity, 0.0 if the maxima sum to zero
      #   (e.g. two all-zero fingerprints) instead of NaN
      def self.tanimoto(a,b)
        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
        # TODO check if calculation speed can be improved
        common_p_sum = 0.0
        all_p_sum = 0.0
        a.zip(b) do |pair|
          smaller, larger = pair.minmax
          common_p_sum += smaller
          all_p_sum += larger
        end
        return 0.0 if all_p_sum == 0 # avoid 0.0/0.0
        common_p_sum/all_p_sum
      end


      # Cosine similarity
      #
      # If both fingerprints have more than 12 elements, only the first 12
      # elements of each are used.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b;
      #   0.0 if one of the fingerprints is empty
      def self.cosine(a, b)
        val = 0.0
        if a.size>0 and b.size>0
          if a.size>12 && b.size>12
            a = a[0..11]
            b = b[0..11]
          end
          a_vec = a.to_gv
          b_vec = b.to_gv
          val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
        end
        val
      end

    end

  end
end
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# set of non redundant descriptors, faster algorithms are preferred
|
|
2
|
+
# TODO:
|
|
3
|
+
# select logP algorithm
|
|
4
|
+
# select l5 algorithm
|
|
5
|
+
# use smarts matcher for atom counts
|
|
6
|
+
# check correlations
|
|
7
|
+
UNIQUEDESCRIPTORS = [
|
|
8
|
+
"Openbabel.abonds", #Number of aromatic bonds
|
|
9
|
+
"Openbabel.atoms", #Number of atoms
|
|
10
|
+
"Openbabel.bonds", #Number of bonds
|
|
11
|
+
"Openbabel.dbonds", #Number of double bonds
|
|
12
|
+
"Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
|
|
13
|
+
"Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
|
|
14
|
+
"Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
|
|
15
|
+
"Openbabel.L5", #Lipinski Rule of Five
|
|
16
|
+
"Openbabel.logP", #octanol/water partition coefficient
|
|
17
|
+
"Openbabel.MP", #Melting point
|
|
18
|
+
"Openbabel.MR", #molar refractivity
|
|
19
|
+
"Openbabel.MW", #Molecular Weight filter
|
|
20
|
+
"Openbabel.nF", #Number of Fluorine Atoms
|
|
21
|
+
"Openbabel.sbonds", #Number of single bonds
|
|
22
|
+
"Openbabel.tbonds", #Number of triple bonds
|
|
23
|
+
"Openbabel.TPSA", #topological polar surface area
|
|
24
|
+
"Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and
|
|
25
|
+
"Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
|
|
26
|
+
"Cdk.AcidicGroupCount", #Returns the number of acidic groups.
|
|
27
|
+
"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
|
|
28
|
+
#"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
|
|
29
|
+
#"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
|
|
30
|
+
#"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
|
|
31
|
+
"Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges
|
|
32
|
+
"Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight
|
|
33
|
+
"Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability
|
|
34
|
+
"Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. .
|
|
35
|
+
"Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
|
|
36
|
+
"Cdk.BasicGroupCount", #Returns the number of basic groups.
|
|
37
|
+
#"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order.
|
|
38
|
+
"Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information
|
|
39
|
+
"Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization
|
|
40
|
+
"Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
|
|
41
|
+
"Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
|
|
42
|
+
"Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
|
|
43
|
+
"Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
|
|
44
|
+
"Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information.
|
|
45
|
+
"Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework
|
|
46
|
+
"Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06}
|
|
47
|
+
"Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule.
|
|
48
|
+
#"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
|
|
49
|
+
#"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
|
|
50
|
+
"Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
|
|
51
|
+
"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
|
|
52
|
+
"Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
|
|
53
|
+
"Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
|
|
54
|
+
"Cdk.LargestChain", #Returns the number of atoms in the largest chain
|
|
55
|
+
"Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain
|
|
56
|
+
"Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
|
|
57
|
+
"Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
|
|
58
|
+
"Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
|
|
59
|
+
"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
|
|
60
|
+
"Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Also calculates the radius of gyration.
|
|
61
|
+
"Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
|
|
62
|
+
"Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
|
|
63
|
+
"Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule.
|
|
64
|
+
#"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five.
|
|
65
|
+
#"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions .
|
|
66
|
+
"Cdk.VABC", #Describes the volume of a molecule.
|
|
67
|
+
"Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule.
|
|
68
|
+
"Cdk.WHIM", #Holistic descriptors described by Todeschini et al .
|
|
69
|
+
#"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
|
|
70
|
+
"Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
|
|
71
|
+
"Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number.
|
|
72
|
+
"Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP.
|
|
73
|
+
"Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms.
|
|
74
|
+
"Joelib.count.NumberOfS", #no description available
|
|
75
|
+
"Joelib.count.NumberOfP", #no description available
|
|
76
|
+
"Joelib.count.NumberOfO", #no description available
|
|
77
|
+
"Joelib.count.NumberOfN", #no description available
|
|
78
|
+
#"Joelib.count.AromaticBonds", #no description available
|
|
79
|
+
"Joelib.count.NumberOfI", #no description available
|
|
80
|
+
"Joelib.count.NumberOfF", #no description available
|
|
81
|
+
"Joelib.count.NumberOfC", #no description available
|
|
82
|
+
"Joelib.count.NumberOfB", #no description available
|
|
83
|
+
"Joelib.count.HydrophobicGroups", #no description available
|
|
84
|
+
#"Joelib.KierShape3", #no description available
|
|
85
|
+
#"Joelib.KierShape2", #no description available
|
|
86
|
+
#"Joelib.KierShape1", #no description available
|
|
87
|
+
#"Joelib.count.AcidicGroups", #no description available
|
|
88
|
+
"Joelib.count.AliphaticOHGroups", #no description available
|
|
89
|
+
#"Joelib.count.NumberOfAtoms", #no description available
|
|
90
|
+
"Joelib.TopologicalRadius", #no description available
|
|
91
|
+
"Joelib.GeometricalShapeCoefficient", #no description available
|
|
92
|
+
#"Joelib.MolecularWeight", #no description available
|
|
93
|
+
"Joelib.FractionRotatableBonds", #no description available
|
|
94
|
+
#"Joelib.count.HBD2", #no description available
|
|
95
|
+
#"Joelib.count.HBD1", #no description available
|
|
96
|
+
"Joelib.LogP", #no description available
|
|
97
|
+
"Joelib.GraphShapeCoefficient", #no description available
|
|
98
|
+
"Joelib.count.BasicGroups", #no description available
|
|
99
|
+
#"Joelib.count.RotatableBonds", #no description available
|
|
100
|
+
"Joelib.count.HeavyBonds", #no description available
|
|
101
|
+
"Joelib.PolarSurfaceArea", #no description available
|
|
102
|
+
#"Joelib.ZagrebIndex1", #no description available
|
|
103
|
+
"Joelib.GeometricalRadius", #no description available
|
|
104
|
+
"Joelib.count.SO2Groups", #no description available
|
|
105
|
+
"Joelib.count.AromaticOHGroups", #no description available
|
|
106
|
+
"Joelib.GeometricalDiameter", #no description available
|
|
107
|
+
#"Joelib.MolarRefractivity", #no description available
|
|
108
|
+
"Joelib.count.NumberOfCl", #no description available
|
|
109
|
+
"Joelib.count.OSOGroups", #no description available
|
|
110
|
+
"Joelib.count.NumberOfBr", #no description available
|
|
111
|
+
"Joelib.count.NO2Groups", #no description available
|
|
112
|
+
"Joelib.count.HeteroCycles", #no description available
|
|
113
|
+
#"Joelib.count.HBA2", #no description available
|
|
114
|
+
#"Joelib.count.HBA1", #no description available
|
|
115
|
+
#"Joelib.count.NumberOfBonds", #no description available
|
|
116
|
+
"Joelib.count.SOGroups", #no description available
|
|
117
|
+
"Joelib.TopologicalDiameter", #no description available
|
|
118
|
+
"Joelib.count.NumberOfHal", #no description available
|
|
119
|
+
|
|
120
|
+
].sort
|