lazar 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/overwrite.rb ADDED
@@ -0,0 +1,119 @@
1
+ require "base64"
2
class Object
  # An object is blank if it's false, empty, or a whitespace string.
  # For example, "", " ", +nil+, [], and {} are all blank.
  # @return [Boolean] true if the receiver is blank
  def blank?
    # String#empty? is false for " ", so whitespace-only strings need
    # their own check to match the documented contract.
    return strip.empty? if is_a?(String)
    respond_to?(:empty?) ? empty? : !self
  end

  # Tells whether the receiver can be converted by Kernel#Float.
  # @return [Boolean] true if Float(self) succeeds, false if it raises
  def numeric?
    Float(self)
    true
  rescue StandardError
    false
  end
end
13
+
14
class Numeric
  # Expresses the receiver as a percentage of +n+.
  # @param n [#to_f] the value that corresponds to 100 percent
  # @return [Float] the receiver divided by n, multiplied by 100
  def percent_of(n)
    fraction = to_f / n.to_f
    fraction * 100.0
  end
end
19
+
20
module Enumerable
  # @return [Array] only the duplicates of an enumerable, i.e. each distinct
  #   element that occurs more than once
  def duplicates
    counts = Hash.new(0)
    each { |element| counts[element] += 1 }
    counts.select { |_element, count| count > 1 }.keys
  end

  # Most frequent element, see
  # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array
  # @return [Object, nil] the most common element, or nil if there are no elements
  def mode
    largest_group = group_by { |element| element }.values.max_by(&:size)
    # max_by returns nil when there are no groups; do not call #first on nil.
    largest_group && largest_group.first
  end
end
34
+
35
class String
  # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
  def underscore
    gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
      gsub(/([a-z\d])([A-Z])/, '\1_\2').
      tr("-", "_").
      downcase
  end

  # Convert a string to a boolean value.
  # Accepts true/t/yes/y/1 and false/f/no/n/0, case-insensitively and with
  # optional surrounding whitespace. The whole string must match: the patterns
  # are anchored at both ends so that values that merely end in one of the
  # tokens (e.g. "float", "1.0") are rejected instead of being misread.
  # Any other value is passed to bad_request_error.
  # @return [TrueClass,FalseClass] true or false
  def to_boolean
    return true if self =~ /\A\s*(?:true|t|yes|y|1)\s*\z/i
    return false if self =~ /\A\s*(?:false|f|no|n|0)\s*\z/i
    bad_request_error "invalid value for Boolean: \"#{self}\""
  end

end
54
+
55
class File
  # @return [String] mime_type including charset, as printed by the
  #   `file -ib` command for this file's path
  def mime_type
    # The command is given as an argument array so that no shell parses the
    # path; quotes or other shell metacharacters in the path are passed
    # through literally. "--" keeps a path starting with "-" from being
    # read as an option.
    IO.popen(["file", "-ib", "--", path], &:read).chomp
  end
end
61
+
62
class Array

  # Adds up the sizes of the elements of this array (e.g. an array of arrays).
  # An element that does not respond to `size` is reported through
  # internal_server_error.
  # @return [Integer] Sum of size of array elements
  def sum_size
    total = 0
    each do |member|
      total = if member.respond_to?('size')
                total + member.size
              else
                internal_server_error "No size available: #{member.inspect}"
              end
    end
    total
  end

  # For symbolic features
  # @return [Boolean] Whether the array has just one unique value.
  def zero_variance?
    distinct = uniq
    distinct.size == 1
  end

end
85
+
86
module URI

  # @param [String] uri URI to inspect
  # @return [Boolean] true if the URI parses to an instance of URI::HTTPS
  def self.ssl? uri
    URI.parse(uri).instance_of? URI::HTTPS
  end

  # Checks if a resource exists by making a HEAD-request.
  # If OpenTox::RestClientWrapper.subjectid is set it is sent as a
  # `subjectid` query parameter. Response codes below 400 count as accessible
  # (below 600 when URI.task? is true for the URI). Any error raised along
  # the way is rescued and reported as "not accessible".
  # @param [String] uri URI to check
  # @return [Boolean] whether the HEAD request succeeded with an accepted code
  def self.accessible?(uri)
    target = uri
    subjectid = OpenTox::RestClientWrapper.subjectid
    if subjectid
      # A URI that already has a query string needs "&"; a second "?" would
      # become part of the preceding parameter's value.
      separator = uri.include?("?") ? "&" : "?"
      target = "#{uri}#{separator}subjectid=#{CGI.escape subjectid}"
    end
    parsed_uri = URI.parse(target)
    http_code = URI.task?(uri) ? 600 : 400
    http = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    if URI.ssl? uri
      http.use_ssl = true
      # NOTE(review): certificate verification is switched off here, as in
      # the original code; confirm this is still intended.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    request = Net::HTTP::Head.new(parsed_uri.request_uri)
    http.request(request).code.to_i < http_code
  rescue
    false
  end

  # @param [String] uri URI to check
  # @return [Boolean] true if the URI can be parsed and has both a scheme and a host
  def self.valid? uri
    parsed = URI.parse(uri)
    !parsed.scheme.nil? && !parsed.host.nil?
  rescue URI::InvalidURIError
    false
  end

end
data/lib/regression.rb ADDED
@@ -0,0 +1,199 @@
1
+ # TODO install R packages kernlab, caret, doMC, class, e1071
2
+
3
+
4
+ # log transform activities (create new dataset)
5
+ # scale, normalize features, might not be necessary
6
+ # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
7
+ # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
8
+ # zero-order correlation and the semi-partial correlation
9
+ # seems to be necessary for svm
10
+ # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
11
+ # http://stackoverflow.com/questions/15436367/svm-scaling-input-values
12
+ # use lasso or elastic net??
13
+ # select relevant features
14
+ # remove features with a single value
15
+ # remove correlated features
16
+ # remove features not correlated with endpoint
17
+ module OpenTox
18
+ module Algorithm
19
+
20
+ class Regression
21
+
22
# Similarity-weighted average of neighbor activities, computed on a log10
# scale and transformed back.
# @param [Array] neighbors rows of the form [neighbor, similarity, activities];
#   only the similarity and the list of activities are read
# @return [Hash] :value is the prediction (nil when the similarities sum to
#   zero), :confidence is the similarity sum divided by the number of
#   neighbors (0.0 for an empty neighbor list)
def self.weighted_average neighbors
  weighted_sum = 0.0
  sim_sum = 0.0
  neighbors.each do |_neighbor, sim, acts|
    acts.each do |act|
      weighted_sum += sim*Math.log10(act)
      sim_sum += sim
    end
  end
  # Without this guard an empty neighbor list gives 0.0/0.0 = NaN.
  confidence = neighbors.empty? ? 0.0 : sim_sum/neighbors.size.to_f
  prediction = sim_sum == 0 ? nil : 10**(weighted_sum/sim_sum)
  {:value => prediction,:confidence => confidence}
end
36
+
37
# Local support vector regression from neighbors.
# @param [Array] neighbors rows from which index 1 (similarity), index 2
#   (activity) and index 3 (properties) are read. Properties are collected
#   from all rows; the first row is then removed from the array in place, so
#   activities and similarities come from the remaining rows only.
# @param [Hash] params only `:min_train_performance` is read
# @return [Array] [prediction, confidence]; both are nil when no prediction
#   is available or the prediction is infinite
def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
  $logger.debug "Local SVM."
  props = neighbors.map { |row| row[3] }
  neighbors.shift
  activities = neighbors.map { |row| row[2] }

  prediction = local_svm_prop( props, activities, params[:min_train_performance])
  prediction = nil unless prediction.nil? || !prediction.infinite?
  $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."

  confidence =
    if prediction
      similarities = neighbors.map { |row| row[1] }
      get_confidence({:sims => similarities, :activities => activities})
    end
  [prediction, confidence]
end
60
+
61
+
62
+ # Local support vector prediction from neighbors.
63
+ # Uses propositionalized setting. If all activities are identical, that activity is returned and no model is built.
64
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
65
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
66
+ # @param [Array] activities, activities for neighbors.
67
+ # @param [Float] min_train_performance, parameter to control censoring
68
+ # @return [Numeric, nil] A prediction value; nil when the model step raises (the exception is logged, not re-raised) or the prediction is censored.
69
+ def self.local_svm_prop(props, activities, min_train_performance)
70
+
71
+ $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
72
+ n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
73
+ q_prop = props[0] # is an Array.
74
+
75
+ prediction = nil
76
+ if activities.uniq.size == 1
77
+ prediction = activities[0]
78
+ else
79
+ t = Time.now
80
+ #$logger.debug gram_matrix.to_yaml
81
+ #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
82
+ @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests
83
+ rs = [] # R statements are collected here and written to a script file further down
84
+ ["caret", "doMC", "class"].each do |lib|
85
+ #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))"
86
+ rs << "suppressPackageStartupMessages(library('#{lib}'))"
87
+ end
88
+ #@r.eval "registerDoMC()" # switch on parallel processing
89
+ rs << "registerDoMC()" # switch on parallel processing
90
+ #@r.eval "set.seed(1)"
91
+ rs << "set.seed(1)"
92
+ $logger.debug "Loading R packages: #{Time.now-t}"
93
+ t = Time.now
94
+ p n_prop # NOTE(review): prints to stdout; looks like leftover debugging output
95
+ begin
96
+
97
+ # set data
98
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
99
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})" # NOTE(review): identical to the previous statement
100
+ rs << "n_prop_x_size <- c(#{n_prop.size})"
101
+ rs << "n_prop_y_size <- c(#{n_prop[0].size})"
102
+ rs << "y <- c(#{activities.join(',')})"
103
+ rs << "q_prop <- c(#{q_prop.join(',')})"
104
+ rs << "y = matrix(y)"
105
+ rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
106
+ rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
107
+
108
+ $logger.debug "Setting R data: #{Time.now-t}"
109
+ t = Time.now
110
+ # prepare data
111
+ rs << "
112
+ weights=NULL
113
+ if (!(class(y) == 'numeric')) {
114
+ y = factor(y)
115
+ weights=unlist(as.list(prop.table(table(y))))
116
+ weights=(weights-1)^2
117
+ }
118
+ "
119
+
120
+ rs << "
121
+ rem = nearZeroVar(prop_matrix)
122
+ if (length(rem) > 0) {
123
+ prop_matrix = prop_matrix[,-rem,drop=F]
124
+ q_prop = q_prop[,-rem,drop=F]
125
+ }
126
+ rem = findCorrelation(cor(prop_matrix))
127
+ if (length(rem) > 0) {
128
+ prop_matrix = prop_matrix[,-rem,drop=F]
129
+ q_prop = q_prop[,-rem,drop=F]
130
+ }
131
+ "
132
+
133
+ #p @r.eval("y").to_ruby
134
+ #p "weights"
135
+ #p @r.eval("weights").to_ruby
136
+ $logger.debug "Preparing R data: #{Time.now-t}"
137
+ t = Time.now
138
+ # model + support vectors
139
+ #train_success = @r.eval <<-EOR
140
+ rs << '
141
+ model = train(prop_matrix,y,
142
+ method="svmRadial",
143
+ preProcess=c("center", "scale"),
144
+ class.weights=weights,
145
+ trControl=trainControl(method="LGOCV",number=10),
146
+ tuneLength=8
147
+ )
148
+ perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
149
+ '
150
+ File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")} # writes the collected R statements to a fixed path
151
+ p rs.join("\n")
152
+ p `Rscript /tmp/r.r` # runs the script in a separate Rscript process and prints its output; the output is not stored
153
+ =begin
154
+ @r.void_eval <<-EOR
155
+ model = train(prop_matrix,y,
156
+ method="svmRadial",
157
+ #preProcess=c("center", "scale"),
158
+ #class.weights=weights,
159
+ #trControl=trainControl(method="LGOCV",number=10),
160
+ #tuneLength=8
161
+ )
162
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
163
+ EOR
164
+ =end
165
+
166
+ $logger.debug "Creating R SVM model: #{Time.now-t}"
167
+ t = Time.now
168
+ if train_success # NOTE(review): train_success is not assigned in this method (its assignment above is commented out); unless it is defined elsewhere this raises NameError, which the rescue below only logs
169
+ # prediction
170
+ @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
171
+ #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
172
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
173
+ prediction = @r.p
174
+
175
+ # censoring
176
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
177
+ prediction = nil if prediction =~ /NA/
178
+ $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
179
+ else
180
+ $logger.debug "Model creation failed."
181
+ prediction = nil
182
+ end
183
+ $logger.debug "R Prediction: #{Time.now-t}"
184
+ rescue Exception => e # NOTE(review): rescues every exception class, not only StandardError; the error is logged and the method continues to return the current prediction
185
+ $logger.debug "#{e.class}: #{e.message}"
186
+ $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
187
+ ensure
188
+ #puts @r.inspect
189
+ #TODO: broken pipe
190
+ #@r.quit # free R
191
+ end
192
+ end
193
+ prediction
194
+ end
195
+ end
196
+
197
+ end
198
+ end
199
+
@@ -0,0 +1,98 @@
1
module OpenTox

  # Class-level REST helpers (head, get, post, put, delete) built on
  # RestClient::Request, with a subject id that is shared by all calls.
  class RestClientWrapper

    # NOTE(review): the REST methods below run with the class as self, so the
    # @request/@response they assign are set on the class, not on instances;
    # these instance accessors do not read them.
    attr_accessor :request, :response

    @@subjectid = nil

    # @param [String] subjectid value sent as the :subjectid header when a
    #   call does not provide its own
    def self.subjectid=(subjectid)
      @@subjectid = subjectid
    end

    # @return [String, nil] the shared subject id
    def self.subjectid
      @@subjectid
    end

    # REST methods
    # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502)
    # Does not wait for task to finish and returns task uri
    # @param [String] destination URI
    # @param [optional,Hash|String] Payload data posted to the service
    # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl
    # @return [RestClient::Response] REST call response
    [:head,:get,:post,:put,:delete].each do |method|

      define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|

        # check input
        bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers.nil? || headers.is_a?(Hash)
        # nil headers are accepted by the check above, so treat them as empty.
        # Work on a copy so the caller's hash is neither extended nor pruned.
        headers = headers ? headers.dup : {}
        headers[:subjectid] ||= @@subjectid
        bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri
        # make sure that no header parameters are set in the payload
        # NOTE(review): `defined?` binds looser than `||`, so the condition is
        # defined?(<expression>), which is always truthy, and the else branch
        # never runs. Left unchanged on purpose: enabling the check would start
        # rejecting payloads that are accepted today.
        [:accept,:content_type,:subjectid].each do |header|
          if defined? $aa || URI(uri).host == URI($aa[:uri]).host
          else
            bad_request_error "#{header} should be submitted in the headers", uri if payload and payload.is_a?(Hash) and payload[header]
          end
        end

        # create request
        args={}
        args[:method] = method
        args[:url] = uri
        # :verify_ssl may be a String, a boolean or a number; only call
        # #empty? on values that support it.
        verify_ssl = headers[:verify_ssl]
        args[:verify_ssl] = 0 if verify_ssl.nil? || (verify_ssl.respond_to?(:empty?) && verify_ssl.empty?)
        args[:timeout] = 1800
        args[:payload] = payload
        headers.delete_if { |_key, value| value.nil? } #remove keys with empty values, as this can cause problems
        args[:headers] = headers

        $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post"

        @request = RestClient::Request.new(args)
        # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader)
        @response = @request.execute do |response, request, result|
          if [301, 302, 307].include?(response.code) && request.method == :get
            response.follow_redirection(request, result)
          elsif response.code >= 400 && !URI.task?(uri)
            #TODO add parameters to error-report
            error = known_errors.find { |e| e[:code] == response.code }
            begin # errors are returned as error reports in json, try to parse
              # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
              content = JSON.parse(response)
              msg = content["message"].to_s
              cause = content["errorCause"].to_s
              raise if msg.size==0 && cause.size==0 # parsing failed
            rescue # parsing error failed, use complete content as message
              msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
              cause = nil
            end
            Object.method(error[:method]).call msg, uri, cause # call error method
          else
            response
          end
        end
      end
    end

    #@return [Array] of hashes with error code, method and class
    def self.known_errors
      failures = RestClient::STATUSES.select { |code, _description| code >= 400 }
      failures.map do |code, description|
        method = description.underscore.gsub(/ |'/,'_')
        method += "_error" unless method.match(/_error$/)
        klass = method.split("_").collect{|s| s.capitalize}.join("")
        {:code => code, :method => method.to_sym, :class => klass}
      end
    end

  end
end
data/lib/similarity.rb ADDED
@@ -0,0 +1,58 @@
1
+ =begin
2
+ * Name: similarity.rb
3
+ * Description: Similarity algorithms
4
+ * Author: Andreas Maunz <andreas@maunz.de>
5
+ * Date: 10/2012
6
+ =end
7
+
8
module OpenTox
  module Algorithm

    # Similarity measures for pairs of fingerprint arrays.
    class Similarity

      #TODO weighted tanimoto

      # Tanimoto similarity: sum of the element-wise minima divided by the
      # sum of the element-wise maxima.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Tanimoto similarity
      def self.tanimoto(a,b)
        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
        # TODO check if calculation speed can be improved
        common_p_sum = 0.0
        all_p_sum = 0.0
        a.size.times do |position|
          pair = [ a[position], b[position] ]
          common_p_sum += pair.min
          all_p_sum += pair.max
        end
        common_p_sum/all_p_sum
      end


      # Cosine similarity
      # Only the first 12 elements are used when both inputs have more than 12.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b;
      #   0.0 if either input is empty
      def self.cosine(a, b)
        return 0.0 unless a.size>0 and b.size>0
        if a.size>12 && b.size>12
          a = a[0..11]
          b = b[0..11]
        end
        a_vec = a.to_gv
        b_vec = b.to_gv
        a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
      end

    end

  end
end
@@ -0,0 +1,120 @@
1
+ # set of non redundant descriptors, faster algorithms are preferred
2
+ # TODO:
3
+ # select logP algorithm
4
+ # select l5 algorithm
5
+ # use smarts matcher for atom counts
6
+ # check correlations
7
+ UNIQUEDESCRIPTORS = [
8
+ "Openbabel.abonds", #Number of aromatic bonds
9
+ "Openbabel.atoms", #Number of atoms
10
+ "Openbabel.bonds", #Number of bonds
11
+ "Openbabel.dbonds", #Number of double bonds
12
+ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
13
+ "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
14
+ "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
15
+ "Openbabel.L5", #Lipinski Rule of Five
16
+ "Openbabel.logP", #octanol/water partition coefficient
17
+ "Openbabel.MP", #Melting point
18
+ "Openbabel.MR", #molar refractivity
19
+ "Openbabel.MW", #Molecular Weight filter
20
+ "Openbabel.nF", #Number of Fluorine Atoms
21
+ "Openbabel.sbonds", #Number of single bonds
22
+ "Openbabel.tbonds", #Number of triple bonds
23
+ "Openbabel.TPSA", #topological polar surface area
24
+ "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen.
25
+ "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
26
+ "Cdk.AcidicGroupCount", #Returns the number of acidic groups.
27
+ "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
28
+ #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
29
+ #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
30
+ #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
31
+ "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges
32
+ "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight
33
+ "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability
34
+ "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. .
35
+ "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
36
+ "Cdk.BasicGroupCount", #Returns the number of basic groups.
37
+ #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order.
38
+ "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information
39
+ "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization
40
+ "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
41
+ "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
42
+ "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
43
+ "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
44
+ "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information.
45
+ "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework
46
+ "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06}
47
+ "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule.
48
+ #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
49
+ #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
50
+ "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
51
+ "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
52
+ "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
53
+ "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
54
+ "Cdk.LargestChain", #Returns the number of atoms in the largest chain
55
+ "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain
56
+ "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
57
+ "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
58
+ "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
59
+ "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
60
+ "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Also calculates the radius of gyration.
61
+ "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
62
+ "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
63
+ "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule.
64
+ #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five.
65
+ #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions .
66
+ "Cdk.VABC", #Describes the volume of a molecule.
67
+ "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule.
68
+ "Cdk.WHIM", #Holistic descriptors described by Todeschini et al .
69
+ #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
70
+ "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
71
+ "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number.
72
+ "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP.
73
+ "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms.
74
+ "Joelib.count.NumberOfS", #no description available
75
+ "Joelib.count.NumberOfP", #no description available
76
+ "Joelib.count.NumberOfO", #no description available
77
+ "Joelib.count.NumberOfN", #no description available
78
+ #"Joelib.count.AromaticBonds", #no description available
79
+ "Joelib.count.NumberOfI", #no description available
80
+ "Joelib.count.NumberOfF", #no description available
81
+ "Joelib.count.NumberOfC", #no description available
82
+ "Joelib.count.NumberOfB", #no description available
83
+ "Joelib.count.HydrophobicGroups", #no description available
84
+ #"Joelib.KierShape3", #no description available
85
+ #"Joelib.KierShape2", #no description available
86
+ #"Joelib.KierShape1", #no description available
87
+ #"Joelib.count.AcidicGroups", #no description available
88
+ "Joelib.count.AliphaticOHGroups", #no description available
89
+ #"Joelib.count.NumberOfAtoms", #no description available
90
+ "Joelib.TopologicalRadius", #no description available
91
+ "Joelib.GeometricalShapeCoefficient", #no description available
92
+ #"Joelib.MolecularWeight", #no description available
93
+ "Joelib.FractionRotatableBonds", #no description available
94
+ #"Joelib.count.HBD2", #no description available
95
+ #"Joelib.count.HBD1", #no description available
96
+ "Joelib.LogP", #no description available
97
+ "Joelib.GraphShapeCoefficient", #no description available
98
+ "Joelib.count.BasicGroups", #no description available
99
+ #"Joelib.count.RotatableBonds", #no description available
100
+ "Joelib.count.HeavyBonds", #no description available
101
+ "Joelib.PolarSurfaceArea", #no description available
102
+ #"Joelib.ZagrebIndex1", #no description available
103
+ "Joelib.GeometricalRadius", #no description available
104
+ "Joelib.count.SO2Groups", #no description available
105
+ "Joelib.count.AromaticOHGroups", #no description available
106
+ "Joelib.GeometricalDiameter", #no description available
107
+ #"Joelib.MolarRefractivity", #no description available
108
+ "Joelib.count.NumberOfCl", #no description available
109
+ "Joelib.count.OSOGroups", #no description available
110
+ "Joelib.count.NumberOfBr", #no description available
111
+ "Joelib.count.NO2Groups", #no description available
112
+ "Joelib.count.HeteroCycles", #no description available
113
+ #"Joelib.count.HBA2", #no description available
114
+ #"Joelib.count.HBA1", #no description available
115
+ #"Joelib.count.NumberOfBonds", #no description available
116
+ "Joelib.count.SOGroups", #no description available
117
+ "Joelib.TopologicalDiameter", #no description available
118
+ "Joelib.count.NumberOfHal", #no description available
119
+
120
+ ].sort