lazar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
data/lib/overwrite.rb ADDED
@@ -0,0 +1,119 @@
1
+ require "base64"
2
class Object
  # An object is blank if it's false, empty, or a whitespace-only string.
  # For example, "", " ", +nil+, [], and {} are all blank.
  #
  # @return [Boolean] true when the receiver is blank
  def blank?
    # A whitespace-only string is not #empty?, so it needs its own check to
    # honour the contract documented above.
    return strip.empty? if is_a?(String)

    respond_to?(:empty?) ? empty? : !self
  end

  # Tells whether the receiver is accepted by Kernel#Float.
  #
  # @return [Boolean] true if Float(self) succeeds, false if it raises
  def numeric?
    Float(self)
    true
  rescue StandardError
    false
  end
end
13
+
14
class Numeric
  # Expresses the receiver as a percentage of +n+.
  #
  # @param [Numeric] n value that corresponds to 100 percent
  # @return [Float] receiver divided by +n+, times 100
  def percent_of(n)
    ratio = to_f / n.to_f
    ratio * 100.0
  end
end
19
+
20
module Enumerable
  # @return [Array] the elements that occur more than once, each listed a
  #   single time, in order of first appearance
  def duplicates
    counts = each_with_object(Hash.new(0)) { |element, seen| seen[element] += 1 }
    counts.select { |_element, count| count > 1 }.keys
  end

  # Most frequent element; ties go to the element that appears first.
  # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array
  #
  # @return [Object, nil] the most common element, or nil when there are no elements
  def mode
    largest_group = group_by { |element| element }.values.max_by(&:size)
    # max_by yields nil for an empty collection; return nil instead of
    # failing with NoMethodError on nil.first.
    largest_group && largest_group.first
  end
end
34
+
35
class String
  # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
  def underscore
    gsub(/::/, '/')
      .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
      .gsub(/([a-z\d])([A-Z])/, '\1_\2')
      .tr("-", "_")
      .downcase
  end

  # Convert strings to boolean values.
  # Accepts (case-insensitively, ignoring surrounding whitespace)
  # true/t/yes/y/1 and false/f/no/n/0. Any other content is passed to
  # bad_request_error.
  #
  # @return [TrueClass,FalseClass] true or false
  def to_boolean
    # The patterns are anchored on both ends so that words which merely end
    # in one of the tokens (e.g. "not", "sunny", "10") are rejected.
    case strip
    when /\A(true|t|yes|y|1)\z/i then true
    when /\A(false|f|no|n|0)\z/i then false
    else bad_request_error "invalid value for Boolean: \"#{self}\""
    end
  end
end
54
+
55
class File
  # Runs the external `file` command on this file's path.
  #
  # @return [String] mime_type including charset using linux cmd command
  def mime_type
    # The argument-array form bypasses the shell, so quotes or other shell
    # metacharacters in the path cannot break or alter the command; "--"
    # keeps a path starting with "-" from being read as an option.
    IO.popen(['file', '-ib', '--', path], &:read).chomp
  end
end
61
+
62
class Array

  # Adds up the sizes of all elements (typically an array of arrays).
  # An element that does not respond to +size+ is reported through
  # internal_server_error.
  # @return [Integer] Sum of size of array elements
  def sum_size
    reduce(0) do |running_total, element|
      next running_total + element.size if element.respond_to?('size')

      internal_server_error "No size available: #{element.inspect}"
    end
  end

  # For symbolic features
  # @return [Boolean] Whether the array has just one unique value.
  def zero_variance?
    distinct_values = uniq
    distinct_values.length == 1
  end

end
85
+
86
module URI

  # @param [String] uri
  # @return [Boolean] true if the URI parses to exactly URI::HTTPS
  def self.ssl?(uri)
    URI.parse(uri).instance_of? URI::HTTPS
  end

  # Issues a HEAD request; the subject id of OpenTox::RestClientWrapper, if
  # set, is passed as a query parameter. Any error raised on the way is
  # treated as "not accessible".
  #
  # @param [String] uri
  # @return [Boolean] checks if resource exists by making a HEAD-request
  def self.accessible?(uri)
    subjectid = OpenTox::RestClientWrapper.subjectid
    target = uri
    if subjectid
      # Append with "&" when the URI already carries a query string.
      separator = uri.include?("?") ? "&" : "?"
      target = "#{uri}#{separator}subjectid=#{CGI.escape subjectid}"
    end
    parsed_uri = URI.parse(target)
    # Task URIs accept any status below 600, all other URIs below 400.
    http_code = URI.task?(uri) ? 600 : 400
    http = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    if URI.ssl?(uri)
      http.use_ssl = true
      # NOTE(review): certificate verification is switched off here, as in
      # the original code; consider enabling it.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    request = Net::HTTP::Head.new(parsed_uri.request_uri)
    http.request(request).code.to_i < http_code
  rescue
    false
  end

  # @param [String] uri
  # @return [Boolean] true if the URI parses and has both a scheme and a host
  def self.valid?(uri)
    u = URI.parse(uri)
    !u.scheme.nil? && !u.host.nil?
  rescue URI::InvalidURIError
    false
  end

end
data/lib/regression.rb ADDED
@@ -0,0 +1,199 @@
1
+ # TODO install R packages kernlab, caret, doMC, class, e1071
2
+
3
+
4
+ # log transform activities (create new dataset)
5
+ # scale, normalize features, might not be necessary
6
+ # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
7
+ # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
8
+ # zero-order correlation and the semi-partial correlation
9
+ # seems to be necessary for svm
10
+ # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
11
+ # http://stackoverflow.com/questions/15436367/svm-scaling-input-values
12
+ # use lasso or elastic net??
13
+ # select relevant features
14
+ # remove features with a single value
15
+ # remove correlated features
16
+ # remove features not correlated with endpoint
17
+ module OpenTox
18
+ module Algorithm
19
+
20
+ class Regression
21
+
22
# Similarity-weighted average of the log10 activities of all neighbors,
# transformed back to the original scale.
#
# @param [Array] neighbors rows of the form [neighbor, similarity, activities]
# @return [Hash] :value is the prediction (nil when the similarities sum to
#   zero), :confidence is the similarity sum divided by the number of
#   neighbors (0.0 when there are no neighbors)
def self.weighted_average neighbors
  weighted_sum = 0.0
  sim_sum = 0.0
  neighbors.each do |_neighbor, sim, acts|
    acts.each do |act|
      weighted_sum += sim * Math.log10(act)
      sim_sum += sim
    end
  end
  # Without neighbors the quotient would be 0.0/0.0, i.e. NaN.
  confidence = neighbors.empty? ? 0.0 : sim_sum / neighbors.size.to_f
  prediction = sim_sum == 0 ? nil : 10**(weighted_sum / sim_sum)
  { :value => prediction, :confidence => confidence }
end
36
+
37
# Local support vector regression from neighbors.
# The fourth column of every row is collected as properties; the first row
# is excluded when activities and similarities are collected.
#
# @param [Array] neighbors rows of the form [neighbor, similarity, activity, properties]
# @param [Hash] params Key `:min_train_performance` is passed on to local_svm_prop
# @return [Array] pair of [prediction, confidence]; confidence is nil when
#   the prediction is nil
def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
  $logger.debug "Local SVM."
  props = neighbors.collect { |row| row[3] }
  # Work on a copy without the first row instead of shifting the caller's array.
  training = neighbors.drop(1)
  activities = training.collect { |n| n[2] }
  prediction = self.local_svm_prop(props, activities, params[:min_train_performance])
  # Only values that know about infinity can be infinite (a prediction may
  # also be an activity value of another type).
  prediction = nil if prediction.respond_to?(:infinite?) && prediction.infinite?
  $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."

  confidence = 0.0
  if prediction
    confidence = get_confidence({:sims => training.collect { |n| n[1] }, :activities => activities})
  elsif prediction.nil?
    confidence = nil
  end
  [prediction, confidence]
end
60
+
61
+
62
+ # Local support vector prediction from neighbors.
63
+ # Uses propositionalized setting.
64
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
65
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
66
+ # @param [Array] activities, activities for neighbors.
67
+ # @param [Float] min_train_performance, parameter to control censoring
68
+ # @return [Object, nil] activities[0] when all activities are identical; otherwise whatever the R step assigns to prediction (nil if it assigns nothing or raises).
69
+ def self.local_svm_prop(props, activities, min_train_performance)
70
+
71
+ $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
72
+ n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
73
+ q_prop = props[0] # is an Array.
74
+
75
+ prediction = nil
76
+ if activities.uniq.size == 1
77
+ prediction = activities[0]
78
+ else
79
+ t = Time.now
80
+ #$logger.debug gram_matrix.to_yaml
81
+ #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
82
# NOTE(review): the connection is created before the begin/rescue block below, so an error raised here reaches the caller.
+ @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests
83
# rs collects the lines of an R script that is written to disk and executed further down.
+ rs = []
84
+ ["caret", "doMC", "class"].each do |lib|
85
+ #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))"
86
+ rs << "suppressPackageStartupMessages(library('#{lib}'))"
87
+ end
88
+ #@r.eval "registerDoMC()" # switch on parallel processing
89
+ rs << "registerDoMC()" # switch on parallel processing
90
+ #@r.eval "set.seed(1)"
91
+ rs << "set.seed(1)"
92
+ $logger.debug "Loading R packages: #{Time.now-t}"
93
+ t = Time.now
94
# NOTE(review): debug print to stdout.
+ p n_prop
95
+ begin
96
+
97
+ # set data
98
# NOTE(review): the same n_prop assignment is appended twice; the second one is redundant.
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
99
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
100
+ rs << "n_prop_x_size <- c(#{n_prop.size})"
101
+ rs << "n_prop_y_size <- c(#{n_prop[0].size})"
102
+ rs << "y <- c(#{activities.join(',')})"
103
+ rs << "q_prop <- c(#{q_prop.join(',')})"
104
+ rs << "y = matrix(y)"
105
+ rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
106
+ rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
107
+
108
+ $logger.debug "Setting R data: #{Time.now-t}"
109
+ t = Time.now
110
+ # prepare data
111
+ rs << "
112
+ weights=NULL
113
+ if (!(class(y) == 'numeric')) {
114
+ y = factor(y)
115
+ weights=unlist(as.list(prop.table(table(y))))
116
+ weights=(weights-1)^2
117
+ }
118
+ "
119
+
120
+ rs << "
121
+ rem = nearZeroVar(prop_matrix)
122
+ if (length(rem) > 0) {
123
+ prop_matrix = prop_matrix[,-rem,drop=F]
124
+ q_prop = q_prop[,-rem,drop=F]
125
+ }
126
+ rem = findCorrelation(cor(prop_matrix))
127
+ if (length(rem) > 0) {
128
+ prop_matrix = prop_matrix[,-rem,drop=F]
129
+ q_prop = q_prop[,-rem,drop=F]
130
+ }
131
+ "
132
+
133
+ #p @r.eval("y").to_ruby
134
+ #p "weights"
135
+ #p @r.eval("weights").to_ruby
136
+ $logger.debug "Preparing R data: #{Time.now-t}"
137
+ t = Time.now
138
+ # model + support vectors
139
+ #train_success = @r.eval <<-EOR
140
+ rs << '
141
+ model = train(prop_matrix,y,
142
+ method="svmRadial",
143
+ preProcess=c("center", "scale"),
144
+ class.weights=weights,
145
+ trControl=trainControl(method="LGOCV",number=10),
146
+ tuneLength=8
147
+ )
148
+ perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
149
+ '
150
# The collected script is written to the fixed path /tmp/r.r and run with Rscript; the output of that run is only printed, not evaluated.
+ File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
151
+ p rs.join("\n")
152
+ p `Rscript /tmp/r.r`
153
+ =begin
154
+ @r.void_eval <<-EOR
155
+ model = train(prop_matrix,y,
156
+ method="svmRadial",
157
+ #preProcess=c("center", "scale"),
158
+ #class.weights=weights,
159
+ #trControl=trainControl(method="LGOCV",number=10),
160
+ #tuneLength=8
161
+ )
162
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
163
+ EOR
164
+ =end
165
+
166
+ $logger.debug "Creating R SVM model: #{Time.now-t}"
167
+ t = Time.now
168
# NOTE(review): train_success is never assigned in this method (its assignment above is commented out). Unless a method of that name exists elsewhere, this line raises NameError, which the rescue below only logs, so prediction stays nil.
# NOTE(review): the script above runs in a separate Rscript process, while the calls below use the @r connection - confirm that `model` and `perf` can exist there.
+ if train_success
169
+ # prediction
170
+ @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
171
+ #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
172
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
173
+ prediction = @r.p
174
+
175
+ # censoring
176
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
177
+ prediction = nil if prediction =~ /NA/
178
+ $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
179
+ else
180
+ $logger.debug "Model creation failed."
181
+ prediction = nil
182
+ end
183
+ $logger.debug "R Prediction: #{Time.now-t}"
184
# NOTE(review): rescuing Exception also catches errors outside StandardError; whatever is caught is only logged at debug level.
+ rescue Exception => e
185
+ $logger.debug "#{e.class}: #{e.message}"
186
+ $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
187
+ ensure
188
+ #puts @r.inspect
189
+ #TODO: broken pipe
190
+ #@r.quit # free R
191
+ end
192
+ end
193
+ prediction
194
+ end
195
+ end
196
+
197
+ end
198
+ end
199
+
@@ -0,0 +1,98 @@
1
module OpenTox

  # Class-level REST helpers (head, get, post, put, delete) built on
  # RestClient. Error responses are translated into calls of the matching
  # global error method (see known_errors).
  class RestClientWrapper

    attr_accessor :request, :response

    # Subject id that is added to the headers of requests lacking their own.
    @@subjectid = nil

    def self.subjectid=(subjectid)
      @@subjectid = subjectid
    end

    def self.subjectid
      @@subjectid
    end

    # REST methods
    # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502)
    # Does not wait for task to finish and returns task uri
    # @param [String] destination URI
    # @param [optional,Hash|String] Payload data posted to the service
    # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl
    # @return [RestClient::Response] REST call response
    [:head,:get,:post,:put,:delete].each do |method|

      define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|

        # check input
        bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers.nil? || headers.is_a?(Hash)
        # nil headers are allowed by the check above; work on a copy so the
        # caller's hash is neither required nor modified.
        headers = headers ? headers.dup : {}
        headers[:subjectid] ||= @@subjectid
        bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri
        #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri)

        # make sure that no header parameters are set in the payload;
        # requests to the host of $aa[:uri] are exempt.
        # (`defined? $aa || ...` applied defined? to the whole expression and
        # was therefore always true, which disabled this check.)
        aa_request = defined?($aa) && $aa && URI(uri).host == URI($aa[:uri]).host
        unless aa_request
          [:accept,:content_type,:subjectid].each do |header|
            bad_request_error "#{header} should be submitted in the headers", uri if payload.is_a?(Hash) && payload[header]
          end
        end

        # create request
        args = {
          :method => method,
          :url => uri,
          :timeout => 1800,
          :payload => payload
        }
        # No (or an empty) :verify_ssl header sets verify_ssl to 0
        # (presumably OpenSSL::SSL::VERIFY_NONE - TODO confirm). to_s keeps
        # non-string values such as true/false from raising on #empty?.
        args[:verify_ssl] = 0 if headers[:verify_ssl].to_s.empty?
        headers.delete_if { |_name, value| value.nil? } # remove keys with empty values, as this can cause problems
        args[:headers] = headers

        $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post"

        @request = RestClient::Request.new(args)
        # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader)
        @response = @request.execute do |response, request, result|
          if [301, 302, 307].include?(response.code) && request.method == :get
            response.follow_redirection(request, result)
          elsif response.code >= 400 && !URI.task?(uri)
            #TODO add parameters to error-report
            error = known_errors.find { |candidate| candidate[:code] == response.code }
            begin # errors are returned as error reports in json, try to parse
              # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
              content = JSON.parse(response)
              msg = content["message"].to_s
              cause = content["errorCause"].to_s
              raise "empty error report" if msg.empty? && cause.empty? # parsing failed
            rescue # parsing error failed, use complete content as message
              msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
              cause = nil
            end
            # A status code without an entry in known_errors has no error
            # method to dispatch to; fail with the message instead of a
            # NoMethodError on nil.
            raise "#{msg} (HTTP status #{response.code})" unless error
            Object.method(error[:method]).call msg, uri, cause # call error method
          else
            response
          end
        end
      end
    end

    # Derives error method and class names from the descriptions in
    # RestClient::STATUSES for every status code >= 400.
    #@return [Array] of hashes with error code, method and class
    def self.known_errors
      RestClient::STATUSES.select { |code, _description| code >= 400 }.map do |code, description|
        method = description.underscore.gsub(/ |'/,'_')
        method += "_error" unless method.match(/_error$/)
        klass = method.split("_").collect { |part| part.capitalize }.join("")
        {:code => code, :method => method.to_sym, :class => klass}
      end
    end

  end
end
data/lib/similarity.rb ADDED
@@ -0,0 +1,58 @@
1
+ =begin
2
+ * Name: similarity.rb
3
+ * Description: Similarity algorithms
4
+ * Author: Andreas Maunz <andreas@maunz.de>
5
+ * Date: 10/2012
6
+ =end
7
+
8
module OpenTox
  module Algorithm

    # Similarity measures between fingerprint vectors.
    class Similarity

      #TODO weighted tanimoto

      # Tanimoto similarity: sum of the element-wise minima divided by the
      # sum of the element-wise maxima.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Tanimoto similarity
      def self.tanimoto(a,b)
        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
        # TODO check if calculation speed can be improved
        shared = 0.0
        total = 0.0
        a.size.times do |position|
          pair = [ a[position], b[position] ]
          shared += pair.min
          total += pair.max
        end
        shared / total
      end


      # Cosine similarity. When both inputs have more than 12 elements only
      # the first 12 of each are used; if either input is empty the result is 0.0.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
      def self.cosine(a, b)
        return 0.0 unless a.size > 0 && b.size > 0

        if a.size > 12 && b.size > 12
          a = a[0..11]
          b = b[0..11]
        end
        first_vector = a.to_gv
        second_vector = b.to_gv
        first_vector.dot(second_vector) / (first_vector.norm * second_vector.norm)
      end

    end

  end
end
@@ -0,0 +1,120 @@
1
+ # set of non redundant descriptors, faster algorithms are preferred
2
+ # TODO:
3
+ # select logP algorithm
4
+ # select l5 algorithm
5
+ # use smarts matcher for atom counts
6
+ # check correlations
7
# Sorted list of descriptor names. Entries that are commented out are kept
# for reference but are not part of the list.
UNIQUEDESCRIPTORS = [
  # --- Openbabel ---
  'Openbabel.abonds', # Number of aromatic bonds
  'Openbabel.atoms',  # Number of atoms
  'Openbabel.bonds',  # Number of bonds
  'Openbabel.dbonds', # Number of double bonds
  'Openbabel.HBA1',   # Number of Hydrogen Bond Acceptors 1 (JoelLib)
  'Openbabel.HBA2',   # Number of Hydrogen Bond Acceptors 2 (JoelLib)
  'Openbabel.HBD',    # Number of Hydrogen Bond Donors (JoelLib)
  'Openbabel.L5',     # Lipinski Rule of Five
  'Openbabel.logP',   # octanol/water partition coefficient
  'Openbabel.MP',     # Melting point
  'Openbabel.MR',     # molar refractivity
  'Openbabel.MW',     # Molecular Weight filter
  'Openbabel.nF',     # Number of Fluorine Atoms
  'Openbabel.sbonds', # Number of single bonds
  'Openbabel.tbonds', # Number of triple bonds
  'Openbabel.TPSA',   # topological polar surface area

  # --- Cdk ---
  'Cdk.ALOGP', # Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen
  'Cdk.APol', # Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
  'Cdk.AcidicGroupCount', # Returns the number of acidic groups.
  'Cdk.AminoAcidCount', # Returns the number of amino acids found in the system
  # 'Cdk.AromaticAtomsCount', # Descriptor based on the number of aromatic atoms of a molecule.
  # 'Cdk.AromaticBondsCount', # Descriptor based on the number of aromatic bonds of a molecule.
  # 'Cdk.AtomCount', # Descriptor based on the number of atoms of a certain element type.
  'Cdk.AutocorrelationCharge', # The Moreau-Broto autocorrelation descriptors using partial charges
  'Cdk.AutocorrelationMass', # The Moreau-Broto autocorrelation descriptors using atomic weight
  'Cdk.AutocorrelationPolarizability', # The Moreau-Broto autocorrelation descriptors using polarizability
  'Cdk.BCUT', # Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al.
  'Cdk.BPol', # Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
  'Cdk.BasicGroupCount', # Returns the number of basic groups.
  # 'Cdk.BondCount', # Descriptor based on the number of bonds of a certain bond order.
  'Cdk.CPSA', # A variety of descriptors combining surface area and partial charge information
  'Cdk.CarbonTypes', # Characterizes the carbon connectivity in terms of hybridization
  'Cdk.ChiChain', # Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
  'Cdk.ChiCluster', # Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
  'Cdk.ChiPathCluster', # Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
  'Cdk.ChiPath', # Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
  'Cdk.EccentricConnectivityIndex', # A topological descriptor combining distance and adjacency information.
  'Cdk.FMF', # Descriptor characterizing molecular complexity in terms of its Murcko framework
  'Cdk.FragmentComplexity', # Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06}
  'Cdk.GravitationalIndex', # Descriptor characterizing the mass distribution of the molecule.
  # 'Cdk.HBondAcceptorCount', # Descriptor that calculates the number of hydrogen bond acceptors.
  # 'Cdk.HBondDonorCount', # Descriptor that calculates the number of hydrogen bond donors.
  'Cdk.HybridizationRatio', # Characterizes molecular complexity in terms of carbon hybridization states.
  'Cdk.IPMolecularLearning', # Descriptor that evaluates the ionization potential.
  'Cdk.KappaShapeIndices', # Descriptor that calculates Kier and Hall kappa molecular shape indices.
  'Cdk.KierHallSmarts', # Counts the number of occurrences of the E-state fragments
  'Cdk.LargestChain', # Returns the number of atoms in the largest chain
  'Cdk.LargestPiSystem', # Returns the number of atoms in the largest pi chain
  'Cdk.LengthOverBreadth', # Calculates the ratio of length to breadth.
  'Cdk.LongestAliphaticChain', # Returns the number of atoms in the longest aliphatic chain
  'Cdk.MDE', # Evaluate molecular distance edge descriptors for C, N and O
  'Cdk.MannholdLogP', # Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms.
  'Cdk.MomentOfInertia', # Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Also calculates the radius of gyration.
  'Cdk.PetitjeanNumber', # Descriptor that calculates the Petitjean Number of a molecule.
  'Cdk.PetitjeanShapeIndex', # The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
  'Cdk.RotatableBondsCount', # Descriptor that calculates the number of nonrotatable bonds on a molecule.
  # 'Cdk.RuleOfFive', # This Class contains a method that returns the number failures of the Lipinski's Rule Of Five.
  # 'Cdk.TPSA', # Calculation of topological polar surface area based on fragment contributions.
  'Cdk.VABC', # Describes the volume of a molecule.
  'Cdk.VAdjMa', # Descriptor that calculates the vertex adjacency information of a molecule.
  'Cdk.WHIM', # Holistic descriptors described by Todeschini et al.
  # 'Cdk.Weight', # Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
  'Cdk.WeightedPath', # The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
  'Cdk.WienerNumbers', # This class calculates Wiener path number and Wiener polarity number.
  'Cdk.XLogP', # Prediction of logP based on the atom-type method called XLogP.
  'Cdk.ZagrebIndex', # The sum of the squared atom degrees of all heavy atoms.

  # --- Joelib (no descriptions available) ---
  'Joelib.count.NumberOfS',
  'Joelib.count.NumberOfP',
  'Joelib.count.NumberOfO',
  'Joelib.count.NumberOfN',
  # 'Joelib.count.AromaticBonds',
  'Joelib.count.NumberOfI',
  'Joelib.count.NumberOfF',
  'Joelib.count.NumberOfC',
  'Joelib.count.NumberOfB',
  'Joelib.count.HydrophobicGroups',
  # 'Joelib.KierShape3',
  # 'Joelib.KierShape2',
  # 'Joelib.KierShape1',
  # 'Joelib.count.AcidicGroups',
  'Joelib.count.AliphaticOHGroups',
  # 'Joelib.count.NumberOfAtoms',
  'Joelib.TopologicalRadius',
  'Joelib.GeometricalShapeCoefficient',
  # 'Joelib.MolecularWeight',
  'Joelib.FractionRotatableBonds',
  # 'Joelib.count.HBD2',
  # 'Joelib.count.HBD1',
  'Joelib.LogP',
  'Joelib.GraphShapeCoefficient',
  'Joelib.count.BasicGroups',
  # 'Joelib.count.RotatableBonds',
  'Joelib.count.HeavyBonds',
  'Joelib.PolarSurfaceArea',
  # 'Joelib.ZagrebIndex1',
  'Joelib.GeometricalRadius',
  'Joelib.count.SO2Groups',
  'Joelib.count.AromaticOHGroups',
  'Joelib.GeometricalDiameter',
  # 'Joelib.MolarRefractivity',
  'Joelib.count.NumberOfCl',
  'Joelib.count.OSOGroups',
  'Joelib.count.NumberOfBr',
  'Joelib.count.NO2Groups',
  'Joelib.count.HeteroCycles',
  # 'Joelib.count.HBA2',
  # 'Joelib.count.HBA1',
  # 'Joelib.count.NumberOfBonds',
  'Joelib.count.SOGroups',
  'Joelib.TopologicalDiameter',
  'Joelib.count.NumberOfHal'
].sort