lazar 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
@@ -0,0 +1,205 @@
1
+ module OpenTox
2
+
3
class LeaveOneOutValidation

  # NOTE(review): `field` implies Mongoid::Document is mixed in through the
  # OpenTox class machinery — confirm before relying on persistence here.
  field :model_id, type: BSON::ObjectId
  field :dataset_id, type: BSON::ObjectId
  field :nr_instances, type: Integer
  field :nr_unpredicted, type: Integer
  field :predictions, type: Array
  field :finished_at, type: Time

  # Run a leave-one-out validation for a lazar model.
  # @param [OpenTox::Model::Lazar] model
  # @return [ClassificationLeaveOneOutValidation, RegressionLeaveOneOutValidation] saved validation
  def self.create model
    # choose the validation subclass from the type of the prediction feature
    klass = model.training_dataset.features.first.nominal? ? ClassificationLeaveOneOutValidation : RegressionLeaveOneOutValidation
    loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
    compound_ids = model.training_dataset.compound_ids
    predictions = model.predict model.training_dataset.compounds
    # tag each prediction with its compound (predictions come back in dataset order);
    # each_with_index mutates the hashes in place, so no reassignment is needed
    predictions.each_with_index { |p, i| p[:compound_id] = compound_ids[i] }
    # keep only predictions that have measured (database) activities
    predictions.select! { |p| p[:database_activities] and !p[:database_activities].empty? }
    loo.nr_instances = predictions.size
    predictions.select! { |p| p[:value] } # remove unpredicted
    loo.predictions = predictions
    loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
    loo.statistics
    loo.save
    loo
  end

  # @return [OpenTox::Model::Lazar] the validated model
  def model
    Model::Lazar.find model_id
  end
end
32
+
33
class ClassificationLeaveOneOutValidation < LeaveOneOutValidation

  field :accept_values, type: Array
  field :confusion_matrix, type: Array, default: []
  field :weighted_confusion_matrix, type: Array, default: []
  field :accuracy, type: Float
  field :weighted_accuracy, type: Float
  field :true_rate, type: Hash, default: {}
  field :predictivity, type: Hash, default: {}
  field :confidence_plot_id, type: BSON::ObjectId

  # Compute confusion matrices, accuracy, true rates and predictivity from
  # the stored predictions and persist them on this document.
  # NOTE: the accuracy/matrix bookkeeping below assumes a BINARY endpoint —
  # only accept_values[0] and accept_values[1] are counted.
  def statistics
    accept_values = Feature.find(model.prediction_feature_id).accept_values
    # BUGFIX: was Array.new(size, 0) { ... } — passing a default value AND a
    # block triggers Ruby's "block supersedes default value argument" warning;
    # the block alone is the intended (and correct) form.
    confusion_matrix = Array.new(accept_values.size) { Array.new(accept_values.size, 0) }
    weighted_confusion_matrix = Array.new(accept_values.size) { Array.new(accept_values.size, 0) }
    predictions.each do |pred|
      pred[:database_activities].each do |db_act|
        if pred[:value]
          if pred[:value] == db_act
            # correct prediction: diagonal cell for the predicted class
            if pred[:value] == accept_values[0]
              confusion_matrix[0][0] += 1
              weighted_confusion_matrix[0][0] += pred[:confidence]
            elsif pred[:value] == accept_values[1]
              confusion_matrix[1][1] += 1
              weighted_confusion_matrix[1][1] += pred[:confidence]
            end
          else
            # misclassification: off-diagonal cell for the predicted class
            if pred[:value] == accept_values[0]
              confusion_matrix[0][1] += 1
              weighted_confusion_matrix[0][1] += pred[:confidence]
            elsif pred[:value] == accept_values[1]
              confusion_matrix[1][0] += 1
              weighted_confusion_matrix[1][0] += pred[:confidence]
            end
          end
        end
      end
    end
    accept_values.each_with_index do |v,i|
      # NOTE(review): yields NaN when a class has no instances (0/0.0) — confirm acceptable
      true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
      predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
    end
    # total confidence mass, used to normalize the weighted accuracy
    confidence_sum = 0
    weighted_confusion_matrix.each do |r|
      r.each do |c|
        confidence_sum += c
      end
    end
    update_attributes(
      accept_values: accept_values,
      confusion_matrix: confusion_matrix,
      weighted_confusion_matrix: weighted_confusion_matrix,
      accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
      weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
      true_rate: true_rate,
      predictivity: predictivity,
      finished_at: Time.now
    )
    $logger.debug "Accuracy #{accuracy}"
  end

  # Accumulated-accuracy vs. confidence plot (SVG), generated once via R/ggplot2
  # and cached in GridFS.
  # @return [String] SVG plot data
  def confidence_plot
    unless confidence_plot_id
      tmpfile = "/tmp/#{id.to_s}_confidence.svg"
      accuracies = []
      confidences = []
      correct_predictions = 0
      incorrect_predictions = 0
      predictions.each do |p|
        p[:database_activities].each do |db_act|
          if p[:value]
            p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
            # running accuracy after each prediction, paired with its confidence
            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
            confidences << p[:confidence]
          end
        end
      end
      R.assign "accuracy", accuracies
      R.assign "confidence", confidences
      R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:confidence_plot_id => plot_id)
    end
    $gridfs.find_one(_id: confidence_plot_id).data
  end
end
122
+
123
+
124
class RegressionLeaveOneOutValidation < LeaveOneOutValidation

  field :rmse, type: Float, default: 0.0
  field :mae, type: Float, default: 0
  #field :weighted_rmse, type: Float, default: 0
  #field :weighted_mae, type: Float, default: 0
  field :r_squared, type: Float
  field :correlation_plot_id, type: BSON::ObjectId
  field :confidence_plot_id, type: BSON::ObjectId

  # Compute RMSE and MAE on log10-transformed activities and R^2 via R's
  # cor() on -log-transformed values, then persist the statistics.
  # NOTE(review): errors use log10 while the correlation uses natural log —
  # confirm this mix is intentional.
  def statistics
    confidence_sum = 0
    predicted_values = []
    measured_values = []
    predictions.each do |pred|
      pred[:database_activities].each do |activity|
        if pred[:value]
          predicted_values << pred[:value]
          measured_values << activity
          error = Math.log10(pred[:value])-Math.log10(activity)
          self.rmse += error**2
          #self.weighted_rmse += pred[:confidence]*error**2
          self.mae += error.abs
          #self.weighted_mae += pred[:confidence]*error.abs
          #confidence_sum += pred[:confidence]
        end
      end
      if pred[:database_activities].empty?
        # BUGFIX: was bare `compound_id`, which is undefined in this scope
        # (NameError at runtime); the id is stored on the prediction hash
        # by LeaveOneOutValidation.create.
        message = "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
        warnings << message
        $logger.debug message
      end
    end
    R.assign "measurement", measured_values
    R.assign "prediction", predicted_values
    R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
    r = R.eval("r").to_ruby

    self.mae = self.mae/predictions.size
    #self.weighted_mae = self.weighted_mae/confidence_sum
    self.rmse = Math.sqrt(self.rmse/predictions.size)
    #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
    self.r_squared = r**2
    self.finished_at = Time.now
    save
    $logger.debug "R^2 #{r**2}"
    $logger.debug "RMSE #{rmse}"
    $logger.debug "MAE #{mae}"
  end

  # Measured-vs-predicted scatter plot (SVG) with identity line, generated
  # once via R/ggplot2 and cached in GridFS.
  # @return [String] SVG plot data
  def correlation_plot
    unless correlation_plot_id
      tmpfile = "/tmp/#{id.to_s}_correlation.svg"
      predicted_values = []
      measured_values = []
      predictions.each do |pred|
        pred[:database_activities].each do |activity|
          if pred[:value]
            predicted_values << pred[:value]
            measured_values << activity
          end
        end
      end
      # NOTE(review): `attributes` is computed but never used below — looks
      # like a leftover from an earlier plot title; confirm and remove.
      attributes = Model::Lazar.find(self.model_id).attributes
      attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
      attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
      R.assign "measurement", measured_values
      R.assign "prediction", predicted_values
      R.eval "all = c(-log(measurement),-log(prediction))"
      R.eval "range = c(min(all), max(all))"
      # NOTE(review): `name` is not a declared field on this class — confirm
      # it is inherited, otherwise this raises NoMethodError.
      R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
      R.eval "image = image + geom_abline(intercept=0, slope=1)"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:correlation_plot_id => plot_id)
    end
    $gridfs.find_one(_id: correlation_plot_id).data
  end
end
204
+
205
+ end
data/lib/model.rb CHANGED
@@ -2,51 +2,79 @@ module OpenTox
2
2
 
3
3
  module Model
4
4
 
5
- class Lazar
5
+ class Model
6
6
  include OpenTox
7
7
  include Mongoid::Document
8
8
  include Mongoid::Timestamps
9
9
  store_in collection: "models"
10
10
 
11
- field :title, as: :name, type: String
11
+ field :name, type: String
12
12
  field :creator, type: String, default: __FILE__
13
13
  # datasets
14
14
  field :training_dataset_id, type: BSON::ObjectId
15
15
  # algorithms
16
16
  field :prediction_algorithm, type: String
17
- field :neighbor_algorithm, type: String
18
- field :neighbor_algorithm_parameters, type: Hash
19
17
  # prediction feature
20
18
  field :prediction_feature_id, type: BSON::ObjectId
21
19
 
22
- attr_accessor :prediction_dataset
23
- attr_accessor :training_dataset
20
+ def training_dataset
21
+ Dataset.find(training_dataset_id)
22
+ end
23
+ end
24
+
25
+ class Lazar < Model
26
+
27
+ # algorithms
28
+ field :neighbor_algorithm, type: String
29
+ field :neighbor_algorithm_parameters, type: Hash, default: {}
24
30
 
25
31
  # Create a lazar model from a training_dataset and a feature_dataset
26
32
  # @param [OpenTox::Dataset] training_dataset
27
33
  # @return [OpenTox::Model::Lazar] Regression or classification model
28
- def self.create training_dataset
34
+ def initialize training_dataset, params={}
29
35
 
30
- bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
36
+ super params
31
37
 
32
38
  # TODO document convention
33
39
  prediction_feature = training_dataset.features.first
34
- prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
35
- lazar.training_dataset_id = training_dataset.id
36
- lazar.prediction_feature_id = prediction_feature.id
37
- lazar.title = prediction_feature.title
40
+ # set defaults for empty parameters
41
+ self.prediction_feature_id ||= prediction_feature.id
42
+ self.training_dataset_id ||= training_dataset.id
43
+ self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
44
+ self.neighbor_algorithm_parameters ||= {}
45
+ self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
46
+ save
47
+ self
48
+ end
38
49
 
39
- lazar.save
40
- lazar
50
+ def predict_compound compound
51
+ prediction_feature = Feature.find prediction_feature_id
52
+ neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
53
+ # remove neighbors without prediction_feature
54
+ # check for database activities (neighbors may include query compound)
55
+ database_activities = nil
56
+ prediction = {}
57
+ if neighbors.collect{|n| n["_id"]}.include? compound.id
58
+
59
+ database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
60
+ prediction[:database_activities] = database_activities
61
+ prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
62
+ neighbors.delete_if{|n| n["_id"] == compound.id}
63
+ end
64
+ neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
65
+ if neighbors.empty?
66
+ prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
67
+ else
68
+ prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
69
+ prediction[:neighbors] = neighbors
70
+ prediction[:neighbors] ||= []
71
+ end
72
+ prediction
41
73
  end
42
74
 
43
75
  def predict object
44
76
 
45
- t = Time.now
46
- at = Time.now
47
-
48
77
  training_dataset = Dataset.find training_dataset_id
49
- prediction_feature = Feature.find prediction_feature_id
50
78
 
51
79
  # parse data
52
80
  compounds = []
@@ -63,50 +91,33 @@ module OpenTox
63
91
 
64
92
  # make predictions
65
93
  predictions = []
66
- neighbors = []
67
- compounds.each_with_index do |compound,c|
68
- t = Time.new
69
- database_activities = training_dataset.values(compound,prediction_feature)
70
- if database_activities and !database_activities.empty?
71
- database_activities = database_activities.first if database_activities.size == 1
72
- predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
73
- next
74
- end
75
- neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
76
- # add activities
77
- # TODO: improve efficiency, takes 3 times longer than previous version
78
- neighbors.collect! do |n|
79
- rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
80
- acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
81
- acts.empty? ? nil : n << acts
82
- end
83
- neighbors.compact! # remove neighbors without training activities
84
- predictions << Algorithm.run(prediction_algorithm, neighbors)
85
- end
94
+ predictions = compounds.collect{|c| predict_compound c}
86
95
 
87
96
  # serialize result
88
97
  case object.class.to_s
89
98
  when "OpenTox::Compound"
90
99
  prediction = predictions.first
91
- prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
100
+ prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
92
101
  return prediction
93
102
  when "Array"
94
103
  return predictions
95
104
  when "OpenTox::Dataset"
96
105
  # prepare prediction dataset
106
+ measurement_feature = Feature.find prediction_feature_id
107
+
108
+ prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
97
109
  prediction_dataset = LazarPrediction.new(
98
- :title => "Lazar prediction for #{prediction_feature.title}",
110
+ :name => "Lazar prediction for #{prediction_feature.name}",
99
111
  :creator => __FILE__,
100
112
  :prediction_feature_id => prediction_feature.id
101
113
 
102
114
  )
103
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
104
- # TODO move into warnings field
105
- warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
106
- prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
115
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
116
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
117
+ prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
107
118
  prediction_dataset.compounds = compounds
108
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
109
- prediction_dataset.save_all
119
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
120
+ prediction_dataset.save
110
121
  return prediction_dataset
111
122
  end
112
123
 
@@ -120,26 +131,19 @@ module OpenTox
120
131
  end
121
132
 
122
133
  class LazarClassification < Lazar
123
- def initialize
124
- super
125
- self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
126
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
127
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
128
- end
129
- end
130
-
131
- class LazarFminerClassification < LazarClassification
132
-
133
- def self.create training_dataset
134
- model = super(training_dataset)
135
- model.update "_type" => self.to_s # adjust class
136
- model = self.find model.id # adjust class
137
- model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
138
- model.neighbor_algorithm_parameters = {
139
- :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
140
- :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
141
- :min_sim => 0.3
142
- }
134
+
135
+ def self.create training_dataset, params={}
136
+ model = self.new training_dataset, params
137
+ model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
138
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
139
+ model.neighbor_algorithm_parameters ||= {}
140
+ {
141
+ :type => "MP2D",
142
+ :training_dataset_id => training_dataset.id,
143
+ :min_sim => 0.1
144
+ }.each do |key,value|
145
+ model.neighbor_algorithm_parameters[key] ||= value
146
+ end
143
147
  model.save
144
148
  model
145
149
  end
@@ -147,20 +151,27 @@ module OpenTox
147
151
 
148
152
  class LazarRegression < Lazar
149
153
 
150
- def initialize
151
- super
152
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
153
- self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
154
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
154
+ def self.create training_dataset, params={}
155
+ model = self.new training_dataset, params
156
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
157
+ model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
158
+ model.neighbor_algorithm_parameters ||= {}
159
+ {
160
+ :type => "MP2D",
161
+ :training_dataset_id => training_dataset.id,
162
+ :min_sim => 0.1
163
+ }.each do |key,value|
164
+ model.neighbor_algorithm_parameters[key] ||= value
165
+ end
166
+ model.save
167
+ model
155
168
  end
156
-
157
169
  end
158
170
 
159
- class PredictionModel
171
+ class Prediction
160
172
  include OpenTox
161
173
  include Mongoid::Document
162
174
  include Mongoid::Timestamps
163
- store_in collection: "models"
164
175
 
165
176
  # TODO field Validations
166
177
  field :endpoint, type: String
@@ -168,10 +179,54 @@ module OpenTox
168
179
  field :source, type: String
169
180
  field :unit, type: String
170
181
  field :model_id, type: BSON::ObjectId
171
- field :crossvalidation_id, type: BSON::ObjectId
182
+ field :repeated_crossvalidation_id, type: BSON::ObjectId
183
+
184
+ def predict object
185
+ Lazar.find(model_id).predict object
186
+ end
187
+
188
+ def training_dataset
189
+ model.training_dataset
190
+ end
191
+
192
+ def model
193
+ Lazar.find model_id
194
+ end
195
+
196
+ def repeated_crossvalidation
197
+ RepeatedCrossValidation.find repeated_crossvalidation_id
198
+ end
199
+
200
+ def crossvalidations
201
+ repeated_crossvalidation.crossvalidations
202
+ end
203
+
204
+ def regression?
205
+ training_dataset.features.first.numeric?
206
+ end
207
+
208
+ def classification?
209
+ training_dataset.features.first.nominal?
210
+ end
211
+
212
+ def self.from_csv_file file
213
+ metadata_file = file.sub(/csv$/,"json")
214
+ bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
215
+ prediction_model = self.new JSON.parse(File.read(metadata_file))
216
+ training_dataset = Dataset.from_csv_file file
217
+ model = nil
218
+ if training_dataset.features.first.nominal?
219
+ model = LazarClassification.create training_dataset
220
+ elsif training_dataset.features.first.numeric?
221
+ model = LazarRegression.create training_dataset
222
+ end
223
+ prediction_model[:model_id] = model.id
224
+ prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
225
+ prediction_model.save
226
+ prediction_model
227
+ end
172
228
  end
173
229
 
174
230
  end
175
231
 
176
232
  end
177
-
data/lib/opentox.rb CHANGED
@@ -12,8 +12,8 @@ module OpenTox
12
12
  include Mongoid::Document
13
13
  include Mongoid::Timestamps
14
14
  store_in collection: klass.downcase.pluralize
15
- field :title, as: :name, type: String
16
-
15
+ field :name, type: String
16
+ field :warnings, type: Array, default: []
17
17
  end
18
18
  OpenTox.const_set klass,c
19
19
  end
data/lib/overwrite.rb CHANGED
@@ -9,6 +9,11 @@ class Object
9
9
  def numeric?
10
10
  true if Float(self) rescue false
11
11
  end
12
+
13
# Returns the nesting depth of an Array: 0 for non-arrays, 1 for a flat
# array, 2 for an array of arrays, and so on (probes the first element only).
def dimension
  if instance_of?(Array)
    1 + self[0].dimension
  else
    0
  end
end
12
17
  end
13
18
 
14
19
  class Numeric
@@ -17,6 +22,14 @@ class Numeric
17
22
  end
18
23
  end
19
24
 
25
class Float
  # Round to +n+ significant digits.
  # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
  def signif(n)
    # "%.Ng" formatting keeps N significant digits; round-trip through Float
    Float(format("%.#{n}g", self))
  end
end
32
+
20
33
  module Enumerable
21
34
  # @return [Array] only the duplicates of an enumerable
22
35
  def duplicates
@@ -81,6 +94,26 @@ class Array
81
94
  return self.uniq.size == 1
82
95
  end
83
96
 
97
# Median of the (numeric) array; averages the two middle elements for
# even-sized arrays.
# @return [Float, nil] nil for an empty array (previously raised NoMethodError)
def median
  return nil if empty?
  sorted = sort
  len = sorted.length
  (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
end
102
+
103
# Arithmetic mean of the elements, as a Float (NaN for an empty array).
def mean
  reduce { |total, value| total + value }.to_f / size
end

# Unbiased (n-1) sample variance of the elements.
def sample_variance
  avg = mean
  squared_error = reduce(0) { |acc, value| acc + (value - avg)**2 }
  squared_error / (length - 1).to_f
end

# Sample standard deviation (square root of the sample variance).
def standard_deviation
  Math.sqrt(sample_variance)
end
116
+
84
117
  end
85
118
 
86
119
  module URI
@@ -116,4 +149,8 @@ module URI
116
149
  false
117
150
  end
118
151
 
152
# True(ish) when +uri+ contains "task" and is a valid URI; nil when the
# regex does not match (truthiness preserved from the original).
# NOTE(review): relies on URI.valid?, defined elsewhere in this file.
def self.task? uri
  # `&&` instead of `and`: identical result here, but conventional precedence
  # for boolean logic (avoids the low-precedence `and` trap)
  uri =~ /task/ && URI.valid?(uri)
end
155
+
119
156
  end