lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
@@ -0,0 +1,205 @@
1
+ module OpenTox
2
+
3
# Leave-one-out validation: predicts every compound of a model's training
# dataset and compares the predictions against the measured database
# activities stored with the dataset.
class LeaveOneOutValidation

  field :model_id, type: BSON::ObjectId
  field :dataset_id, type: BSON::ObjectId
  field :nr_instances, type: Integer
  field :nr_unpredicted, type: Integer
  field :predictions, type: Array
  field :finished_at, type: Time

  # Run a leave-one-out validation for a model.
  # @param model [OpenTox::Model::Lazar] model with a training dataset
  # @return [ClassificationLeaveOneOutValidation, RegressionLeaveOneOutValidation]
  def self.create model
    # choose the validation subclass from the prediction feature type
    klass = model.training_dataset.features.first.nominal? ? ClassificationLeaveOneOutValidation : RegressionLeaveOneOutValidation
    loo = klass.new(:model_id => model.id, :dataset_id => model.training_dataset_id)
    compound_ids = model.training_dataset.compound_ids
    predictions = model.predict(model.training_dataset.compounds)
    # attach the originating compound id to each prediction hash
    predictions.each_with_index { |prediction, idx| prediction[:compound_id] = compound_ids[idx] }
    # keep only predictions that have measured activities to compare against
    predictions.select! { |prediction| prediction[:database_activities] and !prediction[:database_activities].empty? }
    loo.nr_instances = predictions.size
    predictions.select! { |prediction| prediction[:value] } # remove unpredicted
    loo.predictions = predictions
    loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
    loo.statistics
    loo.save
    loo
  end

  # @return [OpenTox::Model::Lazar] the validated model
  def model
    Model::Lazar.find model_id
  end
end
32
+
33
# Leave-one-out validation statistics for classification models:
# confusion matrix, (weighted) accuracy, true rates and predictivity.
class ClassificationLeaveOneOutValidation < LeaveOneOutValidation

  field :accept_values, type: Array
  field :confusion_matrix, type: Array, default: []
  field :weighted_confusion_matrix, type: Array, default: []
  field :accuracy, type: Float
  field :weighted_accuracy, type: Float
  field :true_rate, type: Hash, default: {}
  field :predictivity, type: Hash, default: {}
  field :confidence_plot_id, type: BSON::ObjectId

  # Compute confusion matrix, accuracy, true rates and predictivity from the
  # stored predictions and persist them on this validation record.
  # NOTE(review): assumes binary classification — only accept_values[0] and
  # accept_values[1] are tallied; predictions for further classes would be
  # silently ignored. Confirm against the datasets used.
  def statistics
    accept_values = Feature.find(model.prediction_feature_id).accept_values
    # FIX: Array.new(size, 0) { block } passes both a default value and a
    # block, which triggers Ruby's "block supersedes default value argument"
    # warning; the block form alone builds the zero-filled matrix.
    confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
    predictions.each do |pred|
      pred[:database_activities].each do |db_act|
        if pred[:value]
          if pred[:value] == db_act
            # correct prediction: diagonal cell for the predicted class
            if pred[:value] == accept_values[0]
              confusion_matrix[0][0] += 1
              weighted_confusion_matrix[0][0] += pred[:confidence]
            elsif pred[:value] == accept_values[1]
              confusion_matrix[1][1] += 1
              weighted_confusion_matrix[1][1] += pred[:confidence]
            end
          else
            # misclassification: off-diagonal cell for the predicted class
            if pred[:value] == accept_values[0]
              confusion_matrix[0][1] += 1
              weighted_confusion_matrix[0][1] += pred[:confidence]
            elsif pred[:value] == accept_values[1]
              confusion_matrix[1][0] += 1
              weighted_confusion_matrix[1][0] += pred[:confidence]
            end
          end
        end
      end
    end
    accept_values.each_with_index do |v,i|
      true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
      predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
    end
    # total confidence mass, used to normalize the weighted accuracy
    confidence_sum = 0
    weighted_confusion_matrix.each do |r|
      r.each do |c|
        confidence_sum += c
      end
    end
    update_attributes(
      accept_values: accept_values,
      confusion_matrix: confusion_matrix,
      weighted_confusion_matrix: weighted_confusion_matrix,
      accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
      weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
      true_rate: true_rate,
      predictivity: predictivity,
      finished_at: Time.now
    )
    $logger.debug "Accuracy #{accuracy}"
  end

  # Plot accumulated accuracy vs. prediction confidence (descending), cache
  # the SVG in GridFS on first call and return the SVG data.
  def confidence_plot
    unless confidence_plot_id
      tmpfile = "/tmp/#{id.to_s}_confidence.svg"
      accuracies = []
      confidences = []
      correct_predictions = 0
      incorrect_predictions = 0
      predictions.each do |p|
        p[:database_activities].each do |db_act|
          if p[:value]
            p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
            confidences << p[:confidence]
          end
        end
      end
      R.assign "accuracy", accuracies
      R.assign "confidence", confidences
      R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:confidence_plot_id => plot_id)
    end
    $gridfs.find_one(_id: confidence_plot_id).data
  end
end
122
+
123
+
124
# Leave-one-out validation statistics for regression models:
# RMSE, MAE and R^2 on the log10 scale, plus a correlation plot.
class RegressionLeaveOneOutValidation < LeaveOneOutValidation

  field :rmse, type: Float, default: 0.0
  field :mae, type: Float, default: 0
  #field :weighted_rmse, type: Float, default: 0
  #field :weighted_mae, type: Float, default: 0
  field :r_squared, type: Float
  field :correlation_plot_id, type: BSON::ObjectId
  field :confidence_plot_id, type: BSON::ObjectId

  # Compute RMSE, MAE (log10 scale) and R^2 from the stored predictions
  # and save the record.
  # NOTE(review): errors are summed once per database activity but averaged
  # by predictions.size — compounds with several measured activities weigh
  # more than intended; confirm whether this bias is deliberate.
  def statistics
    confidence_sum = 0
    predicted_values = []
    measured_values = []
    predictions.each do |pred|
      pred[:database_activities].each do |activity|
        if pred[:value]
          predicted_values << pred[:value]
          measured_values << activity
          # per-activity error on the log10 scale
          error = Math.log10(pred[:value])-Math.log10(activity)
          self.rmse += error**2
          #self.weighted_rmse += pred[:confidence]*error**2
          self.mae += error.abs
          #self.weighted_mae += pred[:confidence]*error.abs
          #confidence_sum += pred[:confidence]
        end
      end
      if pred[:database_activities].empty?
        # FIX: compound_id was an undefined local variable here (NameError at
        # runtime); the compound id is stored on the prediction hash by
        # LeaveOneOutValidation.create.
        warnings << "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
        $logger.debug "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
      end
    end
    R.assign "measurement", measured_values
    R.assign "prediction", predicted_values
    R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
    r = R.eval("r").to_ruby

    self.mae = self.mae/predictions.size
    #self.weighted_mae = self.weighted_mae/confidence_sum
    self.rmse = Math.sqrt(self.rmse/predictions.size)
    #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
    self.r_squared = r**2
    self.finished_at = Time.now
    save
    $logger.debug "R^2 #{r**2}"
    $logger.debug "RMSE #{rmse}"
    $logger.debug "MAE #{mae}"
  end

  # Scatter plot of -log(prediction) vs. -log(measurement) with an identity
  # line; cached in GridFS on first call, returns the SVG data.
  def correlation_plot
    unless correlation_plot_id
      tmpfile = "/tmp/#{id.to_s}_correlation.svg"
      predicted_values = []
      measured_values = []
      predictions.each do |pred|
        pred[:database_activities].each do |activity|
          if pred[:value]
            predicted_values << pred[:value]
            measured_values << activity
          end
        end
      end
      # model attributes are collected but currently unused in the plot;
      # kept for parity with the original implementation
      attributes = Model::Lazar.find(self.model_id).attributes
      attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
      attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
      R.assign "measurement", measured_values
      R.assign "prediction", predicted_values
      R.eval "all = c(-log(measurement),-log(prediction))"
      R.eval "range = c(min(all), max(all))"
      R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
      R.eval "image = image + geom_abline(intercept=0, slope=1)"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:correlation_plot_id => plot_id)
    end
    $gridfs.find_one(_id: correlation_plot_id).data
  end
end
204
+
205
+ end
data/lib/model.rb CHANGED
@@ -2,51 +2,79 @@ module OpenTox
2
2
 
3
3
  module Model
4
4
 
5
- class Lazar
5
+ class Model
6
6
  include OpenTox
7
7
  include Mongoid::Document
8
8
  include Mongoid::Timestamps
9
9
  store_in collection: "models"
10
10
 
11
- field :title, as: :name, type: String
11
+ field :name, type: String
12
12
  field :creator, type: String, default: __FILE__
13
13
  # datasets
14
14
  field :training_dataset_id, type: BSON::ObjectId
15
15
  # algorithms
16
16
  field :prediction_algorithm, type: String
17
- field :neighbor_algorithm, type: String
18
- field :neighbor_algorithm_parameters, type: Hash
19
17
  # prediction feature
20
18
  field :prediction_feature_id, type: BSON::ObjectId
21
19
 
22
- attr_accessor :prediction_dataset
23
- attr_accessor :training_dataset
20
+ def training_dataset
21
+ Dataset.find(training_dataset_id)
22
+ end
23
+ end
24
+
25
+ class Lazar < Model
26
+
27
+ # algorithms
28
+ field :neighbor_algorithm, type: String
29
+ field :neighbor_algorithm_parameters, type: Hash, default: {}
24
30
 
25
31
  # Create a lazar model from a training_dataset and a feature_dataset
26
32
  # @param [OpenTox::Dataset] training_dataset
27
33
  # @return [OpenTox::Model::Lazar] Regression or classification model
28
- def self.create training_dataset
34
+ def initialize training_dataset, params={}
29
35
 
30
- bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
36
+ super params
31
37
 
32
38
  # TODO document convention
33
39
  prediction_feature = training_dataset.features.first
34
- prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
35
- lazar.training_dataset_id = training_dataset.id
36
- lazar.prediction_feature_id = prediction_feature.id
37
- lazar.title = prediction_feature.title
40
+ # set defaults for empty parameters
41
+ self.prediction_feature_id ||= prediction_feature.id
42
+ self.training_dataset_id ||= training_dataset.id
43
+ self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
44
+ self.neighbor_algorithm_parameters ||= {}
45
+ self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
46
+ save
47
+ self
48
+ end
38
49
 
39
- lazar.save
40
- lazar
50
+ def predict_compound compound
51
+ prediction_feature = Feature.find prediction_feature_id
52
+ neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
53
+ # remove neighbors without prediction_feature
54
+ # check for database activities (neighbors may include query compound)
55
+ database_activities = nil
56
+ prediction = {}
57
+ if neighbors.collect{|n| n["_id"]}.include? compound.id
58
+
59
+ database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
60
+ prediction[:database_activities] = database_activities
61
+ prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
62
+ neighbors.delete_if{|n| n["_id"] == compound.id}
63
+ end
64
+ neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
65
+ if neighbors.empty?
66
+ prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
67
+ else
68
+ prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
69
+ prediction[:neighbors] = neighbors
70
+ prediction[:neighbors] ||= []
71
+ end
72
+ prediction
41
73
  end
42
74
 
43
75
  def predict object
44
76
 
45
- t = Time.now
46
- at = Time.now
47
-
48
77
  training_dataset = Dataset.find training_dataset_id
49
- prediction_feature = Feature.find prediction_feature_id
50
78
 
51
79
  # parse data
52
80
  compounds = []
@@ -63,50 +91,33 @@ module OpenTox
63
91
 
64
92
  # make predictions
65
93
  predictions = []
66
- neighbors = []
67
- compounds.each_with_index do |compound,c|
68
- t = Time.new
69
- database_activities = training_dataset.values(compound,prediction_feature)
70
- if database_activities and !database_activities.empty?
71
- database_activities = database_activities.first if database_activities.size == 1
72
- predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
73
- next
74
- end
75
- neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
76
- # add activities
77
- # TODO: improve efficiency, takes 3 times longer than previous version
78
- neighbors.collect! do |n|
79
- rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
80
- acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
81
- acts.empty? ? nil : n << acts
82
- end
83
- neighbors.compact! # remove neighbors without training activities
84
- predictions << Algorithm.run(prediction_algorithm, neighbors)
85
- end
94
+ predictions = compounds.collect{|c| predict_compound c}
86
95
 
87
96
  # serialize result
88
97
  case object.class.to_s
89
98
  when "OpenTox::Compound"
90
99
  prediction = predictions.first
91
- prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
100
+ prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
92
101
  return prediction
93
102
  when "Array"
94
103
  return predictions
95
104
  when "OpenTox::Dataset"
96
105
  # prepare prediction dataset
106
+ measurement_feature = Feature.find prediction_feature_id
107
+
108
+ prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
97
109
  prediction_dataset = LazarPrediction.new(
98
- :title => "Lazar prediction for #{prediction_feature.title}",
110
+ :name => "Lazar prediction for #{prediction_feature.name}",
99
111
  :creator => __FILE__,
100
112
  :prediction_feature_id => prediction_feature.id
101
113
 
102
114
  )
103
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
104
- # TODO move into warnings field
105
- warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
106
- prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
115
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
116
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
117
+ prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
107
118
  prediction_dataset.compounds = compounds
108
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
109
- prediction_dataset.save_all
119
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
120
+ prediction_dataset.save
110
121
  return prediction_dataset
111
122
  end
112
123
 
@@ -120,26 +131,19 @@ module OpenTox
120
131
  end
121
132
 
122
133
  class LazarClassification < Lazar
123
- def initialize
124
- super
125
- self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
126
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
127
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
128
- end
129
- end
130
-
131
- class LazarFminerClassification < LazarClassification
132
-
133
- def self.create training_dataset
134
- model = super(training_dataset)
135
- model.update "_type" => self.to_s # adjust class
136
- model = self.find model.id # adjust class
137
- model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
138
- model.neighbor_algorithm_parameters = {
139
- :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
140
- :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
141
- :min_sim => 0.3
142
- }
134
+
135
+ def self.create training_dataset, params={}
136
+ model = self.new training_dataset, params
137
+ model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
138
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
139
+ model.neighbor_algorithm_parameters ||= {}
140
+ {
141
+ :type => "MP2D",
142
+ :training_dataset_id => training_dataset.id,
143
+ :min_sim => 0.1
144
+ }.each do |key,value|
145
+ model.neighbor_algorithm_parameters[key] ||= value
146
+ end
143
147
  model.save
144
148
  model
145
149
  end
@@ -147,20 +151,27 @@ module OpenTox
147
151
 
148
152
  class LazarRegression < Lazar
149
153
 
150
- def initialize
151
- super
152
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
153
- self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
154
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
154
+ def self.create training_dataset, params={}
155
+ model = self.new training_dataset, params
156
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
157
+ model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
158
+ model.neighbor_algorithm_parameters ||= {}
159
+ {
160
+ :type => "MP2D",
161
+ :training_dataset_id => training_dataset.id,
162
+ :min_sim => 0.1
163
+ }.each do |key,value|
164
+ model.neighbor_algorithm_parameters[key] ||= value
165
+ end
166
+ model.save
167
+ model
155
168
  end
156
-
157
169
  end
158
170
 
159
- class PredictionModel
171
+ class Prediction
160
172
  include OpenTox
161
173
  include Mongoid::Document
162
174
  include Mongoid::Timestamps
163
- store_in collection: "models"
164
175
 
165
176
  # TODO field Validations
166
177
  field :endpoint, type: String
@@ -168,10 +179,54 @@ module OpenTox
168
179
  field :source, type: String
169
180
  field :unit, type: String
170
181
  field :model_id, type: BSON::ObjectId
171
- field :crossvalidation_id, type: BSON::ObjectId
182
+ field :repeated_crossvalidation_id, type: BSON::ObjectId
183
+
184
+ def predict object
185
+ Lazar.find(model_id).predict object
186
+ end
187
+
188
+ def training_dataset
189
+ model.training_dataset
190
+ end
191
+
192
+ def model
193
+ Lazar.find model_id
194
+ end
195
+
196
+ def repeated_crossvalidation
197
+ RepeatedCrossValidation.find repeated_crossvalidation_id
198
+ end
199
+
200
+ def crossvalidations
201
+ repeated_crossvalidation.crossvalidations
202
+ end
203
+
204
+ def regression?
205
+ training_dataset.features.first.numeric?
206
+ end
207
+
208
+ def classification?
209
+ training_dataset.features.first.nominal?
210
+ end
211
+
212
+ def self.from_csv_file file
213
+ metadata_file = file.sub(/csv$/,"json")
214
+ bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
215
+ prediction_model = self.new JSON.parse(File.read(metadata_file))
216
+ training_dataset = Dataset.from_csv_file file
217
+ model = nil
218
+ if training_dataset.features.first.nominal?
219
+ model = LazarClassification.create training_dataset
220
+ elsif training_dataset.features.first.numeric?
221
+ model = LazarRegression.create training_dataset
222
+ end
223
+ prediction_model[:model_id] = model.id
224
+ prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
225
+ prediction_model.save
226
+ prediction_model
227
+ end
172
228
  end
173
229
 
174
230
  end
175
231
 
176
232
  end
177
-
data/lib/opentox.rb CHANGED
@@ -12,8 +12,8 @@ module OpenTox
12
12
  include Mongoid::Document
13
13
  include Mongoid::Timestamps
14
14
  store_in collection: klass.downcase.pluralize
15
- field :title, as: :name, type: String
16
-
15
+ field :name, type: String
16
+ field :warnings, type: Array, default: []
17
17
  end
18
18
  OpenTox.const_set klass,c
19
19
  end
data/lib/overwrite.rb CHANGED
@@ -9,6 +9,11 @@ class Object
9
9
  def numeric?
10
10
  true if Float(self) rescue false
11
11
  end
12
+
13
+ # Returns dimension of nested arrays
14
+ def dimension
15
+ self.class == Array ? 1 + self[0].dimension : 0
16
+ end
12
17
  end
13
18
 
14
19
  class Numeric
@@ -17,6 +22,14 @@ class Numeric
17
22
  end
18
23
  end
19
24
 
25
+ class Float
26
+ # round to n significant digits
27
+ # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
28
+ def signif(n)
29
+ Float("%.#{n}g" % self)
30
+ end
31
+ end
32
+
20
33
  module Enumerable
21
34
  # @return [Array] only the duplicates of an enumerable
22
35
  def duplicates
@@ -81,6 +94,26 @@ class Array
81
94
  return self.uniq.size == 1
82
95
  end
83
96
 
97
+ def median
98
+ sorted = self.sort
99
+ len = sorted.length
100
+ (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
101
+ end
102
+
103
+ def mean
104
+ self.inject{ |sum, el| sum + el }.to_f / self.size
105
+ end
106
+
107
+ def sample_variance
108
+ m = self.mean
109
+ sum = self.inject(0){|accum, i| accum +(i-m)**2 }
110
+ sum/(self.length - 1).to_f
111
+ end
112
+
113
+ def standard_deviation
114
+ Math.sqrt(self.sample_variance)
115
+ end
116
+
84
117
  end
85
118
 
86
119
  module URI
@@ -116,4 +149,8 @@ module URI
116
149
  false
117
150
  end
118
151
 
152
+ def self.task? uri
153
+ uri =~ /task/ and URI.valid? uri
154
+ end
155
+
119
156
  end