lazar 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
data/lib/validation.rb CHANGED
@@ -2,7 +2,9 @@ module OpenTox
2
2
 
3
3
  class Validation
4
4
 
5
+ field :model_id, type: BSON::ObjectId
5
6
  field :prediction_dataset_id, type: BSON::ObjectId
7
+ field :crossvalidation_id, type: BSON::ObjectId
6
8
  field :test_dataset_id, type: BSON::ObjectId
7
9
  field :nr_instances, type: Integer
8
10
  field :nr_unpredicted, type: Integer
@@ -16,98 +18,90 @@ module OpenTox
16
18
  Dataset.find test_dataset_id
17
19
  end
18
20
 
19
- end
21
+ def model
22
+ Model::Lazar.find model_id
23
+ end
20
24
 
21
- class ClassificationValidation < Validation
22
- field :accept_values, type: String
23
- field :confusion_matrix, type: Array
24
- field :weighted_confusion_matrix, type: Array
25
+ def self.create model, training_set, test_set, crossvalidation=nil
26
+
27
+ atts = model.attributes.dup # do not modify attributes from original model
28
+ atts["_id"] = BSON::ObjectId.new
29
+ atts[:training_dataset_id] = training_set.id
30
+ validation_model = model.class.create training_set, atts
31
+ validation_model.save
32
+ cids = test_set.compound_ids
25
33
 
26
- def self.create model, training_set, test_set
27
- validation = self.class.new
28
- #feature_dataset = Dataset.find model.feature_dataset_id
29
- # TODO check and delegate to Algorithm
30
- #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
31
- validation_model = model.class.create training_set#, features
32
- test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
34
+ test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
33
35
  prediction_dataset = validation_model.predict test_set_without_activities
34
- accept_values = prediction_dataset.prediction_feature.accept_values
35
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
36
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
37
36
  predictions = []
38
37
  nr_unpredicted = 0
39
- prediction_dataset.data_entries.each_with_index do |pe,i|
40
- if pe[0] and pe[1] and pe[1].numeric?
41
- prediction = pe[0]
42
- # TODO prediction_feature, convention??
43
- # TODO generalize for multiple classes
44
- activity = test_set.data_entries[i].first
45
- confidence = prediction_dataset.data_entries[i][1]
46
- predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
47
- if prediction == activity
48
- if prediction == accept_values[0]
49
- confusion_matrix[0][0] += 1
50
- weighted_confusion_matrix[0][0] += confidence
51
- elsif prediction == accept_values[1]
52
- confusion_matrix[1][1] += 1
53
- weighted_confusion_matrix[1][1] += confidence
54
- end
55
- elsif prediction != activity
56
- if prediction == accept_values[0]
57
- confusion_matrix[0][1] += 1
58
- weighted_confusion_matrix[0][1] += confidence
59
- elsif prediction == accept_values[1]
60
- confusion_matrix[1][0] += 1
61
- weighted_confusion_matrix[1][0] += confidence
62
- end
63
- end
38
+ activities = test_set.data_entries.collect{|de| de.first}
39
+ prediction_dataset.data_entries.each_with_index do |de,i|
40
+ if de[0] #and de[1]
41
+ cid = prediction_dataset.compound_ids[i]
42
+ rows = cids.each_index.select{|r| cids[r] == cid }
43
+ activities = rows.collect{|r| test_set.data_entries[r][0]}
44
+ prediction = de.first
45
+ confidence = de[1]
46
+ predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
64
47
  else
65
- nr_unpredicted += 1 if pe[0].nil?
48
+ nr_unpredicted += 1
66
49
  end
67
50
  end
68
51
  validation = self.new(
52
+ :model_id => validation_model.id,
69
53
  :prediction_dataset_id => prediction_dataset.id,
70
54
  :test_dataset_id => test_set.id,
71
55
  :nr_instances => test_set.compound_ids.size,
72
56
  :nr_unpredicted => nr_unpredicted,
73
- :accept_values => accept_values,
74
- :confusion_matrix => confusion_matrix,
75
- :weighted_confusion_matrix => weighted_confusion_matrix,
76
- :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
57
+ :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
77
58
  )
59
+ validation.crossvalidation_id = crossvalidation.id if crossvalidation
78
60
  validation.save
79
61
  validation
80
62
  end
63
+
64
+ end
65
+
66
+ class ClassificationValidation < Validation
81
67
  end
82
68
 
83
69
  class RegressionValidation < Validation
84
- def self.create model, training_set, test_set
85
-
86
- validation_model = Model::LazarRegression.create training_set
87
- test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
88
- prediction_dataset = validation_model.predict test_set_without_activities
89
- predictions = []
90
- nr_unpredicted = 0
91
- activities = test_set.data_entries.collect{|de| de.first}
92
- prediction_dataset.data_entries.each_with_index do |de,i|
93
- if de[0] and de[1] and de[1].numeric?
94
- activity = activities[i]
95
- prediction = de.first
96
- confidence = de[1]
97
- predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
70
+
71
+ def statistics
72
+ rmse = 0
73
+ weighted_rmse = 0
74
+ rse = 0
75
+ weighted_rse = 0
76
+ mae = 0
77
+ weighted_mae = 0
78
+ confidence_sum = 0
79
+ predictions.each do |pred|
80
+ compound_id,activity,prediction,confidence = pred
81
+ if activity and prediction
82
+ error = Math.log10(prediction)-Math.log10(activity.median)
83
+ rmse += error**2
84
+ weighted_rmse += confidence*error**2
85
+ mae += error.abs
86
+ weighted_mae += confidence*error.abs
87
+ confidence_sum += confidence
98
88
  else
99
- nr_unpredicted += 1
89
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
90
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
100
91
  end
101
92
  end
102
- validation = self.new(
103
- :prediction_dataset_id => prediction_dataset.id,
104
- :test_dataset_id => test_set.id,
105
- :nr_instances => test_set.compound_ids.size,
106
- :nr_unpredicted => nr_unpredicted,
107
- :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
108
- )
109
- validation.save
110
- validation
93
+ x = predictions.collect{|p| p[1].median}
94
+ y = predictions.collect{|p| p[2]}
95
+ R.assign "measurement", x
96
+ R.assign "prediction", y
97
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
98
+ r = R.eval("r").to_ruby
99
+
100
+ mae = mae/predictions.size
101
+ weighted_mae = weighted_mae/confidence_sum
102
+ rmse = Math.sqrt(rmse/predictions.size)
103
+ weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
104
+ { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
111
105
  end
112
106
  end
113
107
 
data/test/all.rb CHANGED
@@ -1,5 +1,5 @@
1
- exclude = ["./setup.rb","./all.rb"]
1
+ # "./default_environment.rb" has to be executed separately
2
+ exclude = ["./setup.rb","./all.rb", "./default_environment.rb"]
2
3
  (Dir[File.join(File.dirname(__FILE__),"*.rb")]-exclude).each do |test|
3
- p test
4
4
  require_relative test
5
5
  end
@@ -0,0 +1,41 @@
1
+ require_relative "setup.rb"
2
+
3
+ class LazarClassificationTest < MiniTest::Test
4
+
5
+ def test_lazar_classification
6
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
+ model = Model::LazarClassification.create training_dataset
8
+
9
+ [ {
10
+ :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
11
+ :prediction => "false",
12
+ :confidence => 0.25281385281385277,
13
+ :nr_neighbors => 11
14
+ },{
15
+ :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
16
+ :prediction => "false",
17
+ :confidence => 0.3639589577089577,
18
+ :nr_neighbors => 14
19
+ } ].each do |example|
20
+ prediction = model.predict example[:compound]
21
+ assert_equal example[:prediction], prediction[:value]
22
+ #assert_equal example[:confidence], prediction[:confidence]
23
+ #assert_equal example[:nr_neighbors], prediction[:neighbors].size
24
+ end
25
+
26
+ compound = Compound.from_smiles "CCO"
27
+ prediction = model.predict compound
28
+ assert_equal ["false"], prediction[:database_activities]
29
+ assert_equal "true", prediction[:value]
30
+
31
+ # make a dataset prediction
32
+ compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
33
+ prediction = model.predict compound_dataset
34
+ assert_equal compound_dataset.compounds, prediction.compounds
35
+
36
+ assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
37
+ assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
38
+ # cleanup
39
+ [training_dataset,model,compound_dataset].each{|o| o.delete}
40
+ end
41
+ end
data/test/compound.rb CHANGED
@@ -54,7 +54,6 @@ print c.sdf
54
54
 
55
55
  def test_inchikey
56
56
  c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
57
- p c
58
57
  assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
59
58
  end
60
59
 
@@ -65,8 +64,7 @@ print c.sdf
65
64
 
66
65
  def test_chemblid
67
66
  c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
68
- #assert_equal "CHEMBL277500", c.chemblid
69
- assert_equal "CHEMBL581676", c.chemblid
67
+ assert_equal "CHEMBL277500", c.chemblid
70
68
  end
71
69
 
72
70
  def test_sdf_storage
@@ -78,17 +76,17 @@ print c.sdf
78
76
  def test_fingerprint
79
77
  c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
80
78
 
81
- assert c.fp4.collect{|fid| Feature.find(fid).name}.include? ("1,3-Tautomerizable")
82
- assert_equal c.fp4.size, c.fp4_size
79
+ assert_equal 9, c.fingerprint("FP4").size
83
80
  end
84
81
 
85
82
  def test_neighbors
86
83
  d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
87
84
  d.compounds.each do |c|
88
- refute_nil c.fp4
85
+ refute_nil c.fingerprint("MP2D")
89
86
  end
90
87
  c = d.compounds[371]
91
- assert c.neighbors.size >= 19
88
+ n = c.fingerprint_neighbors({:type => "FP4", :min_sim => 0.7, :training_dataset_id => d.id })
89
+ assert n.size >= 18, "Neighbors size (#{n.size}) should be larger than 17"
92
90
  end
93
91
 
94
92
  def test_openbabel_segfault
@@ -97,4 +95,115 @@ print c.sdf
97
95
  c = Compound.from_inchi(inchi)
98
96
  assert_equal inchi, c.inchi
99
97
  end
98
+
99
+ def test_openbabel_fingerprint
100
+ [
101
+ "CC(=O)CC(C)C#N",
102
+ "CC(=O)CC(C)C",
103
+ "C(=O)CC(C)C#N",
104
+ ].each do |smi|
105
+ c = OpenTox::Compound.from_smiles smi
106
+ refute_nil c.fingerprint("FP4")
107
+ end
108
+ end
109
+
110
+ def test_fingerprint_neighbors
111
+ types = ["FP2", "FP3", "FP4", "MACCS"]
112
+ min_sim = 0.7
113
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
114
+ [
115
+ "CC(=O)CC(C)C#N",
116
+ "CC(=O)CC(C)C",
117
+ "C(=O)CC(C)C#N",
118
+ ].each do |smi|
119
+ c = OpenTox::Compound.from_smiles smi
120
+ types.each do |type|
121
+ neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
122
+ unless type == "FP2" and smi == "CC(=O)CC(C)C#N" or smi == "C(=O)CC(C)C#N" and (type == "FP2" or type == "MACCS")
123
+ refute_empty neighbors
124
+ end
125
+ end
126
+ end
127
+ end
128
+
129
+ def test_mna
130
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
131
+ assert_equal 18, c.fingerprint("MNA").size
132
+ assert_equal 9, c.fingerprint("MNA").uniq.size
133
+ end
134
+
135
+ def test_mpd
136
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
137
+ assert 13, c.fingerprint("MP2D").size
138
+ assert 7, c.fingerprint("MP2D").uniq.size
139
+ end
140
+
141
+ def test_fingerprint_count_neighbors
142
+ types = ["MP2D", "MNA"]
143
+ min_sim = 0.0
144
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
145
+ [
146
+ "CC(=O)CC(C)C#N",
147
+ "CC(=O)CC(C)C",
148
+ "C(=O)CC(C)C#N",
149
+ ].each do |smi|
150
+ c = OpenTox::Compound.from_smiles smi
151
+ types.each do |type|
152
+ neighbors = c.fingerprint_count_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
153
+ if type == "FP4"
154
+ fp4_neighbors = c.neighbors
155
+ neighbors.each do |n|
156
+ assert_includes fp4_neighbors, n
157
+ end
158
+ end
159
+ end
160
+ end
161
+ end
162
+
163
+ def test_fingerprint_db_neighbors
164
+ #skip
165
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
166
+ [
167
+ "CC(=O)CC(C)C#N",
168
+ "CC(=O)CC(C)C",
169
+ "C(=O)CC(C)C#N",
170
+ ].each do |smi|
171
+ c = OpenTox::Compound.from_smiles smi
172
+ t = Time.now
173
+ neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
174
+ p Time.now - t
175
+ t = Time.now
176
+ neighbors2 = c.fingerprint_neighbors({:type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.2})
177
+ p Time.now - t
178
+ p neighbors.size
179
+ p neighbors2.size
180
+ #p neighbors
181
+ #p neighbors2
182
+ #p neighbors2 - neighbors
183
+ #assert_equal neighbors, neighbors2
184
+ end
185
+ end
186
+
187
+ def test_molecular_weight
188
+ c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
189
+ assert_equal 100.15888, c.molecular_weight
190
+ end
191
+
192
+ def test_mg_conversions
193
+ # TODO fix!
194
+ skip
195
+ c = OpenTox::Compound.from_smiles "O"
196
+ mw = c.molecular_weight
197
+ assert_equal 18.01528, mw
198
+ assert_equal 0.8105107141417474, c.logmmol_to_mg(4.34688225631145, mw)
199
+ assert_equal 9007.64, c.mmol_to_mg(500, mw)
200
+ assert_equal 2437.9999984148976, c.logmg_to_mg(3.387033701)
201
+ end
202
+
203
+ def test_physchem
204
+ c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
205
+ assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem.size
206
+ assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem(PhysChem.openbabel_descriptors).size
207
+ assert_equal PhysChem::unique_descriptors.size, c.physchem(PhysChem.unique_descriptors).size
208
+ end
100
209
  end