lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
data/lib/validation.rb CHANGED
@@ -2,7 +2,9 @@ module OpenTox
2
2
 
3
3
  class Validation
4
4
 
5
+ field :model_id, type: BSON::ObjectId
5
6
  field :prediction_dataset_id, type: BSON::ObjectId
7
+ field :crossvalidation_id, type: BSON::ObjectId
6
8
  field :test_dataset_id, type: BSON::ObjectId
7
9
  field :nr_instances, type: Integer
8
10
  field :nr_unpredicted, type: Integer
@@ -16,98 +18,90 @@ module OpenTox
16
18
  Dataset.find test_dataset_id
17
19
  end
18
20
 
19
- end
21
+ def model
22
+ Model::Lazar.find model_id
23
+ end
20
24
 
21
- class ClassificationValidation < Validation
22
- field :accept_values, type: String
23
- field :confusion_matrix, type: Array
24
- field :weighted_confusion_matrix, type: Array
25
+ def self.create model, training_set, test_set, crossvalidation=nil
26
+
27
+ atts = model.attributes.dup # do not modify attributes from original model
28
+ atts["_id"] = BSON::ObjectId.new
29
+ atts[:training_dataset_id] = training_set.id
30
+ validation_model = model.class.create training_set, atts
31
+ validation_model.save
32
+ cids = test_set.compound_ids
25
33
 
26
- def self.create model, training_set, test_set
27
- validation = self.class.new
28
- #feature_dataset = Dataset.find model.feature_dataset_id
29
- # TODO check and delegate to Algorithm
30
- #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
31
- validation_model = model.class.create training_set#, features
32
- test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
34
+ test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
33
35
  prediction_dataset = validation_model.predict test_set_without_activities
34
- accept_values = prediction_dataset.prediction_feature.accept_values
35
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
36
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
37
36
  predictions = []
38
37
  nr_unpredicted = 0
39
- prediction_dataset.data_entries.each_with_index do |pe,i|
40
- if pe[0] and pe[1] and pe[1].numeric?
41
- prediction = pe[0]
42
- # TODO prediction_feature, convention??
43
- # TODO generalize for multiple classes
44
- activity = test_set.data_entries[i].first
45
- confidence = prediction_dataset.data_entries[i][1]
46
- predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
47
- if prediction == activity
48
- if prediction == accept_values[0]
49
- confusion_matrix[0][0] += 1
50
- weighted_confusion_matrix[0][0] += confidence
51
- elsif prediction == accept_values[1]
52
- confusion_matrix[1][1] += 1
53
- weighted_confusion_matrix[1][1] += confidence
54
- end
55
- elsif prediction != activity
56
- if prediction == accept_values[0]
57
- confusion_matrix[0][1] += 1
58
- weighted_confusion_matrix[0][1] += confidence
59
- elsif prediction == accept_values[1]
60
- confusion_matrix[1][0] += 1
61
- weighted_confusion_matrix[1][0] += confidence
62
- end
63
- end
38
+ activities = test_set.data_entries.collect{|de| de.first}
39
+ prediction_dataset.data_entries.each_with_index do |de,i|
40
+ if de[0] #and de[1]
41
+ cid = prediction_dataset.compound_ids[i]
42
+ rows = cids.each_index.select{|r| cids[r] == cid }
43
+ activities = rows.collect{|r| test_set.data_entries[r][0]}
44
+ prediction = de.first
45
+ confidence = de[1]
46
+ predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
64
47
  else
65
- nr_unpredicted += 1 if pe[0].nil?
48
+ nr_unpredicted += 1
66
49
  end
67
50
  end
68
51
  validation = self.new(
52
+ :model_id => validation_model.id,
69
53
  :prediction_dataset_id => prediction_dataset.id,
70
54
  :test_dataset_id => test_set.id,
71
55
  :nr_instances => test_set.compound_ids.size,
72
56
  :nr_unpredicted => nr_unpredicted,
73
- :accept_values => accept_values,
74
- :confusion_matrix => confusion_matrix,
75
- :weighted_confusion_matrix => weighted_confusion_matrix,
76
- :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
57
+ :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
77
58
  )
59
+ validation.crossvalidation_id = crossvalidation.id if crossvalidation
78
60
  validation.save
79
61
  validation
80
62
  end
63
+
64
+ end
65
+
66
+ class ClassificationValidation < Validation
81
67
  end
82
68
 
83
69
  class RegressionValidation < Validation
84
- def self.create model, training_set, test_set
85
-
86
- validation_model = Model::LazarRegression.create training_set
87
- test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
88
- prediction_dataset = validation_model.predict test_set_without_activities
89
- predictions = []
90
- nr_unpredicted = 0
91
- activities = test_set.data_entries.collect{|de| de.first}
92
- prediction_dataset.data_entries.each_with_index do |de,i|
93
- if de[0] and de[1] and de[1].numeric?
94
- activity = activities[i]
95
- prediction = de.first
96
- confidence = de[1]
97
- predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
70
+
71
+ def statistics
72
+ rmse = 0
73
+ weighted_rmse = 0
74
+ rse = 0
75
+ weighted_rse = 0
76
+ mae = 0
77
+ weighted_mae = 0
78
+ confidence_sum = 0
79
+ predictions.each do |pred|
80
+ compound_id,activity,prediction,confidence = pred
81
+ if activity and prediction
82
+ error = Math.log10(prediction)-Math.log10(activity.median)
83
+ rmse += error**2
84
+ weighted_rmse += confidence*error**2
85
+ mae += error.abs
86
+ weighted_mae += confidence*error.abs
87
+ confidence_sum += confidence
98
88
  else
99
- nr_unpredicted += 1
89
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
90
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
100
91
  end
101
92
  end
102
- validation = self.new(
103
- :prediction_dataset_id => prediction_dataset.id,
104
- :test_dataset_id => test_set.id,
105
- :nr_instances => test_set.compound_ids.size,
106
- :nr_unpredicted => nr_unpredicted,
107
- :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
108
- )
109
- validation.save
110
- validation
93
+ x = predictions.collect{|p| p[1].median}
94
+ y = predictions.collect{|p| p[2]}
95
+ R.assign "measurement", x
96
+ R.assign "prediction", y
97
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
98
+ r = R.eval("r").to_ruby
99
+
100
+ mae = mae/predictions.size
101
+ weighted_mae = weighted_mae/confidence_sum
102
+ rmse = Math.sqrt(rmse/predictions.size)
103
+ weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
104
+ { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
111
105
  end
112
106
  end
113
107
 
data/test/all.rb CHANGED
@@ -1,5 +1,5 @@
1
- exclude = ["./setup.rb","./all.rb"]
1
+ # "./default_environment.rb" has to be executed separately
2
+ exclude = ["./setup.rb","./all.rb", "./default_environment.rb"]
2
3
  (Dir[File.join(File.dirname(__FILE__),"*.rb")]-exclude).each do |test|
3
- p test
4
4
  require_relative test
5
5
  end
@@ -0,0 +1,41 @@
1
+ require_relative "setup.rb"
2
+
3
+ class LazarClassificationTest < MiniTest::Test
4
+
5
+ def test_lazar_classification
6
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
+ model = Model::LazarClassification.create training_dataset
8
+
9
+ [ {
10
+ :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
11
+ :prediction => "false",
12
+ :confidence => 0.25281385281385277,
13
+ :nr_neighbors => 11
14
+ },{
15
+ :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
16
+ :prediction => "false",
17
+ :confidence => 0.3639589577089577,
18
+ :nr_neighbors => 14
19
+ } ].each do |example|
20
+ prediction = model.predict example[:compound]
21
+ assert_equal example[:prediction], prediction[:value]
22
+ #assert_equal example[:confidence], prediction[:confidence]
23
+ #assert_equal example[:nr_neighbors], prediction[:neighbors].size
24
+ end
25
+
26
+ compound = Compound.from_smiles "CCO"
27
+ prediction = model.predict compound
28
+ assert_equal ["false"], prediction[:database_activities]
29
+ assert_equal "true", prediction[:value]
30
+
31
+ # make a dataset prediction
32
+ compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
33
+ prediction = model.predict compound_dataset
34
+ assert_equal compound_dataset.compounds, prediction.compounds
35
+
36
+ assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
37
+ assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
38
+ # cleanup
39
+ [training_dataset,model,compound_dataset].each{|o| o.delete}
40
+ end
41
+ end
data/test/compound.rb CHANGED
@@ -54,7 +54,6 @@ print c.sdf
54
54
 
55
55
  def test_inchikey
56
56
  c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
57
- p c
58
57
  assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
59
58
  end
60
59
 
@@ -65,8 +64,7 @@ print c.sdf
65
64
 
66
65
  def test_chemblid
67
66
  c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
68
- #assert_equal "CHEMBL277500", c.chemblid
69
- assert_equal "CHEMBL581676", c.chemblid
67
+ assert_equal "CHEMBL277500", c.chemblid
70
68
  end
71
69
 
72
70
  def test_sdf_storage
@@ -78,17 +76,17 @@ print c.sdf
78
76
  def test_fingerprint
79
77
  c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
80
78
 
81
- assert c.fp4.collect{|fid| Feature.find(fid).name}.include? ("1,3-Tautomerizable")
82
- assert_equal c.fp4.size, c.fp4_size
79
+ assert_equal 9, c.fingerprint("FP4").size
83
80
  end
84
81
 
85
82
  def test_neighbors
86
83
  d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
87
84
  d.compounds.each do |c|
88
- refute_nil c.fp4
85
+ refute_nil c.fingerprint("MP2D")
89
86
  end
90
87
  c = d.compounds[371]
91
- assert c.neighbors.size >= 19
88
+ n = c.fingerprint_neighbors({:type => "FP4", :min_sim => 0.7, :training_dataset_id => d.id })
89
+ assert n.size >= 18, "Neighbors size (#{n.size}) should be larger than 17"
92
90
  end
93
91
 
94
92
  def test_openbabel_segfault
@@ -97,4 +95,115 @@ print c.sdf
97
95
  c = Compound.from_inchi(inchi)
98
96
  assert_equal inchi, c.inchi
99
97
  end
98
+
99
+ def test_openbabel_fingerprint
100
+ [
101
+ "CC(=O)CC(C)C#N",
102
+ "CC(=O)CC(C)C",
103
+ "C(=O)CC(C)C#N",
104
+ ].each do |smi|
105
+ c = OpenTox::Compound.from_smiles smi
106
+ refute_nil c.fingerprint("FP4")
107
+ end
108
+ end
109
+
110
+ def test_fingerprint_neighbors
111
+ types = ["FP2", "FP3", "FP4", "MACCS"]
112
+ min_sim = 0.7
113
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
114
+ [
115
+ "CC(=O)CC(C)C#N",
116
+ "CC(=O)CC(C)C",
117
+ "C(=O)CC(C)C#N",
118
+ ].each do |smi|
119
+ c = OpenTox::Compound.from_smiles smi
120
+ types.each do |type|
121
+ neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
122
+ unless type == "FP2" and smi == "CC(=O)CC(C)C#N" or smi == "C(=O)CC(C)C#N" and (type == "FP2" or type == "MACCS")
123
+ refute_empty neighbors
124
+ end
125
+ end
126
+ end
127
+ end
128
+
129
+ def test_mna
130
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
131
+ assert_equal 18, c.fingerprint("MNA").size
132
+ assert_equal 9, c.fingerprint("MNA").uniq.size
133
+ end
134
+
135
+ def test_mpd
136
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
137
+ assert 13, c.fingerprint("MP2D").size
138
+ assert 7, c.fingerprint("MP2D").uniq.size
139
+ end
140
+
141
+ def test_fingerprint_count_neighbors
142
+ types = ["MP2D", "MNA"]
143
+ min_sim = 0.0
144
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
145
+ [
146
+ "CC(=O)CC(C)C#N",
147
+ "CC(=O)CC(C)C",
148
+ "C(=O)CC(C)C#N",
149
+ ].each do |smi|
150
+ c = OpenTox::Compound.from_smiles smi
151
+ types.each do |type|
152
+ neighbors = c.fingerprint_count_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
153
+ if type == "FP4"
154
+ fp4_neighbors = c.neighbors
155
+ neighbors.each do |n|
156
+ assert_includes fp4_neighbors, n
157
+ end
158
+ end
159
+ end
160
+ end
161
+ end
162
+
163
+ def test_fingerprint_db_neighbors
164
+ #skip
165
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
166
+ [
167
+ "CC(=O)CC(C)C#N",
168
+ "CC(=O)CC(C)C",
169
+ "C(=O)CC(C)C#N",
170
+ ].each do |smi|
171
+ c = OpenTox::Compound.from_smiles smi
172
+ t = Time.now
173
+ neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
174
+ p Time.now - t
175
+ t = Time.now
176
+ neighbors2 = c.fingerprint_neighbors({:type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.2})
177
+ p Time.now - t
178
+ p neighbors.size
179
+ p neighbors2.size
180
+ #p neighbors
181
+ #p neighbors2
182
+ #p neighbors2 - neighbors
183
+ #assert_equal neighbors, neighbors2
184
+ end
185
+ end
186
+
187
+ def test_molecular_weight
188
+ c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
189
+ assert_equal 100.15888, c.molecular_weight
190
+ end
191
+
192
+ def test_mg_conversions
193
+ # TODO fix!
194
+ skip
195
+ c = OpenTox::Compound.from_smiles "O"
196
+ mw = c.molecular_weight
197
+ assert_equal 18.01528, mw
198
+ assert_equal 0.8105107141417474, c.logmmol_to_mg(4.34688225631145, mw)
199
+ assert_equal 9007.64, c.mmol_to_mg(500, mw)
200
+ assert_equal 2437.9999984148976, c.logmg_to_mg(3.387033701)
201
+ end
202
+
203
+ def test_physchem
204
+ c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
205
+ assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem.size
206
+ assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem(PhysChem.openbabel_descriptors).size
207
+ assert_equal PhysChem::unique_descriptors.size, c.physchem(PhysChem.unique_descriptors).size
208
+ end
100
209
  end