lazar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
@@ -0,0 +1,72 @@
1
+ require_relative "setup.rb"
2
+
3
# Long-running tests that build lazar models from fminer BBRC features and
# check predictions for single compounds.
#
# Inherits from Minitest::Test: `Minitest` is the canonical constant name in
# minitest 5; the older `MiniTest` spelling is only a compatibility alias.
class LazarExtendedTest < Minitest::Test

  # Mines BBRC features (min_frequency 5) on the hamster carcinogenicity data,
  # builds a lazar model and checks the benzene prediction.
  def test_lazar_bbrc_ham_minfreq
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
    model = OpenTox::Model::Lazar.create dataset, OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 5)
    feature_dataset = OpenTox::Dataset.find model.feature_dataset_id
    assert_equal dataset.compounds.size, feature_dataset.compounds.size
    assert_equal 41, feature_dataset.features.size
    assert_equal 'N-C=N', feature_dataset.features.first.smarts
    compound = OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H")
    prediction = model.predict compound
    assert_equal "false", prediction[:value]
    # Confidence is a float: compare with a tolerance (same delta as the other
    # confidence checks in this class) instead of exact equality.
    assert_in_delta 0.12380952380952381, prediction[:confidence], 0.001
    dataset.delete
    model.delete
    feature_dataset.delete
  end

  # Currently skipped; everything after `skip` is kept for when the fminer
  # problem is resolved and does not run.
  def test_lazar_bbrc_large_ds
    # TODO fminer crashes with these settings
    skip "it seems that fminer aborts without further notice"
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call_no_dup.csv")
    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset#, :min_frequency => 15)
    model = OpenTox::Model::Lazar.create dataset, feature_dataset
    model.save
    p model.id
    feature_dataset = OpenTox::CalculatedDataset.find model.feature_dataset_id
    assert_equal dataset.compounds.size, feature_dataset.compounds.size
    assert_equal 52, feature_dataset.features.size
    assert_equal '[#17&A]-[#6&A]', feature_dataset.features.first.title
    compound = OpenTox::Compound.from_inchi("InChI=1S/C10H9NO2S/c1-8-2-4-9(5-3-8)13-6-10(12)11-7-14/h2-5H,6H2,1H3")
    prediction_dataset = model.predict compound
    prediction = prediction_dataset.data_entries.first
    assert_in_delta 0.025, prediction[:confidence], 0.001
    #assert_equal 0.025885845574483608, prediction[:confidence]
    # with compound change in training_dataset see:
    # https://github.com/opentox/opentox-test/commit/0e78c9c59d087adbd4cc58bab60fb29cbe0c1da0
    #assert_equal 0.02422364949075546, prediction[:confidence]
    dataset.delete
    model.delete
    feature_dataset.delete
    prediction_dataset.delete
  end

  # Uploads the kazius dataset, mines BBRC features (min_frequency 100),
  # builds a model and predicts the same compound twice. Upload and mining
  # times are printed for information only.
  def test_lazar_kazius
    t = Time.now
    dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
    p "Dataset upload: #{Time.now-t}"
    t = Time.now
    feature_dataset = Algorithm::Fminer.bbrc(dataset, :min_frequency => 100)
    p "Feature mining: #{Time.now-t}"
    assert_equal feature_dataset.compounds.size, dataset.compounds.size
    model = Model::Lazar.create dataset, feature_dataset
    #model = Model::Lazar.find('55bcf5bf7a7838381200017e')
    #p model.id
    #prediction_times = []
    2.times do
      compound = Compound.from_smiles("Clc1ccccc1NN")
      prediction = model.predict compound
      assert_equal "1", prediction[:value]
      assert_in_delta 0.019858401199860445, prediction[:confidence], 0.001
    end
    #dataset.delete
    #feature_dataset.delete
  end

end
@@ -0,0 +1,27 @@
1
+ require_relative "setup.rb"
2
+
3
# Smoke test for a lazar regression model on the EPAFHM (medium) dataset.
#
# Inherits from Minitest::Test: `Minitest` is the canonical constant name in
# minitest 5; the older `MiniTest` spelling is only a compatibility alias.
class LazarPhyschemDescriptorTest < Minitest::Test
  # Checks the number of registered physchem descriptors, then builds a
  # regression model and prints one prediction.
  # TODO: the prediction is only printed, not asserted.
  def test_epafhm
    # check available descriptors
    descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
    assert_equal 111, descriptors.size, "wrong number of physchem descriptors"

    # select descriptors for test
    # NOTE: the selection is only printed at the moment, because the
    # feature_dataset line below is commented out.
    descriptors.keep_if { |x| x =~ /^Openbabel\./ }
    descriptors.delete("Openbabel.L5") # TODO Openbabel.L5 does not work, investigate!!!
    puts "Descriptors: #{descriptors}"

    # UPLOAD DATA
    training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
    # Interpolation works whatever type the id is; String#+ would raise a
    # TypeError for a non-String id.
    puts "Dataset: #{training_dataset.id}"
    # feature_dataset = Algorithm::Descriptor.physchem training_dataset, descriptors
    model = Model::LazarRegression.create training_dataset#, feature_dataset
    #p model
    compound = Compound.from_smiles "CC(C)(C)CN"
    prediction = model.predict compound
    p prediction
  end
end
data/test/setup.rb ADDED
@@ -0,0 +1,6 @@
1
# Shared bootstrap for the test files: loads minitest's autorunner and the
# lazar library, mixes OpenTox into the top level so tests can use its
# constants unqualified, and defines the test and fixture directories.
require 'minitest/autorun'
require_relative '../lib/lazar.rb'
include OpenTox

# Absolute path of the directory holding this file. `||=` leaves an
# already-defined constant untouched.
TEST_DIR ||= File.expand_path("..", __FILE__)
# Directory with the test data files.
DATA_DIR ||= File.join(TEST_DIR, "data")
#$mongo.database.drop
@@ -0,0 +1,41 @@
1
+ require_relative "setup.rb"
2
+
3
# Crossvalidation tests for lazar classification and regression models.
#
# Inherits from Minitest::Test: `Minitest` is the canonical constant name in
# minitest 5; the older `MiniTest` spelling is only a compatibility alias.
class ValidationTest < Minitest::Test

  # Crossvalidates an fminer-based classification model on the hamster
  # carcinogenicity data and checks accuracy thresholds.
  def test_fminer_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarFminerClassification.create dataset#, features
    cv = ClassificationCrossValidation.create model
    p cv.accuracy
    p cv.weighted_accuracy
    assert cv.accuracy > 0.8
    # The message is shown on failure, so it states the expectation.
    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than unweighted accuracy (#{cv.accuracy})"
  end

  # Crossvalidates a plain lazar classification model on the same data.
  def test_classification_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarClassification.create dataset#, features
    cv = ClassificationCrossValidation.create model
    p cv.accuracy
    p cv.weighted_accuracy
    assert cv.accuracy > 0.7
    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy."
  end

  # Crossvalidates a lazar regression model on the EPAFHM (medium) data and
  # checks error thresholds.
  def test_regression_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
    #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
    model = Model::LazarRegression.create dataset
    cv = RegressionCrossValidation.create model
    p cv.rmse
    p cv.weighted_rmse
    p cv.mae
    p cv.weighted_mae
    # Still generate the plot, but do not open it in a viewer: shelling out
    # to `inkview` makes the test depend on an external program and builds a
    # shell command from an interpolated value.
    cv.plot
    assert cv.rmse < 30, "RMSE > 30"
    assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) should be smaller than unweighted RMSE (#{cv.rmse})"
    assert cv.mae < 12
    assert cv.weighted_mae < cv.mae
  end

end
metadata ADDED
@@ -0,0 +1,212 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lazar
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
8
+ Denis Gebele
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-08-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rest-client
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: nokogiri
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rserve-client
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: mongoid
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: 5.0beta
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: 5.0beta
84
+ description: Libraries for lazy structure-activity relationships and read-across.
85
+ email:
86
+ - helma@in-silico.ch
87
+ executables: []
88
+ extensions:
89
+ - ext/lazar/extconf.rb
90
+ extra_rdoc_files: []
91
+ files:
92
+ - ".gitignore"
93
+ - ".yardopts"
94
+ - Gemfile
95
+ - LICENSE
96
+ - README.md
97
+ - Rakefile
98
+ - VERSION
99
+ - ext/lazar/extconf.rb
100
+ - java/CdkDescriptorInfo.class
101
+ - java/CdkDescriptorInfo.java
102
+ - java/CdkDescriptors.class
103
+ - java/CdkDescriptors.java
104
+ - java/Jmol.jar
105
+ - java/JoelibDescriptorInfo.class
106
+ - java/JoelibDescriptorInfo.java
107
+ - java/JoelibDescriptors.class
108
+ - java/JoelibDescriptors.java
109
+ - java/Rakefile
110
+ - java/cdk-1.4.19.jar
111
+ - java/joelib2.jar
112
+ - java/log4j.jar
113
+ - lazar.gemspec
114
+ - lib/SMARTS_InteLigand.txt
115
+ - lib/algorithm.rb
116
+ - lib/bbrc.rb
117
+ - lib/classification.rb
118
+ - lib/compound.rb
119
+ - lib/crossvalidation.rb
120
+ - lib/dataset.rb
121
+ - lib/descriptor.rb
122
+ - lib/error.rb
123
+ - lib/feature.rb
124
+ - lib/lazar-model.rb
125
+ - lib/lazar.rb
126
+ - lib/neighbor.rb
127
+ - lib/opentox.rb
128
+ - lib/overwrite.rb
129
+ - lib/regression.rb
130
+ - lib/rest-client-wrapper.rb
131
+ - lib/similarity.rb
132
+ - lib/unique_descriptors.rb
133
+ - lib/validation.rb
134
+ - mongoid.yml
135
+ - test/all.rb
136
+ - test/compound.rb
137
+ - test/data/CPDBAS_v5c_1547_29Apr2008part.sdf
138
+ - test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv
139
+ - test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv
140
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv
141
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv
142
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv
143
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv
144
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv
145
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv
146
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv
147
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv
148
+ - test/data/EPAFHM.csv
149
+ - test/data/EPAFHM.medi.csv
150
+ - test/data/EPAFHM.mini.csv
151
+ - test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv
152
+ - test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv
153
+ - test/data/ISSCAN-multi.csv
154
+ - test/data/LOAEL_log_mg_corrected_smiles.csv
155
+ - test/data/LOAEL_log_mmol_corrected_smiles.csv
156
+ - test/data/acetaldehyde.sdf
157
+ - test/data/boiling_points.ext.sdf
158
+ - test/data/cpdb_100.csv
159
+ - test/data/hamster_carcinogenicity.csv
160
+ - test/data/hamster_carcinogenicity.mini.bool_float.csv
161
+ - test/data/hamster_carcinogenicity.mini.bool_int.csv
162
+ - test/data/hamster_carcinogenicity.mini.bool_string.csv
163
+ - test/data/hamster_carcinogenicity.mini.csv
164
+ - test/data/hamster_carcinogenicity.ntriples
165
+ - test/data/hamster_carcinogenicity.sdf
166
+ - test/data/hamster_carcinogenicity.xls
167
+ - test/data/hamster_carcinogenicity.yaml
168
+ - test/data/hamster_carcinogenicity_with_errors.csv
169
+ - test/data/kazius.csv
170
+ - test/data/multi_cell_call.csv
171
+ - test/data/multi_cell_call_no_dup.csv
172
+ - test/data/multicolumn.csv
173
+ - test/data/rat_feature_dataset.csv
174
+ - test/data/wrong_dataset.csv
175
+ - test/dataset-long.rb
176
+ - test/dataset.rb
177
+ - test/descriptor-long.rb
178
+ - test/descriptor.rb
179
+ - test/error.rb
180
+ - test/feature.rb
181
+ - test/fminer-long.rb
182
+ - test/fminer.rb
183
+ - test/lazar-fminer.rb
184
+ - test/lazar-long.rb
185
+ - test/lazar-physchem-short.rb
186
+ - test/setup.rb
187
+ - test/validation.rb
188
+ homepage: http://github.com/opentox/lazar
189
+ licenses:
190
+ - GPL-3
191
+ metadata: {}
192
+ post_install_message:
193
+ rdoc_options: []
194
+ require_paths:
195
+ - lib
196
+ required_ruby_version: !ruby/object:Gem::Requirement
197
+ requirements:
198
+ - - ">="
199
+ - !ruby/object:Gem::Version
200
+ version: '0'
201
+ required_rubygems_version: !ruby/object:Gem::Requirement
202
+ requirements:
203
+ - - ">="
204
+ - !ruby/object:Gem::Version
205
+ version: '0'
206
+ requirements: []
207
+ rubyforge_project: lazar
208
+ rubygems_version: 2.2.2
209
+ signing_key:
210
+ specification_version: 4
211
+ summary: Lazar framework
212
+ test_files: []