lazar 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.yardopts +4 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +674 -0
  6. data/README.md +44 -0
  7. data/Rakefile +1 -0
  8. data/VERSION +1 -0
  9. data/ext/lazar/extconf.rb +87 -0
  10. data/java/CdkDescriptorInfo.class +0 -0
  11. data/java/CdkDescriptorInfo.java +22 -0
  12. data/java/CdkDescriptors.class +0 -0
  13. data/java/CdkDescriptors.java +141 -0
  14. data/java/Jmol.jar +0 -0
  15. data/java/JoelibDescriptorInfo.class +0 -0
  16. data/java/JoelibDescriptorInfo.java +15 -0
  17. data/java/JoelibDescriptors.class +0 -0
  18. data/java/JoelibDescriptors.java +60 -0
  19. data/java/Rakefile +15 -0
  20. data/java/cdk-1.4.19.jar +0 -0
  21. data/java/joelib2.jar +0 -0
  22. data/java/log4j.jar +0 -0
  23. data/lazar.gemspec +29 -0
  24. data/lib/SMARTS_InteLigand.txt +983 -0
  25. data/lib/algorithm.rb +21 -0
  26. data/lib/bbrc.rb +165 -0
  27. data/lib/classification.rb +107 -0
  28. data/lib/compound.rb +254 -0
  29. data/lib/crossvalidation.rb +187 -0
  30. data/lib/dataset.rb +334 -0
  31. data/lib/descriptor.rb +247 -0
  32. data/lib/error.rb +66 -0
  33. data/lib/feature.rb +97 -0
  34. data/lib/lazar-model.rb +170 -0
  35. data/lib/lazar.rb +69 -0
  36. data/lib/neighbor.rb +25 -0
  37. data/lib/opentox.rb +22 -0
  38. data/lib/overwrite.rb +119 -0
  39. data/lib/regression.rb +199 -0
  40. data/lib/rest-client-wrapper.rb +98 -0
  41. data/lib/similarity.rb +58 -0
  42. data/lib/unique_descriptors.rb +120 -0
  43. data/lib/validation.rb +114 -0
  44. data/mongoid.yml +8 -0
  45. data/test/all.rb +5 -0
  46. data/test/compound.rb +100 -0
  47. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
  48. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
  49. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
  50. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
  51. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
  52. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
  53. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
  54. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
  55. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
  56. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
  57. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
  58. data/test/data/EPAFHM.csv +618 -0
  59. data/test/data/EPAFHM.medi.csv +100 -0
  60. data/test/data/EPAFHM.mini.csv +22 -0
  61. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
  62. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
  63. data/test/data/ISSCAN-multi.csv +59 -0
  64. data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
  65. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
  66. data/test/data/acetaldehyde.sdf +14 -0
  67. data/test/data/boiling_points.ext.sdf +11460 -0
  68. data/test/data/cpdb_100.csv +101 -0
  69. data/test/data/hamster_carcinogenicity.csv +86 -0
  70. data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
  71. data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
  72. data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
  73. data/test/data/hamster_carcinogenicity.mini.csv +11 -0
  74. data/test/data/hamster_carcinogenicity.ntriples +618 -0
  75. data/test/data/hamster_carcinogenicity.sdf +2805 -0
  76. data/test/data/hamster_carcinogenicity.xls +0 -0
  77. data/test/data/hamster_carcinogenicity.yaml +352 -0
  78. data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
  79. data/test/data/kazius.csv +4070 -0
  80. data/test/data/multi_cell_call.csv +1067 -0
  81. data/test/data/multi_cell_call_no_dup.csv +1057 -0
  82. data/test/data/multicolumn.csv +8 -0
  83. data/test/data/rat_feature_dataset.csv +1179 -0
  84. data/test/data/wrong_dataset.csv +8 -0
  85. data/test/dataset-long.rb +117 -0
  86. data/test/dataset.rb +199 -0
  87. data/test/descriptor-long.rb +26 -0
  88. data/test/descriptor.rb +83 -0
  89. data/test/error.rb +24 -0
  90. data/test/feature.rb +65 -0
  91. data/test/fminer-long.rb +38 -0
  92. data/test/fminer.rb +52 -0
  93. data/test/lazar-fminer.rb +50 -0
  94. data/test/lazar-long.rb +72 -0
  95. data/test/lazar-physchem-short.rb +27 -0
  96. data/test/setup.rb +6 -0
  97. data/test/validation.rb +41 -0
  98. metadata +212 -0
@@ -0,0 +1,72 @@
1
+ require_relative "setup.rb"
2
+
3
+ class LazarExtendedTest < MiniTest::Test # Lazar model tests that mine BBRC features with Fminer and check the resulting predictions
4
+
5
+ def test_lazar_bbrc_ham_minfreq # BBRC features (:min_frequency => 5) on the hamster CSV: checks feature count, first SMARTS and one prediction
6
+ dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
+ model = OpenTox::Model::Lazar.create dataset, OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 5)
8
+ feature_dataset = OpenTox::Dataset.find model.feature_dataset_id # reload the mined features through the id stored on the model
9
+ assert_equal dataset.compounds.size, feature_dataset.compounds.size # feature dataset must list as many compounds as the training data
10
+ assert_equal 41, feature_dataset.features.size
11
+ assert_equal 'N-C=N', feature_dataset.features.first.smarts
12
+ compound = OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H") # query compound; formula C6H6 per the InChI
13
+ prediction = model.predict compound
14
+ assert_equal "false", prediction[:value] # the predicted value is compared as the String "false"
15
+ assert_equal 0.12380952380952381, prediction[:confidence] # NOTE(review): exact Float equality; the other tests in this class use assert_in_delta
16
+ dataset.delete # NOTE(review): presumably removes the records created by this test - confirm
17
+ model.delete
18
+ feature_dataset.delete
19
+ end
20
+
21
+ def test_lazar_bbrc_large_ds # currently skipped; BBRC with default settings on multi_cell_call_no_dup.csv
22
+ # TODO fminer crashes with these settings
23
+ skip "it seems that fminer aborts without further notice" # everything below is unreachable while this skip is in place
24
+ dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call_no_dup.csv")
25
+ feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset#, :min_frequency => 15)
26
+ model = OpenTox::Model::Lazar.create dataset, feature_dataset
27
+ model.save
28
+ p model.id
29
+ feature_dataset = OpenTox::CalculatedDataset.find model.feature_dataset_id # NOTE(review): test_lazar_bbrc_ham_minfreq uses OpenTox::Dataset.find here - confirm which class is intended
30
+ assert_equal dataset.compounds.size, feature_dataset.compounds.size
31
+ assert_equal 52, feature_dataset.features.size
32
+ assert_equal '[#17&A]-[#6&A]', feature_dataset.features.first.title
33
+ compound = OpenTox::Compound.from_inchi("InChI=1S/C10H9NO2S/c1-8-2-4-9(5-3-8)13-6-10(12)11-7-14/h2-5H,6H2,1H3")
34
+ prediction_dataset = model.predict compound # NOTE(review): treated as a dataset here, while the other tests index the result of predict directly
35
+ prediction = prediction_dataset.data_entries.first
36
+ assert_in_delta 0.025, prediction[:confidence], 0.001
37
+ #assert_equal 0.025885845574483608, prediction[:confidence]
38
+ # with compound change in training_dataset see:
39
+ # https://github.com/opentox/opentox-test/commit/0e78c9c59d087adbd4cc58bab60fb29cbe0c1da0
40
+ #assert_equal 0.02422364949075546, prediction[:confidence]
41
+ dataset.delete
42
+ model.delete
43
+ feature_dataset.delete
44
+ prediction_dataset.delete
45
+ end
46
+
47
+ def test_lazar_kazius # BBRC (:min_frequency => 100) on kazius.csv; prints stage timings and predicts one compound twice
48
+ t = Time.now # start of the wall-clock measurement printed after the upload
49
+ dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
50
+ p "Dataset upload: #{Time.now-t}"
51
+ t = Time.now
52
+ feature_dataset = Algorithm::Fminer.bbrc(dataset, :min_frequency => 100)
53
+ p "Feature mining: #{Time.now-t}"
54
+ t = Time.now # NOTE(review): this timestamp is never read afterwards
55
+ assert_equal feature_dataset.compounds.size, dataset.compounds.size
56
+ model = Model::Lazar.create dataset, feature_dataset
57
+ =begin
58
+ =end
59
+ #model = Model::Lazar.find('55bcf5bf7a7838381200017e')
60
+ #p model.id
61
+ #prediction_times = []
62
+ 2.times do # the same compound is predicted twice with identical expectations
63
+ compound = Compound.from_smiles("Clc1ccccc1NN")
64
+ prediction = model.predict compound
65
+ assert_equal "1", prediction[:value] # the predicted value is compared as the String "1"
66
+ assert_in_delta 0.019858401199860445, prediction[:confidence], 0.001
67
+ end
68
+ #dataset.delete
69
+ #feature_dataset.delete
70
+ end
71
+
72
+ end
@@ -0,0 +1,27 @@
1
+ require_relative "setup.rb"
2
+
3
+ class LazarPhyschemDescriptorTest < MiniTest::Test
4
+ def test_epafhm
5
+ # check available descriptors
6
+ @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
7
+ assert_equal 111,@descriptors.size,"wrong number of physchem descriptors"
8
+ @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES
9
+
10
+ # select descriptors for test
11
+ @num_features_offset = 0
12
+ @descriptors.keep_if{|x| x=~/^Openbabel\./}
13
+ @descriptors.delete("Openbabel.L5") # TODO Openbabel.L5 does not work, investigate!!!
14
+ puts "Descriptors: #{@descriptors}"
15
+
16
+ # UPLOAD DATA
17
+ training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
18
+ puts "Dataset: "+training_dataset.id
19
+ # feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors
20
+ model = Model::LazarRegression.create training_dataset#, feature_dataset
21
+ #p model
22
+ compound = Compound.from_smiles "CC(C)(C)CN"
23
+ prediction = model.predict compound
24
+ p prediction
25
+
26
+ end
27
+ end
data/test/setup.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'minitest/autorun' # Shared test bootstrap: loads Minitest's autorun entry point and the library under test
2
+ require_relative '../lib/lazar.rb'
3
+ include OpenTox # top-level include, so tests can write Dataset as well as OpenTox::Dataset
4
+ TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) # absolute path of the directory holding this file; ||= keeps a value that is already defined
5
+ DATA_DIR ||= File.join(TEST_DIR,"data") # fixture directory the tests read their CSV files from
6
+ #$mongo.database.drop
@@ -0,0 +1,41 @@
1
+ require_relative "setup.rb"
2
+
3
+ class ValidationTest < MiniTest::Test
4
+
5
+ def test_fminer_crossvalidation
6
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
7
+ model = Model::LazarFminerClassification.create dataset#, features
8
+ cv = ClassificationCrossValidation.create model
9
+ p cv.accuracy
10
+ p cv.weighted_accuracy
11
+ assert cv.accuracy > 0.8
12
+ assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) larger than unweighted accuracy(#{cv.accuracy}) "
13
+ end
14
+
15
+ def test_classification_crossvalidation
16
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
17
+ model = Model::LazarClassification.create dataset#, features
18
+ cv = ClassificationCrossValidation.create model
19
+ p cv.accuracy
20
+ p cv.weighted_accuracy
21
+ assert cv.accuracy > 0.7
22
+ assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy."
23
+ end
24
+
25
+ def test_regression_crossvalidation
26
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
27
+ #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
28
+ model = Model::LazarRegression.create dataset
29
+ cv = RegressionCrossValidation.create model
30
+ p cv.rmse
31
+ p cv.weighted_rmse
32
+ p cv.mae
33
+ p cv.weighted_mae
34
+ `inkview #{cv.plot}`
35
+ assert cv.rmse < 30, "RMSE > 30"
36
+ assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
37
+ assert cv.mae < 12
38
+ assert cv.weighted_mae < cv.mae
39
+ end
40
+
41
+ end
metadata ADDED
@@ -0,0 +1,212 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lazar
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
8
+ Denis Gebele
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-08-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rest-client
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: nokogiri
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rserve-client
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: mongoid
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: 5.0beta
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: 5.0beta
84
+ description: Libraries for lazy structure-activity relationships and read-across.
85
+ email:
86
+ - helma@in-silico.ch
87
+ executables: []
88
+ extensions:
89
+ - ext/lazar/extconf.rb
90
+ extra_rdoc_files: []
91
+ files:
92
+ - ".gitignore"
93
+ - ".yardopts"
94
+ - Gemfile
95
+ - LICENSE
96
+ - README.md
97
+ - Rakefile
98
+ - VERSION
99
+ - ext/lazar/extconf.rb
100
+ - java/CdkDescriptorInfo.class
101
+ - java/CdkDescriptorInfo.java
102
+ - java/CdkDescriptors.class
103
+ - java/CdkDescriptors.java
104
+ - java/Jmol.jar
105
+ - java/JoelibDescriptorInfo.class
106
+ - java/JoelibDescriptorInfo.java
107
+ - java/JoelibDescriptors.class
108
+ - java/JoelibDescriptors.java
109
+ - java/Rakefile
110
+ - java/cdk-1.4.19.jar
111
+ - java/joelib2.jar
112
+ - java/log4j.jar
113
+ - lazar.gemspec
114
+ - lib/SMARTS_InteLigand.txt
115
+ - lib/algorithm.rb
116
+ - lib/bbrc.rb
117
+ - lib/classification.rb
118
+ - lib/compound.rb
119
+ - lib/crossvalidation.rb
120
+ - lib/dataset.rb
121
+ - lib/descriptor.rb
122
+ - lib/error.rb
123
+ - lib/feature.rb
124
+ - lib/lazar-model.rb
125
+ - lib/lazar.rb
126
+ - lib/neighbor.rb
127
+ - lib/opentox.rb
128
+ - lib/overwrite.rb
129
+ - lib/regression.rb
130
+ - lib/rest-client-wrapper.rb
131
+ - lib/similarity.rb
132
+ - lib/unique_descriptors.rb
133
+ - lib/validation.rb
134
+ - mongoid.yml
135
+ - test/all.rb
136
+ - test/compound.rb
137
+ - test/data/CPDBAS_v5c_1547_29Apr2008part.sdf
138
+ - test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv
139
+ - test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv
140
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv
141
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv
142
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv
143
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv
144
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv
145
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv
146
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv
147
+ - test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv
148
+ - test/data/EPAFHM.csv
149
+ - test/data/EPAFHM.medi.csv
150
+ - test/data/EPAFHM.mini.csv
151
+ - test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv
152
+ - test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv
153
+ - test/data/ISSCAN-multi.csv
154
+ - test/data/LOAEL_log_mg_corrected_smiles.csv
155
+ - test/data/LOAEL_log_mmol_corrected_smiles.csv
156
+ - test/data/acetaldehyde.sdf
157
+ - test/data/boiling_points.ext.sdf
158
+ - test/data/cpdb_100.csv
159
+ - test/data/hamster_carcinogenicity.csv
160
+ - test/data/hamster_carcinogenicity.mini.bool_float.csv
161
+ - test/data/hamster_carcinogenicity.mini.bool_int.csv
162
+ - test/data/hamster_carcinogenicity.mini.bool_string.csv
163
+ - test/data/hamster_carcinogenicity.mini.csv
164
+ - test/data/hamster_carcinogenicity.ntriples
165
+ - test/data/hamster_carcinogenicity.sdf
166
+ - test/data/hamster_carcinogenicity.xls
167
+ - test/data/hamster_carcinogenicity.yaml
168
+ - test/data/hamster_carcinogenicity_with_errors.csv
169
+ - test/data/kazius.csv
170
+ - test/data/multi_cell_call.csv
171
+ - test/data/multi_cell_call_no_dup.csv
172
+ - test/data/multicolumn.csv
173
+ - test/data/rat_feature_dataset.csv
174
+ - test/data/wrong_dataset.csv
175
+ - test/dataset-long.rb
176
+ - test/dataset.rb
177
+ - test/descriptor-long.rb
178
+ - test/descriptor.rb
179
+ - test/error.rb
180
+ - test/feature.rb
181
+ - test/fminer-long.rb
182
+ - test/fminer.rb
183
+ - test/lazar-fminer.rb
184
+ - test/lazar-long.rb
185
+ - test/lazar-physchem-short.rb
186
+ - test/setup.rb
187
+ - test/validation.rb
188
+ homepage: http://github.com/opentox/lazar
189
+ licenses:
190
+ - GPL-3
191
+ metadata: {}
192
+ post_install_message:
193
+ rdoc_options: []
194
+ require_paths:
195
+ - lib
196
+ required_ruby_version: !ruby/object:Gem::Requirement
197
+ requirements:
198
+ - - ">="
199
+ - !ruby/object:Gem::Version
200
+ version: '0'
201
+ required_rubygems_version: !ruby/object:Gem::Requirement
202
+ requirements:
203
+ - - ">="
204
+ - !ruby/object:Gem::Version
205
+ version: '0'
206
+ requirements: []
207
+ rubyforge_project: lazar
208
+ rubygems_version: 2.2.2
209
+ signing_key:
210
+ specification_version: 4
211
+ summary: Lazar framework
212
+ test_files: []