pubchem 0.0.5 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc1a04a9f940becd4f4eff582d8105d6f3772eed
4
- data.tar.gz: 224b9440fe38fcfa39fe9b360a2f32a9e145b27f
3
+ metadata.gz: 8c55a845631951401782b0af20e268b9181e3ca8
4
+ data.tar.gz: d70a2f0fddefa25016b76d0442b6c4d8d07b8884
5
5
  SHA512:
6
- metadata.gz: 6f72420e95796c668a1154877ef5ad2455569e00fbf618ee2ebfc5256433fbcf0cb0471d67ecf2044bce075f0dbf9a59f769610026620936a74dc488fa8a0e22
7
- data.tar.gz: 297ba5d561ed323425c6c5804eceb035d17baf2c8865acf7b4bade28ccb544b32dd79a6e9f319c92acc5bdd45e1e0d1a075648779fcd1598104d36db34acc62a
6
+ metadata.gz: f9a1f1bbcb944abdace6ab61745620c329258072da9d29c6a2d2266d57fe64847dc45a7ec10882b604e8c4988191cb033253280aedf3262d9ff591f75ad1ea84
7
+ data.tar.gz: 925506e71420d361b5c776233676a37aea0aab3a7694f20f77ca7e991ec96dde527f8b0731a0a8c57e8a51d29b78dbf77b86813f97ba8634187a63a2a746ed38
@@ -0,0 +1,4 @@
1
+ pkg
2
+ xml
3
+ !xml/substance_sample.xml
4
+ !xml/compound_sample.xml
@@ -0,0 +1,52 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pubchem (0.1.1)
5
+ fuzzy-string-match (~> 0.9.7)
6
+ mechanize (~> 2.7.3)
7
+ nokogiri (~> 1.6.6.2)
8
+ ox (~> 2.2.1)
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ RubyInline (3.12.4)
14
+ ZenTest (~> 4.3)
15
+ ZenTest (4.11.0)
16
+ domain_name (0.5.24)
17
+ unf (>= 0.0.5, < 1.0.0)
18
+ fuzzy-string-match (0.9.7)
19
+ RubyInline (>= 3.8.6)
20
+ http-cookie (1.0.2)
21
+ domain_name (~> 0.5)
22
+ mechanize (2.7.3)
23
+ domain_name (~> 0.5, >= 0.5.1)
24
+ http-cookie (~> 1.0)
25
+ mime-types (~> 2.0)
26
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
27
+ net-http-persistent (~> 2.5, >= 2.5.2)
28
+ nokogiri (~> 1.4)
29
+ ntlm-http (~> 0.1, >= 0.1.1)
30
+ webrobots (>= 0.0.9, < 0.2)
31
+ mime-types (2.6.1)
32
+ mini_portile (0.6.2)
33
+ net-http-digest_auth (1.4)
34
+ net-http-persistent (2.9.4)
35
+ nokogiri (1.6.6.2)
36
+ mini_portile (~> 0.6.0)
37
+ ntlm-http (0.1.1)
38
+ ox (2.2.1)
39
+ unf (0.1.4)
40
+ unf_ext
41
+ unf_ext (0.0.7.1)
42
+ webrobots (0.1.1)
43
+
44
+ PLATFORMS
45
+ ruby
46
+
47
+ DEPENDENCIES
48
+ bundler (~> 1.10)
49
+ pubchem!
50
+
51
+ BUNDLED WITH
52
+ 1.10.3
@@ -2,22 +2,18 @@
2
2
 
3
3
  For getting all that juicy substance and compound data from Pubchem.
4
4
 
5
- ## Installation
5
+ Please email me if you end up using this: zachaysan@gmail.com
6
+
7
+ I'd be interested to hear if open sourcing this helped someone else.
6
8
 
7
- `apt-get install wget`
9
+ ## Installation
8
10
 
9
- Or
11
+ `apt-get install wget` or `sudo apt-get install wget`
10
12
 
11
- `sudo apt-get install wget`
13
+ then
12
14
 
13
15
  `gem install pubchem`
14
16
 
15
17
  ## Usage
16
18
 
17
- ```ruby
18
- pubchem = Pubchem.new
19
-
20
- pubchem.get_ids([16,405], "~/yay.zip")
21
-
22
- puts "Do a happy dance!"
23
- ```
19
+ See `example.rb` for how to use Pubchem.
data/example.rb CHANGED
@@ -1,7 +1,28 @@
1
+ require 'pp'
1
2
  require_relative "lib/pubchem"
2
3
 
4
+ reader = Reader.new
5
+ reader.read('xml/compound_sample.xml')
6
+ reader.read('xml/substance_sample.xml')
7
+ reader.save("xml/names.xml",
8
+ "xml/pubchem_substance_ids.xml",
9
+ "xml/pubchem_compound_ids.xml")
10
+
11
+ # The first two terms match; the third one replaces a "1H"
12
+ # with a "2H", resulting in a non-match.
13
+
14
+ terms = [ "COC1=C(C=C2CC3=CC(=C(C=C3CC4=CC(=C(C=C4CC2=C1)OC(=O)C5=CC=NC=C5)OC)OC(=O)C6=CC=NC=C6)OC)OC(=O)C7=CC=NC=E9",
15
+ "4-methoxy-1H-indole-3-carbaldehyde",
16
+ "4-methoxy-2H-indole-3-carbaldehyde",
17
+ "2-amino-4,5-dimethyl-1H-pyrrole-3-carbonitrile" ]
18
+
19
+ pp reader.match_list_of_names terms
20
+ pp reader.retrieve_compound_ids
21
+ pp reader.pubchem_substance_ids
3
22
  pubchem = Pubchem.new
4
23
 
5
- pubchem.get_ids([16,405], "~/yay.zip")
24
+ ids = reader.retrieve_substance_ids.map {|k,v| v}
25
+
26
+ pubchem.get_substance_ids(ids, "yay.zip")
6
27
 
7
28
  puts "Do a happy dance!"
@@ -1,4 +1,5 @@
1
1
  require 'mechanize'
2
+ require_relative 'pubchem/reader'
2
3
 
3
4
  class Pubchem
4
5
 
@@ -13,9 +14,26 @@ class Pubchem
13
14
 
14
15
  end
15
16
 
17
+ def get_compound_ids(ids,
18
+ filename,
19
+ retrieve_mode: :image,
20
+ delay: nil)
21
+
22
+ self.get_ids(ids, filename, :compound, delay: delay)
23
+ end
24
+
25
+ def get_substance_ids(ids,
26
+ filename,
27
+ retrieve_mode: :image,
28
+ delay: nil)
29
+
30
+ self.get_ids(ids, filename, :substance, delay: delay)
31
+
32
+ end
33
+
16
34
  def get_ids(ids,
17
35
  filename,
18
- db: :compound,
36
+ db,
19
37
  retrieve_mode: :image,
20
38
  delay: nil)
21
39
 
@@ -46,9 +64,9 @@ class Pubchem
46
64
 
47
65
  ftp_url = ftp_link.to_s
48
66
  size = ftp_url.size
49
-
67
+
50
68
  # We don't want to allow scary characters into our URL since it is a
51
- # security risk, so we only allow lower and upper case letters, numbers,
69
+ # security risk, so we only allow lower and upper case letters, numbers,
52
70
  # / forward slashes
53
71
  # : colons
54
72
  # . periods
@@ -0,0 +1,223 @@
1
+ require 'set'
2
+ require 'nokogiri'
3
+ require 'fuzzystringmatch'
4
+ require 'ox'
5
+
6
+ class Reader
7
+
8
+ attr_accessor :names,
9
+ :pubchem_substance_ids,
10
+ :pubchem_compound_ids
11
+
12
+ def initialize(names_filename=nil,
13
+ pubchem_substance_ids_filename=nil,
14
+ pubchem_compound_ids_filename=nil)
15
+
16
+ @fuzzy_matcher = FuzzyStringMatch::JaroWinkler
17
+ .create( :native )
18
+
19
+ return if initialize_from_files( names_filename,
20
+ pubchem_substance_ids_filename,
21
+ pubchem_compound_ids_filename )
22
+
23
+ @names = Hash.new { |h,k| h[k] = Set.new }
24
+
25
+ @pubchem_substance_ids = Hash.new { |h,k| h[k] = Set.new }
26
+ @pubchem_compound_ids = Hash.new { |h,k| h[k] = Set.new }
27
+
28
+ end
29
+
30
+ def initialize_from_files(names_filename,
31
+ pubchem_substance_ids_filename,
32
+ pubchem_compound_ids_filename)
33
+
34
+ filenames = [ names_filename,
35
+ pubchem_substance_ids_filename,
36
+ pubchem_compound_ids_filename ]
37
+
38
+ return nil unless filenames.any?
39
+ raise "Both filenames required" unless filenames.all?
40
+
41
+ @names = Ox.load_file(names_filename)
42
+ @pubchem_substance_ids = Ox.load_file(pubchem_substance_ids_filename)
43
+ @pubchem_compound_ids = Ox.load_file(pubchem_compound_ids_filename)
44
+
45
+ end
46
+
47
+ def save(names_filename,
48
+ pubchem_substance_ids_filename,
49
+ pubchem_compound_ids_filename)
50
+
51
+ Ox.to_file(names_filename, @names, indent: 0)
52
+ Ox.to_file(pubchem_substance_ids_filename, @pubchem_substance_ids, indent: 0)
53
+ Ox.to_file(pubchem_compound_ids_filename, @pubchem_compound_ids, indent: 0)
54
+
55
+ end
56
+
57
+ def read(xml_filepath, type: nil)
58
+
59
+ filepath = File.basename(xml_filepath)
60
+ if type.nil? and filepath.downcase.start_with? "compound"
61
+ type = :compound
62
+ elsif type.nil? and filepath.downcase.start_with? "substance"
63
+ type = :substance
64
+ else
65
+ raise "Cannot infer pubchem type"
66
+ end
67
+
68
+ f = File.open(xml_filepath)
69
+ doc = Nokogiri::XML(f)
70
+ f.close
71
+ @current_type = type.to_s
72
+ case type
73
+ when :compound
74
+ doc.css("PC-Compounds PC-Compound").each do |compound|
75
+ self.parse_compound(compound)
76
+ end
77
+ when :substance
78
+ doc.css("PC-Substances PC-Substance").each do |substance|
79
+ self.parse_substance(substance)
80
+ end
81
+ else
82
+ raise "Unknown type"
83
+ end
84
+
85
+ end
86
+
87
+ def parse_compound(compound)
88
+
89
+ @pubchem_id = compound.css("PC-Compound_id
90
+ PC-CompoundType
91
+ PC-CompoundType_id
92
+ PC-CompoundType_id_cid").text.to_i
93
+
94
+ compound.css("PC-Compound_props").each do |property|
95
+ self.parse_property(property)
96
+ end
97
+
98
+ end
99
+
100
+ def parse_substance(substance)
101
+
102
+
103
+ @pubchem_id = substance.css("PC-Substance_sid
104
+ PC-ID
105
+ PC-ID_id").text.to_i
106
+
107
+ substance.css("PC-Substance_synonyms
108
+ PC-Substance_synonyms_E").each do |substance_synonym|
109
+ self.add_name(substance_synonym.text)
110
+ end
111
+
112
+ end
113
+
114
+ def parse_property(property)
115
+
116
+ property.css("PC-InfoData").each do |info_data|
117
+ parse_info_data(info_data)
118
+ end
119
+
120
+ end
121
+
122
+ def parse_info_data(info_data)
123
+
124
+ urn_label = info_data.css("PC-InfoData_urn
125
+ PC-Urn
126
+ PC-Urn_label").first.text
127
+ name = nil
128
+ case urn_label
129
+ when "SMILES"
130
+ name = info_data.css("PC-InfoData_value
131
+ PC-InfoData_value_sval").first.text
132
+ when"IUPAC Name"
133
+ name = info_data.css("PC-InfoData_value
134
+ PC-InfoData_value_sval").first.text
135
+ end
136
+
137
+ self.add_name(name)
138
+ end
139
+
140
+ def add_name(name)
141
+ return if name.nil? || name.empty?
142
+
143
+ # Speed up lookups by grouping names under their short code (see short_code)
144
+ @names[self.short_code(name)].add name
145
+
146
+ if @current_type == "substance"
147
+ @pubchem_substance_ids[name].add @pubchem_id
148
+ elsif @current_type == "compound"
149
+ @pubchem_compound_ids[name].add @pubchem_id
150
+ else
151
+ raise "Unknown substance"
152
+ end
153
+
154
+ end
155
+
156
+ def fuzzy_name_lookup(lookup_name, threshold)
157
+
158
+ closest_distance = 0.0
159
+ closest_name = nil
160
+
161
+ # Optimistically check for exact name match
162
+ exact_match = self.short_code(lookup_name).include? lookup_name
163
+
164
+ return @pubchem_ids[lookup_name] if exact_match
165
+ return nil if threshold == 1.0
166
+
167
+ @names[self.short_code(lookup_name)].each do |name|
168
+
169
+ distance = @fuzzy_matcher.getDistance(lookup_name, name)
170
+
171
+ if distance > closest_distance
172
+ closest_name = name
173
+ closest_distance = distance
174
+ end
175
+
176
+ end
177
+
178
+ return closest_name if closest_distance > 0.99
179
+
180
+ end
181
+
182
+ def match_list_of_names(names, threshold=0.99)
183
+ @matched_names = names.inject({}) do |acc, name|
184
+ acc[name] = self.fuzzy_name_lookup(name, threshold)
185
+ acc
186
+ end
187
+ end
188
+
189
+ def retrieve_ids(collection)
190
+ msg = "@matched_names required, see #{self.class}#match_list_of_names"
191
+
192
+ raise msg unless @matched_names
193
+
194
+ @matched_names.inject({}) do |acc, name|
195
+ input_name = name[0]
196
+ matched_name = name[1]
197
+
198
+ if matched_name
199
+ ids = collection[matched_name]
200
+ if ids.size > 1
201
+ puts "WARNING: Multiple matching sets"
202
+ end
203
+ collection_id = collection[matched_name].first
204
+ acc[input_name] = collection_id if collection_id
205
+ end
206
+
207
+ acc
208
+ end
209
+ end
210
+
211
+ def retrieve_substance_ids
212
+ self.retrieve_ids(@pubchem_substance_ids)
213
+ end
214
+
215
+ def retrieve_compound_ids
216
+ self.retrieve_ids(@pubchem_compound_ids)
217
+ end
218
+
219
+ def short_code(name)
220
+ name[0..2].downcase
221
+ end
222
+
223
+ end
@@ -1,3 +1,3 @@
1
1
  module Pubchem
2
- VERSION = "0.0.5"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -22,6 +22,9 @@ Gem::Specification.new do |spec|
22
22
  spec.require_paths = ["lib"]
23
23
 
24
24
  spec.add_runtime_dependency "mechanize", "~> 2.7.3"
25
+ spec.add_runtime_dependency "nokogiri", "~> 1.6.6.2"
26
+ spec.add_runtime_dependency "fuzzy-string-match", "~> 0.9.7"
27
+ spec.add_runtime_dependency "ox", "~> 2.2.1"
25
28
 
26
29
  spec.add_development_dependency "bundler", "~> 1.10"
27
30
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pubchem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Zach Aysan
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-17 00:00:00.000000000 Z
11
+ date: 2015-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -24,6 +24,48 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: 2.7.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.6.6.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.6.6.2
41
+ - !ruby/object:Gem::Dependency
42
+ name: fuzzy-string-match
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.9.7
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.9.7
55
+ - !ruby/object:Gem::Dependency
56
+ name: ox
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 2.2.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 2.2.1
27
69
  - !ruby/object:Gem::Dependency
28
70
  name: bundler
29
71
  requirement: !ruby/object:Gem::Requirement
@@ -44,22 +86,25 @@ description: |2-
44
86
  their form. This helps with that!
45
87
  email:
46
88
  - zachaysan@gmail.com
47
- executables:
48
- - ".gitkeep"
89
+ executables: []
49
90
  extensions: []
50
91
  extra_rdoc_files: []
51
92
  files:
93
+ - ".gitignore"
52
94
  - Gemfile
95
+ - Gemfile.lock
53
96
  - README.markdown
54
97
  - Rakefile
55
98
  - bin/console
56
99
  - bin/setup
57
100
  - example.rb
58
- - exe/.gitkeep
59
101
  - lib/pubchem.rb
102
+ - lib/pubchem/reader.rb
60
103
  - lib/pubchem/version.rb
61
104
  - pubchem.gemspec
62
105
  - run
106
+ - xml/compound_sample.xml
107
+ - xml/substance_sample.xml
63
108
  homepage: https://github.com/zachaysan/pubchem
64
109
  licenses:
65
110
  - MIT
File without changes