pubchem 0.0.5 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc1a04a9f940becd4f4eff582d8105d6f3772eed
4
- data.tar.gz: 224b9440fe38fcfa39fe9b360a2f32a9e145b27f
3
+ metadata.gz: 8c55a845631951401782b0af20e268b9181e3ca8
4
+ data.tar.gz: d70a2f0fddefa25016b76d0442b6c4d8d07b8884
5
5
  SHA512:
6
- metadata.gz: 6f72420e95796c668a1154877ef5ad2455569e00fbf618ee2ebfc5256433fbcf0cb0471d67ecf2044bce075f0dbf9a59f769610026620936a74dc488fa8a0e22
7
- data.tar.gz: 297ba5d561ed323425c6c5804eceb035d17baf2c8865acf7b4bade28ccb544b32dd79a6e9f319c92acc5bdd45e1e0d1a075648779fcd1598104d36db34acc62a
6
+ metadata.gz: f9a1f1bbcb944abdace6ab61745620c329258072da9d29c6a2d2266d57fe64847dc45a7ec10882b604e8c4988191cb033253280aedf3262d9ff591f75ad1ea84
7
+ data.tar.gz: 925506e71420d361b5c776233676a37aea0aab3a7694f20f77ca7e991ec96dde527f8b0731a0a8c57e8a51d29b78dbf77b86813f97ba8634187a63a2a746ed38
@@ -0,0 +1,4 @@
1
+ pkg
2
+ xml
3
+ !xml/substance_sample.xml
4
+ !xml/compound_sample.xml
@@ -0,0 +1,52 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pubchem (0.1.1)
5
+ fuzzy-string-match (~> 0.9.7)
6
+ mechanize (~> 2.7.3)
7
+ nokogiri (~> 1.6.6.2)
8
+ ox (~> 2.2.1)
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ RubyInline (3.12.4)
14
+ ZenTest (~> 4.3)
15
+ ZenTest (4.11.0)
16
+ domain_name (0.5.24)
17
+ unf (>= 0.0.5, < 1.0.0)
18
+ fuzzy-string-match (0.9.7)
19
+ RubyInline (>= 3.8.6)
20
+ http-cookie (1.0.2)
21
+ domain_name (~> 0.5)
22
+ mechanize (2.7.3)
23
+ domain_name (~> 0.5, >= 0.5.1)
24
+ http-cookie (~> 1.0)
25
+ mime-types (~> 2.0)
26
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
27
+ net-http-persistent (~> 2.5, >= 2.5.2)
28
+ nokogiri (~> 1.4)
29
+ ntlm-http (~> 0.1, >= 0.1.1)
30
+ webrobots (>= 0.0.9, < 0.2)
31
+ mime-types (2.6.1)
32
+ mini_portile (0.6.2)
33
+ net-http-digest_auth (1.4)
34
+ net-http-persistent (2.9.4)
35
+ nokogiri (1.6.6.2)
36
+ mini_portile (~> 0.6.0)
37
+ ntlm-http (0.1.1)
38
+ ox (2.2.1)
39
+ unf (0.1.4)
40
+ unf_ext
41
+ unf_ext (0.0.7.1)
42
+ webrobots (0.1.1)
43
+
44
+ PLATFORMS
45
+ ruby
46
+
47
+ DEPENDENCIES
48
+ bundler (~> 1.10)
49
+ pubchem!
50
+
51
+ BUNDLED WITH
52
+ 1.10.3
@@ -2,22 +2,18 @@
2
2
 
3
3
  For getting all that juicy substance and compound data from Pubchem.
4
4
 
5
- ## Installation
5
+ Please email me if you end up using this: zachaysan@gmail.com
6
+
7
+ I'd be interested to hear if open sourcing this helped someone else.
6
8
 
7
- `apt-get install wget`
9
+ ## Installation
8
10
 
9
- Or
11
+ `apt-get install wget` or `sudo apt-get install wget`
10
12
 
11
- `sudo apt-get install wget`
13
+ then
12
14
 
13
15
  `gem install pubchem`
14
16
 
15
17
  ## Usage
16
18
 
17
- ```ruby
18
- pubchem = Pubchem.new
19
-
20
- pubchem.get_ids([16,405], "~/yay.zip")
21
-
22
- puts "Do a happy dance!"
23
- ```
19
+ See `example.rb` for how to use Pubchem.
data/example.rb CHANGED
@@ -1,7 +1,28 @@
1
+ require 'pp'
1
2
  require_relative "lib/pubchem"
2
3
 
4
+ reader = Reader.new
5
+ reader.read('xml/compound_sample.xml')
6
+ reader.read('xml/substance_sample.xml')
7
+ reader.save("xml/names.xml",
8
+ "xml/pubchem_substance_ids.xml",
9
+ "xml/pubchem_compound_ids.xml")
10
+
11
+ # The first two terms match, the third one replaces a "1H"
12
+ # with a "2H", resulting in a non-match.
13
+
14
+ terms = [ "COC1=C(C=C2CC3=CC(=C(C=C3CC4=CC(=C(C=C4CC2=C1)OC(=O)C5=CC=NC=C5)OC)OC(=O)C6=CC=NC=C6)OC)OC(=O)C7=CC=NC=E9",
15
+ "4-methoxy-1H-indole-3-carbaldehyde",
16
+ "4-methoxy-2H-indole-3-carbaldehyde",
17
+ "2-amino-4,5-dimethyl-1H-pyrrole-3-carbonitrile" ]
18
+
19
+ pp reader.match_list_of_names terms
20
+ pp reader.retrieve_compound_ids
21
+ pp reader.pubchem_substance_ids
3
22
  pubchem = Pubchem.new
4
23
 
5
- pubchem.get_ids([16,405], "~/yay.zip")
24
+ ids = reader.retrieve_substance_ids.map {|k,v| v}
25
+
26
+ pubchem.get_substance_ids(ids, "yay.zip")
6
27
 
7
28
  puts "Do a happy dance!"
@@ -1,4 +1,5 @@
1
1
  require 'mechanize'
2
+ require_relative 'pubchem/reader'
2
3
 
3
4
  class Pubchem
4
5
 
@@ -13,9 +14,26 @@ class Pubchem
13
14
 
14
15
  end
15
16
 
17
# Convenience wrapper around #get_ids that fixes the database to :compound.
#
# ids           - collection of PubChem compound ids, passed through to #get_ids.
# filename      - destination filename, passed through to #get_ids.
# retrieve_mode - forwarded to #get_ids (defaults to :image). Previously this
#                 argument was accepted but silently dropped.
# delay         - forwarded to #get_ids (defaults to nil).
#
# Returns whatever #get_ids returns.
def get_compound_ids(ids,
                     filename,
                     retrieve_mode: :image,
                     delay: nil)

  get_ids(ids,
          filename,
          :compound,
          retrieve_mode: retrieve_mode,
          delay: delay)
end
24
+
25
# Convenience wrapper around #get_ids that fixes the database to :substance.
#
# ids           - collection of PubChem substance ids, passed through to #get_ids.
# filename      - destination filename, passed through to #get_ids.
# retrieve_mode - forwarded to #get_ids (defaults to :image). Previously this
#                 argument was accepted but silently dropped.
# delay         - forwarded to #get_ids (defaults to nil).
#
# Returns whatever #get_ids returns.
def get_substance_ids(ids,
                      filename,
                      retrieve_mode: :image,
                      delay: nil)

  get_ids(ids,
          filename,
          :substance,
          retrieve_mode: retrieve_mode,
          delay: delay)
end
33
+
16
34
  def get_ids(ids,
17
35
  filename,
18
- db: :compound,
36
+ db,
19
37
  retrieve_mode: :image,
20
38
  delay: nil)
21
39
 
@@ -46,9 +64,9 @@ class Pubchem
46
64
 
47
65
  ftp_url = ftp_link.to_s
48
66
  size = ftp_url.size
49
-
67
+
50
68
  # We don't want to allow scary characters into our URL since it is a
51
- # security risk, so we only allow lower and upper case letters, numbers,
69
+ # security risk, so we only allow lower and upper case letters, numbers,
52
70
  # / forward slashes
53
71
  # : colons
54
72
  # . periods
@@ -0,0 +1,223 @@
1
+ require 'set'
2
+ require 'nokogiri'
3
+ require 'fuzzystringmatch'
4
+ require 'ox'
5
+
6
# Indexes chemical names found in PubChem compound/substance XML exports and
# resolves user-supplied names to PubChem ids, exactly or via fuzzy matching.
#
# State kept by an instance:
# * +names+                 - short code (see #short_code) => Set of names
# * +pubchem_substance_ids+ - name => Set of substance ids
# * +pubchem_compound_ids+  - name => Set of compound ids
class Reader

  # Types accepted by #read.
  SUPPORTED_TYPES = [:compound, :substance].freeze

  # URN labels whose string value is indexed as a name for a compound.
  NAME_LABELS = ["SMILES", "IUPAC Name"].freeze

  # CSS selectors (whitespace is the descendant combinator).
  COMPOUND_SELECTOR = "PC-Compounds PC-Compound".freeze
  SUBSTANCE_SELECTOR = "PC-Substances PC-Substance".freeze
  COMPOUND_ID_SELECTOR = "PC-Compound_id PC-CompoundType PC-CompoundType_id PC-CompoundType_id_cid".freeze
  SUBSTANCE_ID_SELECTOR = "PC-Substance_sid PC-ID PC-ID_id".freeze
  SUBSTANCE_SYNONYM_SELECTOR = "PC-Substance_synonyms PC-Substance_synonyms_E".freeze
  URN_LABEL_SELECTOR = "PC-InfoData_urn PC-Urn PC-Urn_label".freeze
  INFO_VALUE_SELECTOR = "PC-InfoData_value PC-InfoData_value_sval".freeze

  attr_accessor :names,
                :pubchem_substance_ids,
                :pubchem_compound_ids

  # Creates a reader. When all three filenames are given the indexes are
  # loaded from those files (see #save); when none are given the reader
  # starts with empty indexes.
  #
  # Raises RuntimeError when only some of the filenames are given.
  def initialize(names_filename=nil,
                 pubchem_substance_ids_filename=nil,
                 pubchem_compound_ids_filename=nil)

    @fuzzy_matcher = FuzzyStringMatch::JaroWinkler.create(:native)

    return if initialize_from_files(names_filename,
                                    pubchem_substance_ids_filename,
                                    pubchem_compound_ids_filename)

    # Block-style defaults so every key gets its own Set.
    @names = Hash.new { |h, k| h[k] = Set.new }
    @pubchem_substance_ids = Hash.new { |h, k| h[k] = Set.new }
    @pubchem_compound_ids = Hash.new { |h, k| h[k] = Set.new }
  end

  # Loads the three indexes from files previously written by #save.
  #
  # Returns nil when no filename was given (nothing loaded), true after a
  # successful load. Raises RuntimeError when only some filenames are given.
  #
  # NOTE(review): Ox.load_file is called without an explicit mode; confirm
  # that it restores the Hash/Set structures that #save wrote.
  def initialize_from_files(names_filename,
                            pubchem_substance_ids_filename,
                            pubchem_compound_ids_filename)

    filenames = [names_filename,
                 pubchem_substance_ids_filename,
                 pubchem_compound_ids_filename]

    return nil unless filenames.any?
    raise "All three filenames are required" unless filenames.all?

    @names = Ox.load_file(names_filename)
    @pubchem_substance_ids = Ox.load_file(pubchem_substance_ids_filename)
    @pubchem_compound_ids = Ox.load_file(pubchem_compound_ids_filename)

    true
  end

  # Writes the three indexes to the given files using Ox.
  def save(names_filename,
           pubchem_substance_ids_filename,
           pubchem_compound_ids_filename)

    Ox.to_file(names_filename, @names, indent: 0)
    Ox.to_file(pubchem_substance_ids_filename, @pubchem_substance_ids, indent: 0)
    Ox.to_file(pubchem_compound_ids_filename, @pubchem_compound_ids, indent: 0)
  end

  # Parses a PubChem XML file and adds every name found to the indexes.
  #
  # xml_filepath - path of the XML file.
  # type         - :compound or :substance. When nil the type is inferred
  #                from the file's basename, which must then start with
  #                "compound" or "substance" (case-insensitive).
  #
  # Raises RuntimeError ("Cannot infer pubchem type") when no type is given
  # and none can be inferred, or ("Unknown type") for an unsupported type.
  def read(xml_filepath, type: nil)
    # Only infer when the caller did not choose; an explicit type always wins.
    type ||= infer_type(xml_filepath)
    raise "Unknown type" unless SUPPORTED_TYPES.include?(type)

    # Block form closes the file even when parsing raises.
    doc = File.open(xml_filepath) { |file| Nokogiri::XML(file) }

    # #add_name consults this to pick the id index to update.
    @current_type = type.to_s

    case type
    when :compound
      doc.css(COMPOUND_SELECTOR).each { |compound| parse_compound(compound) }
    when :substance
      doc.css(SUBSTANCE_SELECTOR).each { |substance| parse_substance(substance) }
    end
  end

  # Records the compound's id as the current id, then indexes the names
  # found in its properties.
  def parse_compound(compound)
    @pubchem_id = compound.css(COMPOUND_ID_SELECTOR).text.to_i

    compound.css("PC-Compound_props").each do |property|
      parse_property(property)
    end
  end

  # Records the substance's id as the current id, then indexes each of its
  # synonyms as a name.
  def parse_substance(substance)
    @pubchem_id = substance.css(SUBSTANCE_ID_SELECTOR).text.to_i

    substance.css(SUBSTANCE_SYNONYM_SELECTOR).each do |substance_synonym|
      add_name(substance_synonym.text)
    end
  end

  # Visits every PC-InfoData node below the given property node.
  def parse_property(property)
    property.css("PC-InfoData").each do |info_data|
      parse_info_data(info_data)
    end
  end

  # Indexes the string value of an info-data node when its URN label is one
  # of NAME_LABELS; nodes with another label, or lacking a label or string
  # value, are ignored.
  def parse_info_data(info_data)
    label_node = info_data.css(URN_LABEL_SELECTOR).first
    return if label_node.nil?
    return unless NAME_LABELS.include?(label_node.text)

    value_node = info_data.css(INFO_VALUE_SELECTOR).first
    add_name(value_node.text) unless value_node.nil?
  end

  # Adds +name+ to the name index and associates it with the current
  # PubChem id in the index matching the type currently being read.
  # Nil and empty names are ignored.
  #
  # Raises RuntimeError ("Unknown substance") when no known type is being read.
  def add_name(name)
    return if name.nil? || name.empty?

    id_index = case @current_type
               when "substance" then @pubchem_substance_ids
               when "compound" then @pubchem_compound_ids
               else raise "Unknown substance"
               end

    # Names are bucketed by short code so lookups scan one bucket only.
    @names[short_code(name)].add name
    id_index[name].add @pubchem_id
  end

  # Finds the indexed name that best matches +lookup_name+.
  #
  # Only names sharing the lookup's short code are considered. An exact
  # match is returned immediately; otherwise the candidate with the highest
  # matcher score is returned, provided that score is greater than
  # +threshold+.
  #
  # Returns the matched name (a String) or nil when nothing qualifies.
  def fuzzy_name_lookup(lookup_name, threshold)
    return nil if lookup_name.nil? || lookup_name.empty?

    code = short_code(lookup_name)
    # key? avoids creating an empty bucket as a side effect of looking up.
    return nil unless @names.key?(code)

    candidates = @names[code]

    # Optimistically check for exact name match
    return lookup_name if candidates.include?(lookup_name)
    # The score comparison below is strict, so a threshold of 1.0 or more
    # can only be met by the exact match handled above.
    return nil if threshold >= 1.0

    closest_name = nil
    closest_score = 0.0

    candidates.each do |name|
      score = @fuzzy_matcher.getDistance(lookup_name, name)

      if score > closest_score
        closest_name = name
        closest_score = score
      end
    end

    closest_name if closest_score > threshold
  end

  # Matches every given name against the index and remembers the result for
  # #retrieve_ids.
  #
  # Returns a Hash of input name => matched name (nil when unmatched).
  def match_list_of_names(names, threshold=0.99)
    @matched_names = names.each_with_object({}) do |name, matches|
      matches[name] = fuzzy_name_lookup(name, threshold)
    end
  end

  # Maps each matched input name to one id from +collection+ (a name => ids
  # index). Input names without a match, or whose match has no id in
  # +collection+, are omitted. When several ids exist a warning is printed
  # and the first one is used.
  #
  # Raises RuntimeError when #match_list_of_names has not been called yet.
  def retrieve_ids(collection)
    msg = "@matched_names required, see #{self.class}#match_list_of_names"

    raise msg unless @matched_names

    @matched_names.each_with_object({}) do |(input_name, matched_name), found|
      # key? keeps the lookup from adding empty entries to the index.
      next unless matched_name && collection.key?(matched_name)

      ids = collection[matched_name]
      puts "WARNING: Multiple matching sets" if ids.size > 1

      collection_id = ids.first
      found[input_name] = collection_id if collection_id
    end
  end

  # Input name => substance id for the names matched last.
  def retrieve_substance_ids
    retrieve_ids(@pubchem_substance_ids)
  end

  # Input name => compound id for the names matched last.
  def retrieve_compound_ids
    retrieve_ids(@pubchem_compound_ids)
  end

  # Bucket key for a name: its first three characters, lowercased.
  def short_code(name)
    name[0..2].downcase
  end

  private

  # Derives the PubChem type from the basename of the given path.
  def infer_type(xml_filepath)
    basename = File.basename(xml_filepath).downcase

    return :compound if basename.start_with?("compound")
    return :substance if basename.start_with?("substance")

    raise "Cannot infer pubchem type"
  end

end
@@ -1,3 +1,3 @@
1
1
  module Pubchem
2
- VERSION = "0.0.5"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -22,6 +22,9 @@ Gem::Specification.new do |spec|
22
22
  spec.require_paths = ["lib"]
23
23
 
24
24
  spec.add_runtime_dependency "mechanize", "~> 2.7.3"
25
+ spec.add_runtime_dependency "nokogiri", "~> 1.6.6.2"
26
+ spec.add_runtime_dependency "fuzzy-string-match", "~> 0.9.7"
27
+ spec.add_runtime_dependency "ox", "~> 2.2.1"
25
28
 
26
29
  spec.add_development_dependency "bundler", "~> 1.10"
27
30
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pubchem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Zach Aysan
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-17 00:00:00.000000000 Z
11
+ date: 2015-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -24,6 +24,48 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: 2.7.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.6.6.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.6.6.2
41
+ - !ruby/object:Gem::Dependency
42
+ name: fuzzy-string-match
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.9.7
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.9.7
55
+ - !ruby/object:Gem::Dependency
56
+ name: ox
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 2.2.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 2.2.1
27
69
  - !ruby/object:Gem::Dependency
28
70
  name: bundler
29
71
  requirement: !ruby/object:Gem::Requirement
@@ -44,22 +86,25 @@ description: |2-
44
86
  their form. This helps with that!
45
87
  email:
46
88
  - zachaysan@gmail.com
47
- executables:
48
- - ".gitkeep"
89
+ executables: []
49
90
  extensions: []
50
91
  extra_rdoc_files: []
51
92
  files:
93
+ - ".gitignore"
52
94
  - Gemfile
95
+ - Gemfile.lock
53
96
  - README.markdown
54
97
  - Rakefile
55
98
  - bin/console
56
99
  - bin/setup
57
100
  - example.rb
58
- - exe/.gitkeep
59
101
  - lib/pubchem.rb
102
+ - lib/pubchem/reader.rb
60
103
  - lib/pubchem/version.rb
61
104
  - pubchem.gemspec
62
105
  - run
106
+ - xml/compound_sample.xml
107
+ - xml/substance_sample.xml
63
108
  homepage: https://github.com/zachaysan/pubchem
64
109
  licenses:
65
110
  - MIT
File without changes