pubchem 0.0.5 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/Gemfile.lock +52 -0
- data/README.markdown +7 -11
- data/example.rb +22 -1
- data/lib/pubchem.rb +21 -3
- data/lib/pubchem/reader.rb +223 -0
- data/lib/pubchem/version.rb +1 -1
- data/pubchem.gemspec +3 -0
- metadata +50 -5
- data/exe/.gitkeep +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c55a845631951401782b0af20e268b9181e3ca8
|
4
|
+
data.tar.gz: d70a2f0fddefa25016b76d0442b6c4d8d07b8884
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f9a1f1bbcb944abdace6ab61745620c329258072da9d29c6a2d2266d57fe64847dc45a7ec10882b604e8c4988191cb033253280aedf3262d9ff591f75ad1ea84
|
7
|
+
data.tar.gz: 925506e71420d361b5c776233676a37aea0aab3a7694f20f77ca7e991ec96dde527f8b0731a0a8c57e8a51d29b78dbf77b86813f97ba8634187a63a2a746ed38
|
data/.gitignore
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
pubchem (0.1.1)
|
5
|
+
fuzzy-string-match (~> 0.9.7)
|
6
|
+
mechanize (~> 2.7.3)
|
7
|
+
nokogiri (~> 1.6.6.2)
|
8
|
+
ox (~> 2.2.1)
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: https://rubygems.org/
|
12
|
+
specs:
|
13
|
+
RubyInline (3.12.4)
|
14
|
+
ZenTest (~> 4.3)
|
15
|
+
ZenTest (4.11.0)
|
16
|
+
domain_name (0.5.24)
|
17
|
+
unf (>= 0.0.5, < 1.0.0)
|
18
|
+
fuzzy-string-match (0.9.7)
|
19
|
+
RubyInline (>= 3.8.6)
|
20
|
+
http-cookie (1.0.2)
|
21
|
+
domain_name (~> 0.5)
|
22
|
+
mechanize (2.7.3)
|
23
|
+
domain_name (~> 0.5, >= 0.5.1)
|
24
|
+
http-cookie (~> 1.0)
|
25
|
+
mime-types (~> 2.0)
|
26
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
27
|
+
net-http-persistent (~> 2.5, >= 2.5.2)
|
28
|
+
nokogiri (~> 1.4)
|
29
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
30
|
+
webrobots (>= 0.0.9, < 0.2)
|
31
|
+
mime-types (2.6.1)
|
32
|
+
mini_portile (0.6.2)
|
33
|
+
net-http-digest_auth (1.4)
|
34
|
+
net-http-persistent (2.9.4)
|
35
|
+
nokogiri (1.6.6.2)
|
36
|
+
mini_portile (~> 0.6.0)
|
37
|
+
ntlm-http (0.1.1)
|
38
|
+
ox (2.2.1)
|
39
|
+
unf (0.1.4)
|
40
|
+
unf_ext
|
41
|
+
unf_ext (0.0.7.1)
|
42
|
+
webrobots (0.1.1)
|
43
|
+
|
44
|
+
PLATFORMS
|
45
|
+
ruby
|
46
|
+
|
47
|
+
DEPENDENCIES
|
48
|
+
bundler (~> 1.10)
|
49
|
+
pubchem!
|
50
|
+
|
51
|
+
BUNDLED WITH
|
52
|
+
1.10.3
|
data/README.markdown
CHANGED
@@ -2,22 +2,18 @@
|
|
2
2
|
|
3
3
|
For getting all that juicy substance and compound data from Pubchem.
|
4
4
|
|
5
|
-
|
5
|
+
Please email me if you end up using this: zachaysan@gmail.com
|
6
|
+
|
7
|
+
I'd be interested to hear if open sourcing this helped someone else.
|
6
8
|
|
7
|
-
|
9
|
+
## Installation
|
8
10
|
|
9
|
-
|
11
|
+
`apt-get install wget` or `sudo apt-get install wget`
|
10
12
|
|
11
|
-
|
13
|
+
then
|
12
14
|
|
13
15
|
`gem install pubchem`
|
14
16
|
|
15
17
|
## Usage
|
16
18
|
|
17
|
-
|
18
|
-
pubchem = Pubchem.new
|
19
|
-
|
20
|
-
pubchem.get_ids([16,405], "~/yay.zip")
|
21
|
-
|
22
|
-
puts "Do a happy dance!"
|
23
|
-
```
|
19
|
+
See `example.rb` for how to use Pubchem.
|
data/example.rb
CHANGED
@@ -1,7 +1,28 @@
|
|
1
|
+
require 'pp'
|
1
2
|
require_relative "lib/pubchem"
|
2
3
|
|
4
|
+
# Build the name index from the bundled sample exports and persist it.
reader = Reader.new
['xml/compound_sample.xml', 'xml/substance_sample.xml'].each do |sample_path|
  reader.read(sample_path)
end
reader.save("xml/names.xml",
            "xml/pubchem_substance_ids.xml",
            "xml/pubchem_compound_ids.xml")

# Names to look up. Per the original author's note, the "2H" entry is the
# "1H" name with one token changed and is expected to come back unmatched.
lookup_terms = [
  "COC1=C(C=C2CC3=CC(=C(C=C3CC4=CC(=C(C=C4CC2=C1)OC(=O)C5=CC=NC=C5)OC)OC(=O)C6=CC=NC=C6)OC)OC(=O)C7=CC=NC=E9",
  "4-methoxy-1H-indole-3-carbaldehyde",
  "4-methoxy-2H-indole-3-carbaldehyde",
  "2-amino-4,5-dimethyl-1H-pyrrole-3-carbonitrile"
]

# Show the matches, the compound ids they resolve to, and the raw
# name => substance id table.
pp(reader.match_list_of_names(lookup_terms))
pp(reader.retrieve_compound_ids)
pp(reader.pubchem_substance_ids)

pubchem = Pubchem.new

# Only the ids are needed for the download request, not the input names.
substance_ids = reader.retrieve_substance_ids.values
pubchem.get_substance_ids(substance_ids, "yay.zip")

puts "Do a happy dance!"
|
data/lib/pubchem.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'mechanize'
|
2
|
+
require_relative 'pubchem/reader'
|
2
3
|
|
3
4
|
class Pubchem
|
4
5
|
|
@@ -13,9 +14,26 @@ class Pubchem
|
|
13
14
|
|
14
15
|
end
|
15
16
|
|
17
|
+
# Convenience wrapper that delegates to #get_ids with the database fixed
# to :compound.
#
# ids           - passed through to #get_ids unchanged.
# filename      - passed through to #get_ids unchanged.
# retrieve_mode - forwarded to #get_ids (previously accepted but silently
#                 dropped, so callers could never override the default).
# delay         - forwarded to #get_ids.
#
# Returns whatever #get_ids returns.
def get_compound_ids(ids,
                     filename,
                     retrieve_mode: :image,
                     delay: nil)
  get_ids(ids,
          filename,
          :compound,
          retrieve_mode: retrieve_mode,
          delay: delay)
end
|
24
|
+
|
25
|
+
# Convenience wrapper that delegates to #get_ids with the database fixed
# to :substance.
#
# ids           - passed through to #get_ids unchanged.
# filename      - passed through to #get_ids unchanged.
# retrieve_mode - forwarded to #get_ids (previously accepted but silently
#                 dropped, so callers could never override the default).
# delay         - forwarded to #get_ids.
#
# Returns whatever #get_ids returns.
def get_substance_ids(ids,
                      filename,
                      retrieve_mode: :image,
                      delay: nil)
  get_ids(ids,
          filename,
          :substance,
          retrieve_mode: retrieve_mode,
          delay: delay)
end
|
33
|
+
|
16
34
|
def get_ids(ids,
|
17
35
|
filename,
|
18
|
-
db
|
36
|
+
db,
|
19
37
|
retrieve_mode: :image,
|
20
38
|
delay: nil)
|
21
39
|
|
@@ -46,9 +64,9 @@ class Pubchem
|
|
46
64
|
|
47
65
|
ftp_url = ftp_link.to_s
|
48
66
|
size = ftp_url.size
|
49
|
-
|
67
|
+
|
50
68
|
# We don't want to allow scary characters into our URL since it is a
|
51
|
-
# security risk, so we only allow lower and upper case letters, numbers,
|
69
|
+
# security risk, so we only allow lower and upper case letters, numbers,
|
52
70
|
# / forward slashes
|
53
71
|
# : colons
|
54
72
|
# . periods
|
@@ -0,0 +1,223 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'fuzzystringmatch'
|
4
|
+
require 'ox'
|
5
|
+
|
6
|
+
# Parses PubChem compound / substance XML files into an in-memory name
# index and resolves user-supplied names against it.
#
# Three tables are maintained:
#
# * +names+                 - short code (see #short_code) => Set of names
# * +pubchem_substance_ids+ - name => Set of substance ids
# * +pubchem_compound_ids+  - name => Set of compound ids
#
# Typical flow: #read one or more files, #match_list_of_names, then
# #retrieve_compound_ids / #retrieve_substance_ids.
class Reader

  # URN labels whose string value is indexed as a name.
  NAME_URN_LABELS = ["SMILES", "IUPAC Name"].freeze

  attr_accessor :names,
                :pubchem_substance_ids,
                :pubchem_compound_ids

  # Creates a reader, either empty or pre-populated from files previously
  # written by #save.
  #
  # Pass either no filenames (empty tables) or all three; passing only
  # some of them raises (see #initialize_from_files).
  def initialize(names_filename=nil,
                 pubchem_substance_ids_filename=nil,
                 pubchem_compound_ids_filename=nil)

    @fuzzy_matcher = FuzzyStringMatch::JaroWinkler.create( :native )

    return if initialize_from_files( names_filename,
                                     pubchem_substance_ids_filename,
                                     pubchem_compound_ids_filename )

    # Per-key default so that add_name can append without checking first.
    @names = Hash.new { |h,k| h[k] = Set.new }

    @pubchem_substance_ids = Hash.new { |h,k| h[k] = Set.new }
    @pubchem_compound_ids = Hash.new { |h,k| h[k] = Set.new }

  end

  # Loads the three tables with Ox.load_file.
  #
  # Returns nil (and loads nothing) when no filename is given; otherwise
  # returns the loaded compound id table.
  # Raises RuntimeError when only some of the filenames are given.
  def initialize_from_files(names_filename,
                            pubchem_substance_ids_filename,
                            pubchem_compound_ids_filename)

    filenames = [ names_filename,
                  pubchem_substance_ids_filename,
                  pubchem_compound_ids_filename ]

    return nil unless filenames.any?
    raise "All three filenames required" unless filenames.all?

    @names = Ox.load_file(names_filename)
    @pubchem_substance_ids = Ox.load_file(pubchem_substance_ids_filename)
    @pubchem_compound_ids = Ox.load_file(pubchem_compound_ids_filename)

  end

  # Writes the three tables to the given files with Ox.to_file.
  def save(names_filename,
           pubchem_substance_ids_filename,
           pubchem_compound_ids_filename)

    Ox.to_file(names_filename, @names, indent: 0)
    Ox.to_file(pubchem_substance_ids_filename, @pubchem_substance_ids, indent: 0)
    Ox.to_file(pubchem_compound_ids_filename, @pubchem_compound_ids, indent: 0)

  end

  # Parses one PubChem XML file and indexes every name found in it.
  #
  # xml_filepath - path of the XML file to read.
  # type         - :compound or :substance (a String is accepted too).
  #                When nil, the type is inferred from the file's basename,
  #                which must then start with "compound" or "substance"
  #                (case-insensitive).
  #
  # Raises RuntimeError "Cannot infer pubchem type" when type is nil and
  # the basename gives no hint, and "Unknown type" for any other explicit
  # type. Both checks happen before the file is opened.
  def read(xml_filepath, type: nil)

    # Only infer when the caller did not choose; an explicit type used to
    # fall into the "cannot infer" branch and always raise.
    if type.nil?
      basename = File.basename(xml_filepath).downcase
      if basename.start_with?("compound")
        type = :compound
      elsif basename.start_with?("substance")
        type = :substance
      else
        raise "Cannot infer pubchem type"
      end
    end

    kind = type.to_s
    raise "Unknown type" unless kind == "compound" || kind == "substance"

    # Block form closes the handle even if parsing raises.
    doc = File.open(xml_filepath) { |f| Nokogiri::XML(f) }

    # add_name consults @current_type to pick the id table.
    @current_type = kind

    if kind == "compound"
      doc.css("PC-Compounds PC-Compound").each do |compound|
        parse_compound(compound)
      end
    else
      doc.css("PC-Substances PC-Substance").each do |substance|
        parse_substance(substance)
      end
    end

  end

  # Records the compound's id as the current id, then indexes the names
  # found in its property list.
  def parse_compound(compound)

    @pubchem_id = compound.css("PC-Compound_id PC-CompoundType PC-CompoundType_id PC-CompoundType_id_cid").text.to_i

    compound.css("PC-Compound_props").each do |property|
      parse_property(property)
    end

  end

  # Records the substance's id as the current id, then indexes each of
  # its synonyms as a name.
  def parse_substance(substance)

    @pubchem_id = substance.css("PC-Substance_sid PC-ID PC-ID_id").text.to_i

    substance.css("PC-Substance_synonyms PC-Substance_synonyms_E").each do |substance_synonym|
      add_name(substance_synonym.text)
    end

  end

  # Walks every PC-InfoData entry below the given property node.
  def parse_property(property)

    property.css("PC-InfoData").each do |info_data|
      parse_info_data(info_data)
    end

  end

  # Indexes the string value of an info-data entry when its URN label is
  # one of NAME_URN_LABELS; entries with any other label, or with a
  # missing label / value node, are ignored.
  def parse_info_data(info_data)

    label_node = info_data.css("PC-InfoData_urn PC-Urn PC-Urn_label").first
    return if label_node.nil?
    return unless NAME_URN_LABELS.include?(label_node.text)

    value_node = info_data.css("PC-InfoData_value PC-InfoData_value_sval").first
    return if value_node.nil?

    add_name(value_node.text)

  end

  # Adds +name+ to the short-code index and associates it with the id
  # currently being parsed, in the table selected by @current_type.
  #
  # nil and empty names are ignored.
  # Raises RuntimeError when @current_type is neither "substance" nor
  # "compound".
  def add_name(name)
    return if name.nil? || name.empty?

    # Bucket names by short code so lookups only scan one bucket.
    @names[short_code(name)].add name

    if @current_type == "substance"
      @pubchem_substance_ids[name].add @pubchem_id
    elsif @current_type == "compound"
      @pubchem_compound_ids[name].add @pubchem_id
    else
      raise "Unknown substance"
    end

  end

  # Finds the indexed name that best matches +lookup_name+.
  #
  # Only names sharing lookup_name's short code are considered. An exact
  # match is returned immediately; otherwise the candidate with the
  # highest matcher score is returned, provided that score is strictly
  # greater than +threshold+.
  #
  # Returns the matched name (a String) or nil when nothing qualifies.
  def fuzzy_name_lookup(lookup_name, threshold)
    return nil if lookup_name.nil?

    code = short_code(lookup_name)

    # key? avoids triggering the default proc, which would otherwise add
    # an empty bucket to @names for every unknown short code.
    candidates = @names.key?(code) ? @names[code] : []

    # Optimistically check for exact name match
    return lookup_name if candidates.include?(lookup_name)
    return nil if threshold >= 1.0

    closest_distance = 0.0
    closest_name = nil

    candidates.each do |name|

      distance = @fuzzy_matcher.getDistance(lookup_name, name)

      if distance > closest_distance
        closest_name = name
        closest_distance = distance
      end

    end

    closest_name if closest_distance > threshold

  end

  # Runs #fuzzy_name_lookup for every entry of +names+.
  #
  # The result (input name => matched name or nil) is returned and also
  # remembered for #retrieve_ids.
  def match_list_of_names(names, threshold=0.99)
    @matched_names = names.each_with_object({}) do |name, acc|
      acc[name] = fuzzy_name_lookup(name, threshold)
    end
  end

  # Maps each matched input name to the first id stored for its matched
  # name in +collection+ (a name => ids table).
  #
  # Inputs with no match, or whose matched name has no ids in this
  # collection, are left out. A warning is printed when a matched name
  # has more than one id.
  #
  # Raises RuntimeError unless #match_list_of_names has been called.
  def retrieve_ids(collection)
    msg = "@matched_names required, see #{self.class}#match_list_of_names"

    raise msg unless @matched_names

    @matched_names.each_with_object({}) do |(input_name, matched_name), acc|
      next unless matched_name

      # fetch bypasses the default proc so a miss does not insert an empty
      # Set into the collection being queried.
      ids = collection.fetch(matched_name, nil)
      next if ids.nil?

      if ids.size > 1
        puts "WARNING: Multiple matching sets"
      end

      collection_id = ids.first
      acc[input_name] = collection_id if collection_id
    end
  end

  # Input name => substance id for the last #match_list_of_names call.
  def retrieve_substance_ids
    retrieve_ids(@pubchem_substance_ids)
  end

  # Input name => compound id for the last #match_list_of_names call.
  def retrieve_compound_ids
    retrieve_ids(@pubchem_compound_ids)
  end

  # Bucket key for a name: its first three characters, lower-cased.
  def short_code(name)
    name[0..2].downcase
  end

end
|
data/lib/pubchem/version.rb
CHANGED
data/pubchem.gemspec
CHANGED
@@ -22,6 +22,9 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.require_paths = ["lib"]
|
23
23
|
|
24
24
|
spec.add_runtime_dependency "mechanize", "~> 2.7.3"
|
25
|
+
spec.add_runtime_dependency "nokogiri", "~> 1.6.6.2"
|
26
|
+
spec.add_runtime_dependency "fuzzy-string-match", "~> 0.9.7"
|
27
|
+
spec.add_runtime_dependency "ox", "~> 2.2.1"
|
25
28
|
|
26
29
|
spec.add_development_dependency "bundler", "~> 1.10"
|
27
30
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pubchem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Zach Aysan
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -24,6 +24,48 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 2.7.3
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.6.6.2
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.6.6.2
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: fuzzy-string-match
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.9.7
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.9.7
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: ox
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 2.2.1
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 2.2.1
|
27
69
|
- !ruby/object:Gem::Dependency
|
28
70
|
name: bundler
|
29
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,22 +86,25 @@ description: |2-
|
|
44
86
|
their form. This helps with that!
|
45
87
|
email:
|
46
88
|
- zachaysan@gmail.com
|
47
|
-
executables:
|
48
|
-
- ".gitkeep"
|
89
|
+
executables: []
|
49
90
|
extensions: []
|
50
91
|
extra_rdoc_files: []
|
51
92
|
files:
|
93
|
+
- ".gitignore"
|
52
94
|
- Gemfile
|
95
|
+
- Gemfile.lock
|
53
96
|
- README.markdown
|
54
97
|
- Rakefile
|
55
98
|
- bin/console
|
56
99
|
- bin/setup
|
57
100
|
- example.rb
|
58
|
-
- exe/.gitkeep
|
59
101
|
- lib/pubchem.rb
|
102
|
+
- lib/pubchem/reader.rb
|
60
103
|
- lib/pubchem/version.rb
|
61
104
|
- pubchem.gemspec
|
62
105
|
- run
|
106
|
+
- xml/compound_sample.xml
|
107
|
+
- xml/substance_sample.xml
|
63
108
|
homepage: https://github.com/zachaysan/pubchem
|
64
109
|
licenses:
|
65
110
|
- MIT
|
data/exe/.gitkeep
DELETED
File without changes
|