RubyGems - pubchem - Versions diffs - 0.0.5 → 0.1.1 - Mend

pubchem 0.0.5 → 0.1.1

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (11)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cc1a04a9f940becd4f4eff582d8105d6f3772eed
-  data.tar.gz: 224b9440fe38fcfa39fe9b360a2f32a9e145b27f
+  metadata.gz: 8c55a845631951401782b0af20e268b9181e3ca8
+  data.tar.gz: d70a2f0fddefa25016b76d0442b6c4d8d07b8884
 SHA512:
-  metadata.gz: 6f72420e95796c668a1154877ef5ad2455569e00fbf618ee2ebfc5256433fbcf0cb0471d67ecf2044bce075f0dbf9a59f769610026620936a74dc488fa8a0e22
-  data.tar.gz: 297ba5d561ed323425c6c5804eceb035d17baf2c8865acf7b4bade28ccb544b32dd79a6e9f319c92acc5bdd45e1e0d1a075648779fcd1598104d36db34acc62a
+  metadata.gz: f9a1f1bbcb944abdace6ab61745620c329258072da9d29c6a2d2266d57fe64847dc45a7ec10882b604e8c4988191cb033253280aedf3262d9ff591f75ad1ea84
+  data.tar.gz: 925506e71420d361b5c776233676a37aea0aab3a7694f20f77ca7e991ec96dde527f8b0731a0a8c57e8a51d29b78dbf77b86813f97ba8634187a63a2a746ed38

data/.gitignore ADDED

@@ -0,0 +1,4 @@
+pkg
+xml
+!xml/substance_sample.xml
+!xml/compound_sample.xml

data/Gemfile.lock ADDED

@@ -0,0 +1,52 @@
+PATH
+  remote: .
+  specs:
+    pubchem (0.1.1)
+      fuzzy-string-match (~> 0.9.7)
+      mechanize (~> 2.7.3)
+      nokogiri (~> 1.6.6.2)
+      ox (~> 2.2.1)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    RubyInline (3.12.4)
+      ZenTest (~> 4.3)
+    ZenTest (4.11.0)
+    domain_name (0.5.24)
+      unf (>= 0.0.5, < 1.0.0)
+    fuzzy-string-match (0.9.7)
+      RubyInline (>= 3.8.6)
+    http-cookie (1.0.2)
+      domain_name (~> 0.5)
+    mechanize (2.7.3)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (~> 2.0)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (~> 2.5, >= 2.5.2)
+      nokogiri (~> 1.4)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    mime-types (2.6.1)
+    mini_portile (0.6.2)
+    net-http-digest_auth (1.4)
+    net-http-persistent (2.9.4)
+    nokogiri (1.6.6.2)
+      mini_portile (~> 0.6.0)
+    ntlm-http (0.1.1)
+    ox (2.2.1)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.1)
+    webrobots (0.1.1)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.10)
+  pubchem!
+BUNDLED WITH
+   1.10.3

data/README.markdown CHANGED

@@ -2,22 +2,18 @@
 For getting all that juicy substance and compound data from Pubchem.
-## Installation
+Please email me if you end up using this: zachaysan@gmail.com
+I'd be interested to hear if open sourcing this helped someone else.
-`apt-get install wget`
+## Installation
-Or
+`apt-get install wget` or `sudo apt-get install wget`
-`sudo apt-get install wget`
+then
 `gem install pubchem`
 ## Usage
-```ruby
-pubchem = Pubchem.new
-pubchem.get_ids([16,405], "~/yay.zip")
-puts "Do a happy dance!"
-```
+See `example.rb` for how to use Pubchem.

data/example.rb CHANGED

@@ -1,7 +1,28 @@
+require 'pp'
 require_relative "lib/pubchem"
+reader = Reader.new
+reader.read('xml/compound_sample.xml')
+reader.read('xml/substance_sample.xml')
+reader.save("xml/names.xml",
+            "xml/pubchem_substance_ids.xml",
+            "xml/pubchem_compound_ids.xml")
+# The first two terms match, the last one replaces a "1H"
+# with a "2H", resulting in a non-match.
+terms = [ "COC1=C(C=C2CC3=CC(=C(C=C3CC4=CC(=C(C=C4CC2=C1)OC(=O)C5=CC=NC=C5)OC)OC(=O)C6=CC=NC=C6)OC)OC(=O)C7=CC=NC=E9",
+          "4-methoxy-1H-indole-3-carbaldehyde",
+          "4-methoxy-2H-indole-3-carbaldehyde",
+          "2-amino-4,5-dimethyl-1H-pyrrole-3-carbonitrile" ]
+pp reader.match_list_of_names terms
+pp reader.retrieve_compound_ids
+pp reader.pubchem_substance_ids
 pubchem = Pubchem.new
-pubchem.get_ids([16,405], "~/yay.zip")
+ids = reader.retrieve_substance_ids.map {|k,v| v}
+pubchem.get_substance_ids(ids, "yay.zip")
 puts "Do a happy dance!"

data/lib/pubchem.rb CHANGED

@@ -1,4 +1,5 @@
 require 'mechanize'
+require_relative 'pubchem/reader'
 class Pubchem
@@ -13,9 +14,26 @@ class Pubchem
   end
+  def get_compound_ids(ids,
+                       filename,
+                       retrieve_mode: :image,
+                       delay: nil)
+    self.get_ids(ids, filename, :compound, delay: delay)
+  end
+  def get_substance_ids(ids,
+                       filename,
+                       retrieve_mode: :image,
+                       delay: nil)
+    self.get_ids(ids, filename, :substance, delay: delay)
+  end
   def get_ids(ids,
               filename,
-              db: :compound,
+              db,
               retrieve_mode: :image,
               delay: nil)
@@ -46,9 +64,9 @@ class Pubchem
       ftp_url = ftp_link.to_s
       size = ftp_url.size
       # We don't want to allow scary characters into our URL since it is a
-      # security risk, so we only allow lower and upper case letters, numbers,
+      # security risk, so we only allow lower and upper case letters, numbers,
       # /   forward slashes
       # :   colons
       # .   periods

data/lib/pubchem/reader.rb ADDED

@@ -0,0 +1,223 @@
+require 'set'
+require 'nokogiri'
+require 'fuzzystringmatch'
+require 'ox'
+class Reader
+  attr_accessor :names,
+                :pubchem_substance_ids,
+                :pubchem_compound_ids
+  def initialize(names_filename=nil,
+                 pubchem_substance_ids_filename=nil,
+                 pubchem_compound_ids_filename=nil)
+    @fuzzy_matcher = FuzzyStringMatch::JaroWinkler
+                     .create( :native )
+    return if initialize_from_files( names_filename,
+                                     pubchem_substance_ids_filename,
+                                     pubchem_compound_ids_filename )
+    @names = Hash.new { |h,k| h[k] = Set.new }
+    @pubchem_substance_ids = Hash.new { |h,k| h[k] = Set.new }
+    @pubchem_compound_ids = Hash.new  { |h,k| h[k] = Set.new }
+  end
+  def initialize_from_files(names_filename,
+                            pubchem_substance_ids_filename,
+                            pubchem_compound_ids_filename)
+    filenames = [ names_filename,
+                  pubchem_substance_ids_filename,
+                  pubchem_compound_ids_filename ]
+    return nil unless filenames.any?
+    raise "Both filenames required" unless filenames.all?
+    @names = Ox.load_file(names_filename)
+    @pubchem_substance_ids = Ox.load_file(pubchem_substance_ids_filename)
+    @pubchem_compound_ids = Ox.load_file(pubchem_compound_ids_filename)
+  end
+  def save(names_filename,
+           pubchem_substance_ids_filename,
+           pubchem_compound_ids_filename)
+    Ox.to_file(names_filename, @names, indent: 0)
+    Ox.to_file(pubchem_substance_ids_filename, @pubchem_substance_ids, indent: 0)
+    Ox.to_file(pubchem_compound_ids_filename, @pubchem_compound_ids, indent: 0)
+  end
+  def read(xml_filepath, type: nil)
+    filepath = File.basename(xml_filepath)
+    if type.nil? and filepath.downcase.start_with? "compound"
+      type = :compound
+    elsif type.nil? and filepath.downcase.start_with? "substance"
+      type = :substance
+    else
+      raise "Cannot infer pubchem type"
+    end
+    f = File.open(xml_filepath)
+    doc = Nokogiri::XML(f)
+    f.close
+    @current_type = type.to_s
+    case type
+    when :compound
+      doc.css("PC-Compounds PC-Compound").each do |compound|
+        self.parse_compound(compound)
+      end
+    when :substance
+      doc.css("PC-Substances PC-Substance").each do |substance|
+        self.parse_substance(substance)
+      end
+    else
+      raise "Unknown type"
+    end
+  end
+  def parse_compound(compound)
+    @pubchem_id = compound.css("PC-Compound_id
+                                PC-CompoundType
+                                PC-CompoundType_id
+                                PC-CompoundType_id_cid").text.to_i
+    compound.css("PC-Compound_props").each do |property|
+      self.parse_property(property)
+    end
+  end
+  def parse_substance(substance)
+    @pubchem_id = substance.css("PC-Substance_sid
+                                 PC-ID
+                                 PC-ID_id").text.to_i
+    substance.css("PC-Substance_synonyms
+                   PC-Substance_synonyms_E").each do |substance_synonym|
+      self.add_name(substance_synonym.text)
+    end
+  end
+  def parse_property(property)
+    property.css("PC-InfoData").each do |info_data|
+      parse_info_data(info_data)
+    end
+  end
+  def parse_info_data(info_data)
+    urn_label = info_data.css("PC-InfoData_urn
+                               PC-Urn
+                               PC-Urn_label").first.text
+    name = nil
+    case urn_label
+    when "SMILES"
+      name = info_data.css("PC-InfoData_value
+                            PC-InfoData_value_sval").first.text
+    when"IUPAC Name"
+      name = info_data.css("PC-InfoData_value
+                            PC-InfoData_value_sval").first.text
+    end
+    self.add_name(name)
+  end
+  def add_name(name)
+    return if name.nil? || name.empty?
+    # Speed up lookups with sorted names
+    @names[self.short_code(name)].add name
+    if @current_type == "substance"
+      @pubchem_substance_ids[name].add @pubchem_id
+    elsif @current_type == "compound"
+      @pubchem_compound_ids[name].add @pubchem_id
+    else
+      raise "Unknown substance"
+    end
+  end
+  def fuzzy_name_lookup(lookup_name, threshold)
+    closest_distance = 0.0
+    closest_name = nil
+    # Optimistically check for exact name match
+    exact_match = self.short_code(lookup_name).include? lookup_name
+    return @pubchem_ids[lookup_name] if exact_match
+    return nil if threshold == 1.0
+    @names[self.short_code(lookup_name)].each do |name|
+      distance = @fuzzy_matcher.getDistance(lookup_name, name)
+      if distance > closest_distance
+        closest_name = name
+        closest_distance = distance
+      end
+    end
+    return closest_name if closest_distance > 0.99
+  end
+  def match_list_of_names(names, threshold=0.99)
+    @matched_names = names.inject({}) do |acc, name|
+      acc[name] = self.fuzzy_name_lookup(name, threshold)
+      acc
+    end
+  end
+  def retrieve_ids(collection)
+    msg = "@matched_names required, see #{self.class}#match_list_of_names"
+    raise msg unless @matched_names
+    @matched_names.inject({}) do |acc, name|
+      input_name = name[0]
+      matched_name = name[1]
+      if matched_name
+        ids = collection[matched_name]
+        if ids.size > 1
+          puts "WARNING: Multiple matching sets"
+        end
+        collection_id = collection[matched_name].first
+        acc[input_name] = collection_id if collection_id
+      end
+      acc
+    end
+  end
+  def retrieve_substance_ids
+    self.retrieve_ids(@pubchem_substance_ids)
+  end
+  def retrieve_compound_ids
+    self.retrieve_ids(@pubchem_compound_ids)
+  end
+  def short_code(name)
+    name[0..2].downcase
+  end
+end

data/lib/pubchem/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Pubchem
-  VERSION = "0.0.5"
+  VERSION = "0.1.1"
 end

data/pubchem.gemspec CHANGED

@@ -22,6 +22,9 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
   spec.add_runtime_dependency "mechanize", "~> 2.7.3"
+  spec.add_runtime_dependency "nokogiri", "~> 1.6.6.2"
+  spec.add_runtime_dependency "fuzzy-string-match", "~> 0.9.7"
+  spec.add_runtime_dependency "ox", "~> 2.2.1"
   spec.add_development_dependency "bundler", "~> 1.10"

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pubchem
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.1.1
 platform: ruby
 authors:
 - Zach Aysan
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-09-17 00:00:00.000000000 Z
+date: 2015-09-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -24,6 +24,48 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 2.7.3
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.6.6.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.6.6.2
+- !ruby/object:Gem::Dependency
+  name: fuzzy-string-match
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.7
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.7
+- !ruby/object:Gem::Dependency
+  name: ox
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.2.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.2.1
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -44,22 +86,25 @@ description: |2-
                            their form. This helps with that!
 email:
 - zachaysan@gmail.com
-executables:
-- ".gitkeep"
+executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".gitignore"
 - Gemfile
+- Gemfile.lock
 - README.markdown
 - Rakefile
 - bin/console
 - bin/setup
 - example.rb
-- exe/.gitkeep
 - lib/pubchem.rb
+- lib/pubchem/reader.rb
 - lib/pubchem/version.rb
 - pubchem.gemspec
 - run
+- xml/compound_sample.xml
+- xml/substance_sample.xml
 homepage: https://github.com/zachaysan/pubchem
 licenses:
 - MIT

data/exe/.gitkeep DELETED

File without changes