RubyGems - pacer-xml - Versions diffs - 0.2.2-java → 0.2.3-java - Mend

pacer-xml 0.2.2-java → 0.2.3-java

This diff shows the differences in content between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (9)

data/Gemfile +2 -1
data/lib/pacer-xml/build_graph.rb +20 -11
data/lib/pacer-xml/sample/patent_text.rb +0 -0
data/lib/pacer-xml/sample.rb +116 -32
data/lib/pacer-xml/string_route.rb +2 -1
data/lib/pacer-xml/version.rb +1 -1
data/lib/pacer-xml.rb +3 -3
data/pacer-xml.gemspec +2 -0
metadata +19 -2

data/Gemfile CHANGED Viewed

@@ -3,4 +3,5 @@ source "http://rubygems.org"
 # Specify your gem's dependencies in pacer-graph.gemspec
 gemspec
-gem 'pacer', path: '~/xn/pacer'
+gem 'pacer', path: '../pacer'
+gem 'pacer-neo4j', path: '../pacer-neo4j'

data/lib/pacer-xml/build_graph.rb CHANGED Viewed

@@ -13,13 +13,16 @@ module PacerXml
     attr_reader :graph
     attr_accessor :depth, :documents
-    attr_reader :rename, :html, :skip
+    attr_reader :rename, :html, :skip, :with_body
     def initialize(graph, opts = {})
       @documents = 0
       @graph = graph
       # treat tag as a property containing html
       @html = (opts[:html] || []).map(&:to_s).to_set
+      # capture the body into a body property in addition to any tags it contains.
+      @with_body = (opts[:with_body] || []).map(&:to_s).to_set
       # skip property or tag
       @skip = (opts[:skip] || []).map(&:to_s).to_set
       # rename type or property
@@ -42,6 +45,7 @@ module PacerXml
     def visit_vertex_fields(e)
       h = e.fields
+      h['body'] = e.inner_html if with_body? e
       h['type'] = rename[h['type']]
       rename.each do |from, to|
         if h.key? from
@@ -78,6 +82,10 @@ module PacerXml
       skip.include? e.name or html.include? e.name
     end
+    def with_body?(e)
+      with_body.include? e.name
+    end
     def level
       self.depth += 1
       yield
@@ -116,6 +124,7 @@ module PacerXml
       return nil if skip? rel
       level do
         attrs = visit_edge_fields rel
+        rel[:body] = rel.inner_text if with_body? rel
         attrs.delete :type
         rel.contained_rels.map do |to_e|
           visit_many_rel(from_e, from, rel, to_e, attrs)
@@ -159,15 +168,15 @@ module PacerXml
     def build(doc)
       result = super
       #tell "CACHE size #{ cache[:size] },  hits:"
-      if cache[:stats] and documents % 100 == 99
-        tell '-----------------'
-        cache.each do |k, adds|
-          next unless k.is_a? String
-          adds = adds.length
-          hits = cache[:hits][k]
-          tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
-        end
-      end
+     #if cache[:stats] and documents % 100 == 99
+     #  tell '-----------------'
+     #  cache.each do |k, adds|
+     #    next unless k.is_a? String
+     #    adds = adds.length
+     #    hits = cache[:hits][k]
+     #    tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
+     #  end
+     #end
       result
     end
@@ -192,7 +201,7 @@ module PacerXml
         ct = cache[rename[e.name]]
         kill = cache[:kill]
         if kill and cache[:hits][rename[e.name]] == 0 and ct.length > kill
-          tell "cache kill #{ e.description }"
+          #tell "cache kill #{ e.description }"
           cache[:skip] << rename[e.name]
           cache[:size] -= ct.length
           cache[rename[e.name]] = []

data/lib/pacer-xml/sample/patent_text.rb ADDED Viewed

File without changes

data/lib/pacer-xml/sample.rb CHANGED Viewed

@@ -6,25 +6,61 @@ module PacerXml
       # Will actually load 101. To avoid this side-effect of
       # prefetching, the route should be defined as:
       # xml_route.limit(100).import(...)
-      def load_100(*args)
-        i = importer(*args).limit(100)
+      def load_100(*args, &block)
+        i = importer(*args, &block).limit(100)
         i.run!
         i.graph
       end
+      def load_100_with_text(graph = nil, args = {}, &block)
+        load_100 graph, args.merge(source: :full_text), &block
+      end
+      def load_all_with_text(graph = nil, args = {}, &block)
+        load_all graph, args.merge(source: :full_text), &block
+      end
+      def load_all_software(*args)
+        load_all_with_text(*args) do |xml_documents|
+          xml_documents.select do |raw_xml|
+            raw_xml =~ /software/i
+          end
+        end
+      end
+      def load_100_software(*args)
+        load_100_with_text(*args) do |xml_documents|
+          xml_documents.select do |raw_xml|
+            raw_xml =~ /software/i
+          end
+        end
+      end
       # Uses a Neo4j graph because the data is too big to fit in memory
       # without configuring the JVM to use more than its small default
       # footprint.
       #
-      # Alternatively, to start the JVM with more memory, try:
-      # bundle exec jruby -J-Xmx2048m -S irb
-      def load_all(graph = nil, *args)
+      # Alternatively, To start the JVM with more memory, try:
+      # bundle exec jruby -J-Xmx2g -S irb
+      def load_all(graph = nil, args = {}, &block)
         require 'pacer-neo4j'
         n = Time.now.to_i % 1000000
         graph ||= Pacer.neo4j "sample.#{n}.graph"
-        i = importer(graph, *args)
-        i.run!
-        i.graph
+        i = importer(graph, args, &block)
+        if args[:thread]
+          t = Thread.new do
+            begin
+              i.run!
+            rescue Exception => e
+              pp e
+              pp e.backtrace
+            end
+          end
+          t[:graph] = graph
+          t
+        else
+          i
+        end
       end
       def structure(g)
@@ -49,10 +85,11 @@ module PacerXml
       #
       # Import can successfully be run with no options specified, but this patent
       # xml is particularly hairy.
-      def importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil)
-        html = [:abstract]
+      def importer(graph = nil, args = {}, &block)
+        html = [:abstract, :description]
+        with_body = ['claim-text']
         rename = {
-          'classification-national' => 'classification',
+          'classification-national' => 'class',
           'assistant-examiner' => 'examiner',
           'primary-examiner' => 'examiner',
           'us-term-of-grant' => 'term',
@@ -60,21 +97,29 @@ module PacerXml
           'document-id' => 'document',
           'us-related-documents' => 'related-document',
           'us-patent-grant' => 'patent-version',
-          'us-bibliographic-data-grant' => 'patent'
+          'us-bibliographic-data-grant' => 'patent',
+          "us-field-of-classification-search" => 'possible-class'
         }
-        cache = { stats: true }
+        skip = Set['classification-ipcr']
+        skip_cache = Set['figures', 'figure']
+        cache = { stats: true, skip: skip_cache }.merge(args.fetch(:cache, {}))
         graph ||= Pacer.tg
         graph.create_key_index :type, :vertex
-        xml_route = xml(fn, start_rule, end_rule)
-        xml_route.
-          process { print '.' }.
-          import(graph, html: html, rename: rename, cache: cache)
+        start_time = Time.now
+        n = 0
+        xml_route = xml(args, &block)
+        unless args[:silent]
+          xml_route = xml_route.process do
+            n += 1
+            puts "\n       #{ n } patents in #{ Time.now - start_time }s" if n % 100 == 0
+          end
+        end
+        xml_route.import(graph, html: html, skip: skip, rename: rename, cache: cache, with_body: with_body)
       end
-      def xml(fn = nil, *args)
-        fn ||= a_week
-        path = download_patent_grant fn
-        Pacer.xml path, *args
+      def xml(args, &block)
+        path = download_patent_grant args
+        Pacer.xml path, args[:start_chunk_rule], args[:end_chunk_rule], &block
       end
       def cleanup(fn = nil)
@@ -83,21 +128,60 @@ module PacerXml
         Dir["/tmp/#{name}*"].each { |f| File.delete f }
       end
+      def path(args)
+        if args[:path]
+          args[:path]
+        else
+          "/tmp/#{patent_file(args)}.xml"
+        end
+      end
+      def url(args)
+        if args[:url]
+          args[:url]
+        elsif args[:path]
+          nil
+        elsif args[:source] == :full_text
+          "http://storage.googleapis.com/patents/grant_full_text/#{patent_year(args)}/#{patent_file(args)}.zip"
+        else
+          "http://storage.googleapis.com/patents/grantbib/#{patent_year(args)}/#{patent_file(args)}.zip"
+        end
+      end
       private
-      def a_week
-        'ipgb20120103_wk01'
+      def patent_date(args)
+        args.fetch :date, Date.parse('20120103')
       end
-      def download_patent_grant(fn)
-        puts "Downloading a sample xml file from"
-        puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
-        name, week = fn.split '_'
-        result = "/tmp/#{name}.xml"
-        Dir.chdir '/tmp' do
-          unless File.exists? result
-            system "curl http://storage.googleapis.com/patents/grantbib/2012/#{fn}.zip > #{fn}.zip"
-            system "unzip #{fn}.zip"
+      def patent_file(args)
+        if args[:source] == :full_text
+          date = patent_date(args).strftime "%y%m%d"
+          file = "ipg#{date}"
+        else
+          date = patent_date(args).strftime "%Y%m%d_wk%V"
+          file = "ipgb#{date}"
+        end
+      end
+      def patent_year(args)
+        patent_date(args).year
+      end
+      def download_patent_grant(args)
+        location = url(args)
+        result = path(args)
+        unless File.exists? result
+          if location
+            puts "Downloading a sample xml file from"
+            puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
+            puts location
+            Dir.chdir '/tmp' do
+              system "curl #{location} > #{result}.zip"
+              system "unzip #{result}.zip"
+            end
+          else
+            throw "File not found"
           end
         end
         result

data/lib/pacer-xml/string_route.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Pacer
   module Core
     module StringRoute
-      def xml_stream(enter = nil, leave = nil)
+      def xml_stream(enter = nil, leave = nil, &block)
         enter ||= /<\?xml/
         leave ||= enter
         enter = build_rule :enter, enter
@@ -10,6 +10,7 @@ module Pacer
           lines << s
         end.route
         joined = r.map(element_type: :string, info: 'join', &:join).route
+        joined = block.call joined if block
         joined.xml
       end

data/lib/pacer-xml/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module PacerXml
   unless const_defined? :VERSION
     START_TIME = Time.now
-    VERSION = '0.2.2'
+    VERSION = '0.2.3'
     PACER_VERSION = '>= 1.1.1'
   end
 end

data/lib/pacer-xml.rb CHANGED Viewed

@@ -37,12 +37,12 @@ require_relative 'pacer-xml/sample'
 module Pacer
   class << self
-    def xml(file, enter = nil, leave = nil)
+    def xml(file, enter = nil, leave = nil, &block)
       if file.is_a? String
-        file = File.open '/tmp/ipgb20120103.xml'
+        file = File.open file
       end
       lines = file.each_line.to_route(element_type: :string, info: 'lines').route
-      lines.xml_stream(enter, leave).route
+      lines.xml_stream(enter, leave, &block).route
     end
   end
 end

data/pacer-xml.gemspec CHANGED Viewed

@@ -15,6 +15,8 @@ Gem::Specification.new do |s|
   s.add_dependency 'pacer', PacerXml::PACER_VERSION
   s.add_dependency 'pacer-neo4j', ">= 2.1"
   s.add_dependency 'nokogiri'
+  s.add_development_dependency 'awesome_print', '0.4.0'
   s.rubyforge_project = "pacer-xml"

metadata CHANGED Viewed

@@ -2,14 +2,14 @@
 name: pacer-xml
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.2.2
+  version: 0.2.3
 platform: java
 authors:
 - Darrick Wiebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-31 00:00:00.000000000 Z
+date: 2013-02-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pacer
@@ -61,6 +61,22 @@ dependencies:
     none: false
   prerelease: false
   type: :runtime
+- !ruby/object:Gem::Dependency
+  name: awesome_print
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    none: false
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    none: false
+  prerelease: false
+  type: :development
 description: XML streaming and graph import for Pacer
 email:
 - dw@xnlogic.com
@@ -76,6 +92,7 @@ files:
 - lib/pacer-xml/build_graph.rb
 - lib/pacer-xml/nokogiri_node.rb
 - lib/pacer-xml/sample.rb
+- lib/pacer-xml/sample/patent_text.rb
 - lib/pacer-xml/string_route.rb
 - lib/pacer-xml/version.rb
 - lib/pacer-xml/xml_route.rb