RubyGems - pacer-xml - Versions diffs - 0.2.2-java → 0.2.3-java - Mend

pacer-xml 0.2.2-java → 0.2.3-java

Files changed (9)

data/Gemfile +2 -1
data/lib/pacer-xml/build_graph.rb +20 -11
data/lib/pacer-xml/sample/patent_text.rb +0 -0
data/lib/pacer-xml/sample.rb +116 -32
data/lib/pacer-xml/string_route.rb +2 -1
data/lib/pacer-xml/version.rb +1 -1
data/lib/pacer-xml.rb +3 -3
data/pacer-xml.gemspec +2 -0
metadata +19 -2

data/Gemfile CHANGED Viewed

@@ -3,4 +3,5 @@ source "http://rubygems.org"
 # Specify your gem's dependencies in pacer-graph.gemspec
 gemspec
-gem 'pacer', path: '~/xn/pacer'
+gem 'pacer', path: '../pacer'
+gem 'pacer-neo4j', path: '../pacer-neo4j'

data/lib/pacer-xml/build_graph.rb CHANGED Viewed

@@ -13,13 +13,16 @@ module PacerXml
     attr_reader :graph
     attr_accessor :depth, :documents
-    attr_reader :rename, :html, :skip
+    attr_reader :rename, :html, :skip, :with_body
     def initialize(graph, opts = {})
       @documents = 0
       @graph = graph
       # treat tag as a property containing html
       @html = (opts[:html] || []).map(&:to_s).to_set
+      # capture the body into a body property in addition to any tags it contains.
+      @with_body = (opts[:with_body] || []).map(&:to_s).to_set
       # skip property or tag
       @skip = (opts[:skip] || []).map(&:to_s).to_set
       # rename type or property
@@ -42,6 +45,7 @@ module PacerXml
     def visit_vertex_fields(e)
       h = e.fields
+      h['body'] = e.inner_html if with_body? e
       h['type'] = rename[h['type']]
       rename.each do |from, to|
         if h.key? from
@@ -78,6 +82,10 @@ module PacerXml
       skip.include? e.name or html.include? e.name
     end
+    def with_body?(e)
+      with_body.include? e.name
+    end
     def level
       self.depth += 1
       yield
@@ -116,6 +124,7 @@ module PacerXml
       return nil if skip? rel
       level do
         attrs = visit_edge_fields rel
+        rel[:body] = rel.inner_text if with_body? rel
         attrs.delete :type
         rel.contained_rels.map do |to_e|
           visit_many_rel(from_e, from, rel, to_e, attrs)
@@ -159,15 +168,15 @@ module PacerXml
     def build(doc)
       result = super
       #tell "CACHE size #{ cache[:size] },  hits:"
-      if cache[:stats] and documents % 100 == 99
-        tell '-----------------'
-        cache.each do |k, adds|
-          next unless k.is_a? String
-          adds = adds.length
-          hits = cache[:hits][k]
-          tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
-        end
-      end
+     #if cache[:stats] and documents % 100 == 99
+     #  tell '-----------------'
+     #  cache.each do |k, adds|
+     #    next unless k.is_a? String
+     #    adds = adds.length
+     #    hits = cache[:hits][k]
+     #    tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
+     #  end
+     #end
       result
     end
@@ -192,7 +201,7 @@ module PacerXml
         ct = cache[rename[e.name]]
         kill = cache[:kill]
         if kill and cache[:hits][rename[e.name]] == 0 and ct.length > kill
-          tell "cache kill #{ e.description }"
+          #tell "cache kill #{ e.description }"
           cache[:skip] << rename[e.name]
           cache[:size] -= ct.length
           cache[rename[e.name]] = []

data/lib/pacer-xml/sample/patent_text.rb ADDED Viewed

File without changes

data/lib/pacer-xml/sample.rb CHANGED Viewed

@@ -6,25 +6,61 @@ module PacerXml
       # Will actually load 101. To avoid this side-effect of
       # prefetching, the route should be defined as:
       # xml_route.limit(100).import(...)
-      def load_100(*args)
-        i = importer(*args).limit(100)
+      def load_100(*args, &block)
+        i = importer(*args, &block).limit(100)
         i.run!
         i.graph
       end
+      def load_100_with_text(graph = nil, args = {}, &block)
+        load_100 graph, args.merge(source: :full_text), &block
+      end
+      def load_all_with_text(graph = nil, args = {}, &block)
+        load_all graph, args.merge(source: :full_text), &block
+      end
+      def load_all_software(*args)
+        load_all_with_text(*args) do |xml_documents|
+          xml_documents.select do |raw_xml|
+            raw_xml =~ /software/i
+          end
+        end
+      end
+      def load_100_software(*args)
+        load_100_with_text(*args) do |xml_documents|
+          xml_documents.select do |raw_xml|
+            raw_xml =~ /software/i
+          end
+        end
+      end
       # Uses a Neo4j graph because the data is too big to fit in memory
       # without configuring the JVM to use more than its small default
       # footprint.
       #
-      # Alternatively, to start the JVM with more memory, try:
-      # bundle exec jruby -J-Xmx2048m -S irb
-      def load_all(graph = nil, *args)
+      # Alternatively, To start the JVM with more memory, try:
+      # bundle exec jruby -J-Xmx2g -S irb
+      def load_all(graph = nil, args = {}, &block)
         require 'pacer-neo4j'
         n = Time.now.to_i % 1000000
         graph ||= Pacer.neo4j "sample.#{n}.graph"
-        i = importer(graph, *args)
-        i.run!
-        i.graph
+        i = importer(graph, args, &block)
+        if args[:thread]
+          t = Thread.new do
+            begin
+              i.run!
+            rescue Exception => e
+              pp e
+              pp e.backtrace
+            end
+          end
+          t[:graph] = graph
+          t
+        else
+          i
+        end
       end
       def structure(g)
@@ -49,10 +85,11 @@ module PacerXml
       #
       # Import can successfully be run with no options specified, but this patent
       # xml is particularly hairy.
-      def importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil)
-        html = [:abstract]
+      def importer(graph = nil, args = {}, &block)
+        html = [:abstract, :description]
+        with_body = ['claim-text']
         rename = {
-          'classification-national' => 'classification',
+          'classification-national' => 'class',
           'assistant-examiner' => 'examiner',
           'primary-examiner' => 'examiner',
           'us-term-of-grant' => 'term',
@@ -60,21 +97,29 @@ module PacerXml
           'document-id' => 'document',
           'us-related-documents' => 'related-document',
           'us-patent-grant' => 'patent-version',
-          'us-bibliographic-data-grant' => 'patent'
+          'us-bibliographic-data-grant' => 'patent',
+          "us-field-of-classification-search" => 'possible-class'
         }
-        cache = { stats: true }
+        skip = Set['classification-ipcr']
+        skip_cache = Set['figures', 'figure']
+        cache = { stats: true, skip: skip_cache }.merge(args.fetch(:cache, {}))
         graph ||= Pacer.tg
         graph.create_key_index :type, :vertex
-        xml_route = xml(fn, start_rule, end_rule)
-        xml_route.
-          process { print '.' }.
-          import(graph, html: html, rename: rename, cache: cache)
+        start_time = Time.now
+        n = 0
+        xml_route = xml(args, &block)
+        unless args[:silent]
+          xml_route = xml_route.process do
+            n += 1
+            puts "\n       #{ n } patents in #{ Time.now - start_time }s" if n % 100 == 0
+          end
+        end
+        xml_route.import(graph, html: html, skip: skip, rename: rename, cache: cache, with_body: with_body)
       end
-      def xml(fn = nil, *args)
-        fn ||= a_week
-        path = download_patent_grant fn
-        Pacer.xml path, *args
+      def xml(args, &block)
+        path = download_patent_grant args
+        Pacer.xml path, args[:start_chunk_rule], args[:end_chunk_rule], &block
       end
       def cleanup(fn = nil)
@@ -83,21 +128,60 @@ module PacerXml
         Dir["/tmp/#{name}*"].each { |f| File.delete f }
       end
+      def path(args)
+        if args[:path]
+          args[:path]
+        else
+          "/tmp/#{patent_file(args)}.xml"
+        end
+      end
+      def url(args)
+        if args[:url]
+          args[:url]
+        elsif args[:path]
+          nil
+        elsif args[:source] == :full_text
+          "http://storage.googleapis.com/patents/grant_full_text/#{patent_year(args)}/#{patent_file(args)}.zip"
+        else
+          "http://storage.googleapis.com/patents/grantbib/#{patent_year(args)}/#{patent_file(args)}.zip"
+        end
+      end
       private
-      def a_week
-        'ipgb20120103_wk01'
+      def patent_date(args)
+        args.fetch :date, Date.parse('20120103')
       end
-      def download_patent_grant(fn)
-        puts "Downloading a sample xml file from"
-        puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
-        name, week = fn.split '_'
-        result = "/tmp/#{name}.xml"
-        Dir.chdir '/tmp' do
-          unless File.exists? result
-            system "curl http://storage.googleapis.com/patents/grantbib/2012/#{fn}.zip > #{fn}.zip"
-            system "unzip #{fn}.zip"
+      def patent_file(args)
+        if args[:source] == :full_text
+          date = patent_date(args).strftime "%y%m%d"
+          file = "ipg#{date}"
+        else
+          date = patent_date(args).strftime "%Y%m%d_wk%V"
+          file = "ipgb#{date}"
+        end
+      end
+      def patent_year(args)
+        patent_date(args).year
+      end
+      def download_patent_grant(args)
+        location = url(args)
+        result = path(args)
+        unless File.exists? result
+          if location
+            puts "Downloading a sample xml file from"
+            puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
+            puts location
+            Dir.chdir '/tmp' do
+              system "curl #{location} > #{result}.zip"
+              system "unzip #{result}.zip"
+            end
+          else
+            throw "File not found"
           end
         end
         result

data/lib/pacer-xml/string_route.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Pacer
   module Core
     module StringRoute
-      def xml_stream(enter = nil, leave = nil)
+      def xml_stream(enter = nil, leave = nil, &block)
         enter ||= /<\?xml/
         leave ||= enter
         enter = build_rule :enter, enter
@@ -10,6 +10,7 @@ module Pacer
           lines << s
         end.route
         joined = r.map(element_type: :string, info: 'join', &:join).route
+        joined = block.call joined if block
         joined.xml
       end

data/lib/pacer-xml/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module PacerXml
   unless const_defined? :VERSION
     START_TIME = Time.now
-    VERSION = '0.2.2'
+    VERSION = '0.2.3'
     PACER_VERSION = '>= 1.1.1'
   end
 end

data/lib/pacer-xml.rb CHANGED Viewed

@@ -37,12 +37,12 @@ require_relative 'pacer-xml/sample'
 module Pacer
   class << self
-    def xml(file, enter = nil, leave = nil)
+    def xml(file, enter = nil, leave = nil, &block)
       if file.is_a? String
-        file = File.open '/tmp/ipgb20120103.xml'
+        file = File.open file
       end
       lines = file.each_line.to_route(element_type: :string, info: 'lines').route
-      lines.xml_stream(enter, leave).route
+      lines.xml_stream(enter, leave, &block).route
     end
   end
 end

data/pacer-xml.gemspec CHANGED Viewed

@@ -15,6 +15,8 @@ Gem::Specification.new do |s|
   s.add_dependency 'pacer', PacerXml::PACER_VERSION
   s.add_dependency 'pacer-neo4j', ">= 2.1"
   s.add_dependency 'nokogiri'
+  s.add_development_dependency 'awesome_print', '0.4.0'
   s.rubyforge_project = "pacer-xml"

metadata CHANGED Viewed

@@ -2,14 +2,14 @@
 name: pacer-xml
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.2.2
+  version: 0.2.3
 platform: java
 authors:
 - Darrick Wiebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-31 00:00:00.000000000 Z
+date: 2013-02-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pacer
@@ -61,6 +61,22 @@ dependencies:
     none: false
   prerelease: false
   type: :runtime
+- !ruby/object:Gem::Dependency
+  name: awesome_print
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    none: false
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    none: false
+  prerelease: false
+  type: :development
 description: XML streaming and graph import for Pacer
 email:
 - dw@xnlogic.com
@@ -76,6 +92,7 @@ files:
 - lib/pacer-xml/build_graph.rb
 - lib/pacer-xml/nokogiri_node.rb
 - lib/pacer-xml/sample.rb
+- lib/pacer-xml/sample/patent_text.rb
 - lib/pacer-xml/string_route.rb
 - lib/pacer-xml/version.rb
 - lib/pacer-xml/xml_route.rb