pacer-xml 0.2.2-java → 0.2.3-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -3,4 +3,5 @@ source "http://rubygems.org"
3
3
  # Specify your gem's dependencies in pacer-graph.gemspec
4
4
  gemspec
5
5
 
6
- gem 'pacer', path: '~/xn/pacer'
6
+ gem 'pacer', path: '../pacer'
7
+ gem 'pacer-neo4j', path: '../pacer-neo4j'
@@ -13,13 +13,16 @@ module PacerXml
13
13
 
14
14
  attr_reader :graph
15
15
  attr_accessor :depth, :documents
16
- attr_reader :rename, :html, :skip
16
+ attr_reader :rename, :html, :skip, :with_body
17
17
 
18
18
  def initialize(graph, opts = {})
19
19
  @documents = 0
20
20
  @graph = graph
21
21
  # treat tag as a property containing html
22
22
  @html = (opts[:html] || []).map(&:to_s).to_set
23
+ # capture the body into a body property in addition to any tags it contains.
24
+ @with_body = (opts[:with_body] || []).map(&:to_s).to_set
25
+
23
26
  # skip property or tag
24
27
  @skip = (opts[:skip] || []).map(&:to_s).to_set
25
28
  # rename type or property
@@ -42,6 +45,7 @@ module PacerXml
42
45
 
43
46
  def visit_vertex_fields(e)
44
47
  h = e.fields
48
+ h['body'] = e.inner_html if with_body? e
45
49
  h['type'] = rename[h['type']]
46
50
  rename.each do |from, to|
47
51
  if h.key? from
@@ -78,6 +82,10 @@ module PacerXml
78
82
  skip.include? e.name or html.include? e.name
79
83
  end
80
84
 
85
+ def with_body?(e)
86
+ with_body.include? e.name
87
+ end
88
+
81
89
  def level
82
90
  self.depth += 1
83
91
  yield
@@ -116,6 +124,7 @@ module PacerXml
116
124
  return nil if skip? rel
117
125
  level do
118
126
  attrs = visit_edge_fields rel
127
+ rel[:body] = rel.inner_text if with_body? rel
119
128
  attrs.delete :type
120
129
  rel.contained_rels.map do |to_e|
121
130
  visit_many_rel(from_e, from, rel, to_e, attrs)
@@ -159,15 +168,15 @@ module PacerXml
159
168
  def build(doc)
160
169
  result = super
161
170
  #tell "CACHE size #{ cache[:size] }, hits:"
162
- if cache[:stats] and documents % 100 == 99
163
- tell '-----------------'
164
- cache.each do |k, adds|
165
- next unless k.is_a? String
166
- adds = adds.length
167
- hits = cache[:hits][k]
168
- tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
169
- end
170
- end
171
+ #if cache[:stats] and documents % 100 == 99
172
+ # tell '-----------------'
173
+ # cache.each do |k, adds|
174
+ # next unless k.is_a? String
175
+ # adds = adds.length
176
+ # hits = cache[:hits][k]
177
+ # tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
178
+ # end
179
+ #end
171
180
  result
172
181
  end
173
182
 
@@ -192,7 +201,7 @@ module PacerXml
192
201
  ct = cache[rename[e.name]]
193
202
  kill = cache[:kill]
194
203
  if kill and cache[:hits][rename[e.name]] == 0 and ct.length > kill
195
- tell "cache kill #{ e.description }"
204
+ #tell "cache kill #{ e.description }"
196
205
  cache[:skip] << rename[e.name]
197
206
  cache[:size] -= ct.length
198
207
  cache[rename[e.name]] = []
File without changes
@@ -6,25 +6,61 @@ module PacerXml
6
6
  # Will actually load 101. To avoid this side-effect of
7
7
  # prefetching, the route should be defined as:
8
8
  # xml_route.limit(100).import(...)
9
- def load_100(*args)
10
- i = importer(*args).limit(100)
9
+ def load_100(*args, &block)
10
+ i = importer(*args, &block).limit(100)
11
11
  i.run!
12
12
  i.graph
13
13
  end
14
14
 
15
+ def load_100_with_text(graph = nil, args = {}, &block)
16
+ load_100 graph, args.merge(source: :full_text), &block
17
+ end
18
+
19
+ def load_all_with_text(graph = nil, args = {}, &block)
20
+ load_all graph, args.merge(source: :full_text), &block
21
+ end
22
+
23
+ def load_all_software(*args)
24
+ load_all_with_text(*args) do |xml_documents|
25
+ xml_documents.select do |raw_xml|
26
+ raw_xml =~ /software/i
27
+ end
28
+ end
29
+ end
30
+
31
+ def load_100_software(*args)
32
+ load_100_with_text(*args) do |xml_documents|
33
+ xml_documents.select do |raw_xml|
34
+ raw_xml =~ /software/i
35
+ end
36
+ end
37
+ end
38
+
15
39
  # Uses a Neo4j graph because the data is too big to fit in memory
16
40
  # without configuring the JVM to use more than its small default
17
41
  # footprint.
18
42
  #
19
- # Alternatively, to start the JVM with more memory, try:
20
- # bundle exec jruby -J-Xmx2048m -S irb
21
- def load_all(graph = nil, *args)
43
+ # Alternatively, to start the JVM with more memory, try:
44
+ # bundle exec jruby -J-Xmx2g -S irb
45
+ def load_all(graph = nil, args = {}, &block)
22
46
  require 'pacer-neo4j'
23
47
  n = Time.now.to_i % 1000000
24
48
  graph ||= Pacer.neo4j "sample.#{n}.graph"
25
- i = importer(graph, *args)
26
- i.run!
27
- i.graph
49
+ i = importer(graph, args, &block)
50
+ if args[:thread]
51
+ t = Thread.new do
52
+ begin
53
+ i.run!
54
+ rescue Exception => e
55
+ pp e
56
+ pp e.backtrace
57
+ end
58
+ end
59
+ t[:graph] = graph
60
+ t
61
+ else
62
+ i
63
+ end
28
64
  end
29
65
 
30
66
  def structure(g)
@@ -49,10 +85,11 @@ module PacerXml
49
85
  #
50
86
  # Import can successfully be run with no options specified, but this patent
51
87
  # xml is particularly hairy.
52
- def importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil)
53
- html = [:abstract]
88
+ def importer(graph = nil, args = {}, &block)
89
+ html = [:abstract, :description]
90
+ with_body = ['claim-text']
54
91
  rename = {
55
- 'classification-national' => 'classification',
92
+ 'classification-national' => 'class',
56
93
  'assistant-examiner' => 'examiner',
57
94
  'primary-examiner' => 'examiner',
58
95
  'us-term-of-grant' => 'term',
@@ -60,21 +97,29 @@ module PacerXml
60
97
  'document-id' => 'document',
61
98
  'us-related-documents' => 'related-document',
62
99
  'us-patent-grant' => 'patent-version',
63
- 'us-bibliographic-data-grant' => 'patent'
100
+ 'us-bibliographic-data-grant' => 'patent',
101
+ "us-field-of-classification-search" => 'possible-class'
64
102
  }
65
- cache = { stats: true }
103
+ skip = Set['classification-ipcr']
104
+ skip_cache = Set['figures', 'figure']
105
+ cache = { stats: true, skip: skip_cache }.merge(args.fetch(:cache, {}))
66
106
  graph ||= Pacer.tg
67
107
  graph.create_key_index :type, :vertex
68
- xml_route = xml(fn, start_rule, end_rule)
69
- xml_route.
70
- process { print '.' }.
71
- import(graph, html: html, rename: rename, cache: cache)
108
+ start_time = Time.now
109
+ n = 0
110
+ xml_route = xml(args, &block)
111
+ unless args[:silent]
112
+ xml_route = xml_route.process do
113
+ n += 1
114
+ puts "\n #{ n } patents in #{ Time.now - start_time }s" if n % 100 == 0
115
+ end
116
+ end
117
+ xml_route.import(graph, html: html, skip: skip, rename: rename, cache: cache, with_body: with_body)
72
118
  end
73
119
 
74
- def xml(fn = nil, *args)
75
- fn ||= a_week
76
- path = download_patent_grant fn
77
- Pacer.xml path, *args
120
+ def xml(args, &block)
121
+ path = download_patent_grant args
122
+ Pacer.xml path, args[:start_chunk_rule], args[:end_chunk_rule], &block
78
123
  end
79
124
 
80
125
  def cleanup(fn = nil)
@@ -83,21 +128,60 @@ module PacerXml
83
128
  Dir["/tmp/#{name}*"].each { |f| File.delete f }
84
129
  end
85
130
 
131
+ def path(args)
132
+ if args[:path]
133
+ args[:path]
134
+ else
135
+ "/tmp/#{patent_file(args)}.xml"
136
+ end
137
+ end
138
+
139
+ def url(args)
140
+ if args[:url]
141
+ args[:url]
142
+ elsif args[:path]
143
+ nil
144
+ elsif args[:source] == :full_text
145
+ "http://storage.googleapis.com/patents/grant_full_text/#{patent_year(args)}/#{patent_file(args)}.zip"
146
+ else
147
+ "http://storage.googleapis.com/patents/grantbib/#{patent_year(args)}/#{patent_file(args)}.zip"
148
+ end
149
+ end
150
+
86
151
  private
87
152
 
88
- def a_week
89
- 'ipgb20120103_wk01'
153
+ def patent_date(args)
154
+ args.fetch :date, Date.parse('20120103')
90
155
  end
91
156
 
92
- def download_patent_grant(fn)
93
- puts "Downloading a sample xml file from"
94
- puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
95
- name, week = fn.split '_'
96
- result = "/tmp/#{name}.xml"
97
- Dir.chdir '/tmp' do
98
- unless File.exists? result
99
- system "curl http://storage.googleapis.com/patents/grantbib/2012/#{fn}.zip > #{fn}.zip"
100
- system "unzip #{fn}.zip"
157
+ def patent_file(args)
158
+ if args[:source] == :full_text
159
+ date = patent_date(args).strftime "%y%m%d"
160
+ file = "ipg#{date}"
161
+ else
162
+ date = patent_date(args).strftime "%Y%m%d_wk%V"
163
+ file = "ipgb#{date}"
164
+ end
165
+ end
166
+
167
+ def patent_year(args)
168
+ patent_date(args).year
169
+ end
170
+
171
+ def download_patent_grant(args)
172
+ location = url(args)
173
+ result = path(args)
174
+ unless File.exists? result
175
+ if location
176
+ puts "Downloading a sample xml file from"
177
+ puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
178
+ puts location
179
+ Dir.chdir '/tmp' do
180
+ system "curl #{location} > #{result}.zip"
181
+ system "unzip #{result}.zip"
182
+ end
183
+ else
184
+ throw "File not found"
101
185
  end
102
186
  end
103
187
  result
@@ -1,7 +1,7 @@
1
1
  module Pacer
2
2
  module Core
3
3
  module StringRoute
4
- def xml_stream(enter = nil, leave = nil)
4
+ def xml_stream(enter = nil, leave = nil, &block)
5
5
  enter ||= /<\?xml/
6
6
  leave ||= enter
7
7
  enter = build_rule :enter, enter
@@ -10,6 +10,7 @@ module Pacer
10
10
  lines << s
11
11
  end.route
12
12
  joined = r.map(element_type: :string, info: 'join', &:join).route
13
+ joined = block.call joined if block
13
14
  joined.xml
14
15
  end
15
16
 
@@ -1,7 +1,7 @@
1
1
  module PacerXml
2
2
  unless const_defined? :VERSION
3
3
  START_TIME = Time.now
4
- VERSION = '0.2.2'
4
+ VERSION = '0.2.3'
5
5
  PACER_VERSION = '>= 1.1.1'
6
6
  end
7
7
  end
data/lib/pacer-xml.rb CHANGED
@@ -37,12 +37,12 @@ require_relative 'pacer-xml/sample'
37
37
 
38
38
  module Pacer
39
39
  class << self
40
- def xml(file, enter = nil, leave = nil)
40
+ def xml(file, enter = nil, leave = nil, &block)
41
41
  if file.is_a? String
42
- file = File.open '/tmp/ipgb20120103.xml'
42
+ file = File.open file
43
43
  end
44
44
  lines = file.each_line.to_route(element_type: :string, info: 'lines').route
45
- lines.xml_stream(enter, leave).route
45
+ lines.xml_stream(enter, leave, &block).route
46
46
  end
47
47
  end
48
48
  end
data/pacer-xml.gemspec CHANGED
@@ -15,6 +15,8 @@ Gem::Specification.new do |s|
15
15
  s.add_dependency 'pacer', PacerXml::PACER_VERSION
16
16
  s.add_dependency 'pacer-neo4j', ">= 2.1"
17
17
  s.add_dependency 'nokogiri'
18
+ s.add_development_dependency 'awesome_print', '0.4.0'
19
+
18
20
 
19
21
  s.rubyforge_project = "pacer-xml"
20
22
 
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: pacer-xml
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.2.2
5
+ version: 0.2.3
6
6
  platform: java
7
7
  authors:
8
8
  - Darrick Wiebe
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-31 00:00:00.000000000 Z
12
+ date: 2013-02-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: pacer
@@ -61,6 +61,22 @@ dependencies:
61
61
  none: false
62
62
  prerelease: false
63
63
  type: :runtime
64
+ - !ruby/object:Gem::Dependency
65
+ name: awesome_print
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - '='
69
+ - !ruby/object:Gem::Version
70
+ version: 0.4.0
71
+ none: false
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - '='
75
+ - !ruby/object:Gem::Version
76
+ version: 0.4.0
77
+ none: false
78
+ prerelease: false
79
+ type: :development
64
80
  description: XML streaming and graph import for Pacer
65
81
  email:
66
82
  - dw@xnlogic.com
@@ -76,6 +92,7 @@ files:
76
92
  - lib/pacer-xml/build_graph.rb
77
93
  - lib/pacer-xml/nokogiri_node.rb
78
94
  - lib/pacer-xml/sample.rb
95
+ - lib/pacer-xml/sample/patent_text.rb
79
96
  - lib/pacer-xml/string_route.rb
80
97
  - lib/pacer-xml/version.rb
81
98
  - lib/pacer-xml/xml_route.rb