pacer-xml 0.2.2-java → 0.2.3-java

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -3,4 +3,5 @@ source "http://rubygems.org"
3
3
  # Specify your gem's dependencies in pacer-graph.gemspec
4
4
  gemspec
5
5
 
6
- gem 'pacer', path: '~/xn/pacer'
6
+ gem 'pacer', path: '../pacer'
7
+ gem 'pacer-neo4j', path: '../pacer-neo4j'
@@ -13,13 +13,16 @@ module PacerXml
13
13
 
14
14
  attr_reader :graph
15
15
  attr_accessor :depth, :documents
16
- attr_reader :rename, :html, :skip
16
+ attr_reader :rename, :html, :skip, :with_body
17
17
 
18
18
  def initialize(graph, opts = {})
19
19
  @documents = 0
20
20
  @graph = graph
21
21
  # treat tag as a property containing html
22
22
  @html = (opts[:html] || []).map(&:to_s).to_set
23
+ # capture the body into a body property in addition to any tags it contains.
24
+ @with_body = (opts[:with_body] || []).map(&:to_s).to_set
25
+
23
26
  # skip property or tag
24
27
  @skip = (opts[:skip] || []).map(&:to_s).to_set
25
28
  # rename type or property
@@ -42,6 +45,7 @@ module PacerXml
42
45
 
43
46
  def visit_vertex_fields(e)
44
47
  h = e.fields
48
+ h['body'] = e.inner_html if with_body? e
45
49
  h['type'] = rename[h['type']]
46
50
  rename.each do |from, to|
47
51
  if h.key? from
@@ -78,6 +82,10 @@ module PacerXml
78
82
  skip.include? e.name or html.include? e.name
79
83
  end
80
84
 
85
+ def with_body?(e)
86
+ with_body.include? e.name
87
+ end
88
+
81
89
  def level
82
90
  self.depth += 1
83
91
  yield
@@ -116,6 +124,7 @@ module PacerXml
116
124
  return nil if skip? rel
117
125
  level do
118
126
  attrs = visit_edge_fields rel
127
+ rel[:body] = rel.inner_text if with_body? rel
119
128
  attrs.delete :type
120
129
  rel.contained_rels.map do |to_e|
121
130
  visit_many_rel(from_e, from, rel, to_e, attrs)
@@ -159,15 +168,15 @@ module PacerXml
159
168
  def build(doc)
160
169
  result = super
161
170
  #tell "CACHE size #{ cache[:size] }, hits:"
162
- if cache[:stats] and documents % 100 == 99
163
- tell '-----------------'
164
- cache.each do |k, adds|
165
- next unless k.is_a? String
166
- adds = adds.length
167
- hits = cache[:hits][k]
168
- tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
169
- end
170
- end
171
+ #if cache[:stats] and documents % 100 == 99
172
+ # tell '-----------------'
173
+ # cache.each do |k, adds|
174
+ # next unless k.is_a? String
175
+ # adds = adds.length
176
+ # hits = cache[:hits][k]
177
+ # tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
178
+ # end
179
+ #end
171
180
  result
172
181
  end
173
182
 
@@ -192,7 +201,7 @@ module PacerXml
192
201
  ct = cache[rename[e.name]]
193
202
  kill = cache[:kill]
194
203
  if kill and cache[:hits][rename[e.name]] == 0 and ct.length > kill
195
- tell "cache kill #{ e.description }"
204
+ #tell "cache kill #{ e.description }"
196
205
  cache[:skip] << rename[e.name]
197
206
  cache[:size] -= ct.length
198
207
  cache[rename[e.name]] = []
File without changes
@@ -6,25 +6,61 @@ module PacerXml
6
6
  # Will actually load 101. To avoid this side-effect of
7
7
  # prefetching, the route should be defined as:
8
8
  # xml_route.limit(100).import(...)
9
- def load_100(*args)
10
- i = importer(*args).limit(100)
9
+ def load_100(*args, &block)
10
+ i = importer(*args, &block).limit(100)
11
11
  i.run!
12
12
  i.graph
13
13
  end
14
14
 
15
+ def load_100_with_text(graph = nil, args = {}, &block)
16
+ load_100 graph, args.merge(source: :full_text), &block
17
+ end
18
+
19
+ def load_all_with_text(graph = nil, args = {}, &block)
20
+ load_all graph, args.merge(source: :full_text), &block
21
+ end
22
+
23
+ def load_all_software(*args)
24
+ load_all_with_text(*args) do |xml_documents|
25
+ xml_documents.select do |raw_xml|
26
+ raw_xml =~ /software/i
27
+ end
28
+ end
29
+ end
30
+
31
+ def load_100_software(*args)
32
+ load_100_with_text(*args) do |xml_documents|
33
+ xml_documents.select do |raw_xml|
34
+ raw_xml =~ /software/i
35
+ end
36
+ end
37
+ end
38
+
15
39
  # Uses a Neo4j graph because the data is too big to fit in memory
16
40
  # without configuring the JVM to use more than its small default
17
41
  # footprint.
18
42
  #
19
- # Alternatively, to start the JVM with more memory, try:
20
- # bundle exec jruby -J-Xmx2048m -S irb
21
- def load_all(graph = nil, *args)
43
+ # Alternatively, to start the JVM with more memory, try:
44
+ # bundle exec jruby -J-Xmx2g -S irb
45
+ def load_all(graph = nil, args = {}, &block)
22
46
  require 'pacer-neo4j'
23
47
  n = Time.now.to_i % 1000000
24
48
  graph ||= Pacer.neo4j "sample.#{n}.graph"
25
- i = importer(graph, *args)
26
- i.run!
27
- i.graph
49
+ i = importer(graph, args, &block)
50
+ if args[:thread]
51
+ t = Thread.new do
52
+ begin
53
+ i.run!
54
+ rescue Exception => e
55
+ pp e
56
+ pp e.backtrace
57
+ end
58
+ end
59
+ t[:graph] = graph
60
+ t
61
+ else
62
+ i
63
+ end
28
64
  end
29
65
 
30
66
  def structure(g)
@@ -49,10 +85,11 @@ module PacerXml
49
85
  #
50
86
  # Import can successfully be run with no options specified, but this patent
51
87
  # xml is particularly hairy.
52
- def importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil)
53
- html = [:abstract]
88
+ def importer(graph = nil, args = {}, &block)
89
+ html = [:abstract, :description]
90
+ with_body = ['claim-text']
54
91
  rename = {
55
- 'classification-national' => 'classification',
92
+ 'classification-national' => 'class',
56
93
  'assistant-examiner' => 'examiner',
57
94
  'primary-examiner' => 'examiner',
58
95
  'us-term-of-grant' => 'term',
@@ -60,21 +97,29 @@ module PacerXml
60
97
  'document-id' => 'document',
61
98
  'us-related-documents' => 'related-document',
62
99
  'us-patent-grant' => 'patent-version',
63
- 'us-bibliographic-data-grant' => 'patent'
100
+ 'us-bibliographic-data-grant' => 'patent',
101
+ "us-field-of-classification-search" => 'possible-class'
64
102
  }
65
- cache = { stats: true }
103
+ skip = Set['classification-ipcr']
104
+ skip_cache = Set['figures', 'figure']
105
+ cache = { stats: true, skip: skip_cache }.merge(args.fetch(:cache, {}))
66
106
  graph ||= Pacer.tg
67
107
  graph.create_key_index :type, :vertex
68
- xml_route = xml(fn, start_rule, end_rule)
69
- xml_route.
70
- process { print '.' }.
71
- import(graph, html: html, rename: rename, cache: cache)
108
+ start_time = Time.now
109
+ n = 0
110
+ xml_route = xml(args, &block)
111
+ unless args[:silent]
112
+ xml_route = xml_route.process do
113
+ n += 1
114
+ puts "\n #{ n } patents in #{ Time.now - start_time }s" if n % 100 == 0
115
+ end
116
+ end
117
+ xml_route.import(graph, html: html, skip: skip, rename: rename, cache: cache, with_body: with_body)
72
118
  end
73
119
 
74
- def xml(fn = nil, *args)
75
- fn ||= a_week
76
- path = download_patent_grant fn
77
- Pacer.xml path, *args
120
+ def xml(args, &block)
121
+ path = download_patent_grant args
122
+ Pacer.xml path, args[:start_chunk_rule], args[:end_chunk_rule], &block
78
123
  end
79
124
 
80
125
  def cleanup(fn = nil)
@@ -83,21 +128,60 @@ module PacerXml
83
128
  Dir["/tmp/#{name}*"].each { |f| File.delete f }
84
129
  end
85
130
 
131
+ def path(args)
132
+ if args[:path]
133
+ args[:path]
134
+ else
135
+ "/tmp/#{patent_file(args)}.xml"
136
+ end
137
+ end
138
+
139
+ def url(args)
140
+ if args[:url]
141
+ args[:url]
142
+ elsif args[:path]
143
+ nil
144
+ elsif args[:source] == :full_text
145
+ "http://storage.googleapis.com/patents/grant_full_text/#{patent_year(args)}/#{patent_file(args)}.zip"
146
+ else
147
+ "http://storage.googleapis.com/patents/grantbib/#{patent_year(args)}/#{patent_file(args)}.zip"
148
+ end
149
+ end
150
+
86
151
  private
87
152
 
88
- def a_week
89
- 'ipgb20120103_wk01'
153
+ def patent_date(args)
154
+ args.fetch :date, Date.parse('20120103')
90
155
  end
91
156
 
92
- def download_patent_grant(fn)
93
- puts "Downloading a sample xml file from"
94
- puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
95
- name, week = fn.split '_'
96
- result = "/tmp/#{name}.xml"
97
- Dir.chdir '/tmp' do
98
- unless File.exists? result
99
- system "curl http://storage.googleapis.com/patents/grantbib/2012/#{fn}.zip > #{fn}.zip"
100
- system "unzip #{fn}.zip"
157
+ def patent_file(args)
158
+ if args[:source] == :full_text
159
+ date = patent_date(args).strftime "%y%m%d"
160
+ file = "ipg#{date}"
161
+ else
162
+ date = patent_date(args).strftime "%Y%m%d_wk%V"
163
+ file = "ipgb#{date}"
164
+ end
165
+ end
166
+
167
+ def patent_year(args)
168
+ patent_date(args).year
169
+ end
170
+
171
+ def download_patent_grant(args)
172
+ location = url(args)
173
+ result = path(args)
174
+ unless File.exists? result
175
+ if location
176
+ puts "Downloading a sample xml file from"
177
+ puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
178
+ puts location
179
+ Dir.chdir '/tmp' do
180
+ system "curl #{location} > #{result}.zip"
181
+ system "unzip #{result}.zip"
182
+ end
183
+ else
184
+ throw "File not found"
101
185
  end
102
186
  end
103
187
  result
@@ -1,7 +1,7 @@
1
1
  module Pacer
2
2
  module Core
3
3
  module StringRoute
4
- def xml_stream(enter = nil, leave = nil)
4
+ def xml_stream(enter = nil, leave = nil, &block)
5
5
  enter ||= /<\?xml/
6
6
  leave ||= enter
7
7
  enter = build_rule :enter, enter
@@ -10,6 +10,7 @@ module Pacer
10
10
  lines << s
11
11
  end.route
12
12
  joined = r.map(element_type: :string, info: 'join', &:join).route
13
+ joined = block.call joined if block
13
14
  joined.xml
14
15
  end
15
16
 
@@ -1,7 +1,7 @@
1
1
  module PacerXml
2
2
  unless const_defined? :VERSION
3
3
  START_TIME = Time.now
4
- VERSION = '0.2.2'
4
+ VERSION = '0.2.3'
5
5
  PACER_VERSION = '>= 1.1.1'
6
6
  end
7
7
  end
data/lib/pacer-xml.rb CHANGED
@@ -37,12 +37,12 @@ require_relative 'pacer-xml/sample'
37
37
 
38
38
  module Pacer
39
39
  class << self
40
- def xml(file, enter = nil, leave = nil)
40
+ def xml(file, enter = nil, leave = nil, &block)
41
41
  if file.is_a? String
42
- file = File.open '/tmp/ipgb20120103.xml'
42
+ file = File.open file
43
43
  end
44
44
  lines = file.each_line.to_route(element_type: :string, info: 'lines').route
45
- lines.xml_stream(enter, leave).route
45
+ lines.xml_stream(enter, leave, &block).route
46
46
  end
47
47
  end
48
48
  end
data/pacer-xml.gemspec CHANGED
@@ -15,6 +15,8 @@ Gem::Specification.new do |s|
15
15
  s.add_dependency 'pacer', PacerXml::PACER_VERSION
16
16
  s.add_dependency 'pacer-neo4j', ">= 2.1"
17
17
  s.add_dependency 'nokogiri'
18
+ s.add_development_dependency 'awesome_print', '0.4.0'
19
+
18
20
 
19
21
  s.rubyforge_project = "pacer-xml"
20
22
 
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: pacer-xml
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.2.2
5
+ version: 0.2.3
6
6
  platform: java
7
7
  authors:
8
8
  - Darrick Wiebe
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-31 00:00:00.000000000 Z
12
+ date: 2013-02-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: pacer
@@ -61,6 +61,22 @@ dependencies:
61
61
  none: false
62
62
  prerelease: false
63
63
  type: :runtime
64
+ - !ruby/object:Gem::Dependency
65
+ name: awesome_print
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - '='
69
+ - !ruby/object:Gem::Version
70
+ version: 0.4.0
71
+ none: false
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - '='
75
+ - !ruby/object:Gem::Version
76
+ version: 0.4.0
77
+ none: false
78
+ prerelease: false
79
+ type: :development
64
80
  description: XML streaming and graph import for Pacer
65
81
  email:
66
82
  - dw@xnlogic.com
@@ -76,6 +92,7 @@ files:
76
92
  - lib/pacer-xml/build_graph.rb
77
93
  - lib/pacer-xml/nokogiri_node.rb
78
94
  - lib/pacer-xml/sample.rb
95
+ - lib/pacer-xml/sample/patent_text.rb
79
96
  - lib/pacer-xml/string_route.rb
80
97
  - lib/pacer-xml/version.rb
81
98
  - lib/pacer-xml/xml_route.rb