pacer-xml 0.2.2-java → 0.2.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -1
- data/lib/pacer-xml/build_graph.rb +20 -11
- data/lib/pacer-xml/sample/patent_text.rb +0 -0
- data/lib/pacer-xml/sample.rb +116 -32
- data/lib/pacer-xml/string_route.rb +2 -1
- data/lib/pacer-xml/version.rb +1 -1
- data/lib/pacer-xml.rb +3 -3
- data/pacer-xml.gemspec +2 -0
- metadata +19 -2
data/Gemfile
CHANGED
@@ -13,13 +13,16 @@ module PacerXml
|
|
13
13
|
|
14
14
|
attr_reader :graph
|
15
15
|
attr_accessor :depth, :documents
|
16
|
-
attr_reader :rename, :html, :skip
|
16
|
+
attr_reader :rename, :html, :skip, :with_body
|
17
17
|
|
18
18
|
def initialize(graph, opts = {})
|
19
19
|
@documents = 0
|
20
20
|
@graph = graph
|
21
21
|
# treat tag as a property containing html
|
22
22
|
@html = (opts[:html] || []).map(&:to_s).to_set
|
23
|
+
# capture the body into a body property in addition to any tags it contains.
|
24
|
+
@with_body = (opts[:with_body] || []).map(&:to_s).to_set
|
25
|
+
|
23
26
|
# skip property or tag
|
24
27
|
@skip = (opts[:skip] || []).map(&:to_s).to_set
|
25
28
|
# rename type or property
|
@@ -42,6 +45,7 @@ module PacerXml
|
|
42
45
|
|
43
46
|
def visit_vertex_fields(e)
|
44
47
|
h = e.fields
|
48
|
+
h['body'] = e.inner_html if with_body? e
|
45
49
|
h['type'] = rename[h['type']]
|
46
50
|
rename.each do |from, to|
|
47
51
|
if h.key? from
|
@@ -78,6 +82,10 @@ module PacerXml
|
|
78
82
|
skip.include? e.name or html.include? e.name
|
79
83
|
end
|
80
84
|
|
85
|
+
def with_body?(e)
|
86
|
+
with_body.include? e.name
|
87
|
+
end
|
88
|
+
|
81
89
|
def level
|
82
90
|
self.depth += 1
|
83
91
|
yield
|
@@ -116,6 +124,7 @@ module PacerXml
|
|
116
124
|
return nil if skip? rel
|
117
125
|
level do
|
118
126
|
attrs = visit_edge_fields rel
|
127
|
+
rel[:body] = rel.inner_text if with_body? rel
|
119
128
|
attrs.delete :type
|
120
129
|
rel.contained_rels.map do |to_e|
|
121
130
|
visit_many_rel(from_e, from, rel, to_e, attrs)
|
@@ -159,15 +168,15 @@ module PacerXml
|
|
159
168
|
def build(doc)
|
160
169
|
result = super
|
161
170
|
#tell "CACHE size #{ cache[:size] }, hits:"
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
+
#if cache[:stats] and documents % 100 == 99
|
172
|
+
# tell '-----------------'
|
173
|
+
# cache.each do |k, adds|
|
174
|
+
# next unless k.is_a? String
|
175
|
+
# adds = adds.length
|
176
|
+
# hits = cache[:hits][k]
|
177
|
+
# tell("%40s: %6s / %6s = %5.4f" % [k, hits, adds, (hits/adds.to_f)])
|
178
|
+
# end
|
179
|
+
#end
|
171
180
|
result
|
172
181
|
end
|
173
182
|
|
@@ -192,7 +201,7 @@ module PacerXml
|
|
192
201
|
ct = cache[rename[e.name]]
|
193
202
|
kill = cache[:kill]
|
194
203
|
if kill and cache[:hits][rename[e.name]] == 0 and ct.length > kill
|
195
|
-
tell "cache kill #{ e.description }"
|
204
|
+
#tell "cache kill #{ e.description }"
|
196
205
|
cache[:skip] << rename[e.name]
|
197
206
|
cache[:size] -= ct.length
|
198
207
|
cache[rename[e.name]] = []
|
File without changes
|
data/lib/pacer-xml/sample.rb
CHANGED
@@ -6,25 +6,61 @@ module PacerXml
|
|
6
6
|
# Will actually load 101. To avoid this side-effect of
|
7
7
|
# prefetching, the route should be defined as:
|
8
8
|
# xml_route.limit(100).import(...)
|
9
|
-
def load_100(*args)
|
10
|
-
i = importer(*args).limit(100)
|
9
|
+
def load_100(*args, &block)
|
10
|
+
i = importer(*args, &block).limit(100)
|
11
11
|
i.run!
|
12
12
|
i.graph
|
13
13
|
end
|
14
14
|
|
15
|
+
def load_100_with_text(graph = nil, args = {}, &block)
|
16
|
+
load_100 graph, args.merge(source: :full_text), &block
|
17
|
+
end
|
18
|
+
|
19
|
+
def load_all_with_text(graph = nil, args = {}, &block)
|
20
|
+
load_all graph, args.merge(source: :full_text), &block
|
21
|
+
end
|
22
|
+
|
23
|
+
def load_all_software(*args)
|
24
|
+
load_all_with_text(*args) do |xml_documents|
|
25
|
+
xml_documents.select do |raw_xml|
|
26
|
+
raw_xml =~ /software/i
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def load_100_software(*args)
|
32
|
+
load_100_with_text(*args) do |xml_documents|
|
33
|
+
xml_documents.select do |raw_xml|
|
34
|
+
raw_xml =~ /software/i
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
15
39
|
# Uses a Neo4j graph because the data is too big to fit in memory
|
16
40
|
# without configuring the JVM to use more than its small default
|
17
41
|
# footprint.
|
18
42
|
#
|
19
|
-
# Alternatively,
|
20
|
-
# bundle exec jruby -J-
|
21
|
-
def load_all(graph = nil,
|
43
|
+
# Alternatively, To start the JVM with more memory, try:
|
44
|
+
# bundle exec jruby -J-Xmx2g -S irb
|
45
|
+
def load_all(graph = nil, args = {}, &block)
|
22
46
|
require 'pacer-neo4j'
|
23
47
|
n = Time.now.to_i % 1000000
|
24
48
|
graph ||= Pacer.neo4j "sample.#{n}.graph"
|
25
|
-
i = importer(graph,
|
26
|
-
|
27
|
-
|
49
|
+
i = importer(graph, args, &block)
|
50
|
+
if args[:thread]
|
51
|
+
t = Thread.new do
|
52
|
+
begin
|
53
|
+
i.run!
|
54
|
+
rescue Exception => e
|
55
|
+
pp e
|
56
|
+
pp e.backtrace
|
57
|
+
end
|
58
|
+
end
|
59
|
+
t[:graph] = graph
|
60
|
+
t
|
61
|
+
else
|
62
|
+
i
|
63
|
+
end
|
28
64
|
end
|
29
65
|
|
30
66
|
def structure(g)
|
@@ -49,10 +85,11 @@ module PacerXml
|
|
49
85
|
#
|
50
86
|
# Import can successfully be run with no options specified, but this patent
|
51
87
|
# xml is particularly hairy.
|
52
|
-
def importer(graph = nil,
|
53
|
-
html = [:abstract]
|
88
|
+
def importer(graph = nil, args = {}, &block)
|
89
|
+
html = [:abstract, :description]
|
90
|
+
with_body = ['claim-text']
|
54
91
|
rename = {
|
55
|
-
'classification-national' => '
|
92
|
+
'classification-national' => 'class',
|
56
93
|
'assistant-examiner' => 'examiner',
|
57
94
|
'primary-examiner' => 'examiner',
|
58
95
|
'us-term-of-grant' => 'term',
|
@@ -60,21 +97,29 @@ module PacerXml
|
|
60
97
|
'document-id' => 'document',
|
61
98
|
'us-related-documents' => 'related-document',
|
62
99
|
'us-patent-grant' => 'patent-version',
|
63
|
-
'us-bibliographic-data-grant' => 'patent'
|
100
|
+
'us-bibliographic-data-grant' => 'patent',
|
101
|
+
"us-field-of-classification-search" => 'possible-class'
|
64
102
|
}
|
65
|
-
|
103
|
+
skip = Set['classification-ipcr']
|
104
|
+
skip_cache = Set['figures', 'figure']
|
105
|
+
cache = { stats: true, skip: skip_cache }.merge(args.fetch(:cache, {}))
|
66
106
|
graph ||= Pacer.tg
|
67
107
|
graph.create_key_index :type, :vertex
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
108
|
+
start_time = Time.now
|
109
|
+
n = 0
|
110
|
+
xml_route = xml(args, &block)
|
111
|
+
unless args[:silent]
|
112
|
+
xml_route = xml_route.process do
|
113
|
+
n += 1
|
114
|
+
puts "\n #{ n } patents in #{ Time.now - start_time }s" if n % 100 == 0
|
115
|
+
end
|
116
|
+
end
|
117
|
+
xml_route.import(graph, html: html, skip: skip, rename: rename, cache: cache, with_body: with_body)
|
72
118
|
end
|
73
119
|
|
74
|
-
def xml(
|
75
|
-
|
76
|
-
path
|
77
|
-
Pacer.xml path, *args
|
120
|
+
def xml(args, &block)
|
121
|
+
path = download_patent_grant args
|
122
|
+
Pacer.xml path, args[:start_chunk_rule], args[:end_chunk_rule], &block
|
78
123
|
end
|
79
124
|
|
80
125
|
def cleanup(fn = nil)
|
@@ -83,21 +128,60 @@ module PacerXml
|
|
83
128
|
Dir["/tmp/#{name}*"].each { |f| File.delete f }
|
84
129
|
end
|
85
130
|
|
131
|
+
def path(args)
|
132
|
+
if args[:path]
|
133
|
+
args[:path]
|
134
|
+
else
|
135
|
+
"/tmp/#{patent_file(args)}.xml"
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
def url(args)
|
140
|
+
if args[:url]
|
141
|
+
args[:url]
|
142
|
+
elsif args[:path]
|
143
|
+
nil
|
144
|
+
elsif args[:source] == :full_text
|
145
|
+
"http://storage.googleapis.com/patents/grant_full_text/#{patent_year(args)}/#{patent_file(args)}.zip"
|
146
|
+
else
|
147
|
+
"http://storage.googleapis.com/patents/grantbib/#{patent_year(args)}/#{patent_file(args)}.zip"
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
86
151
|
private
|
87
152
|
|
88
|
-
def
|
89
|
-
'
|
153
|
+
def patent_date(args)
|
154
|
+
args.fetch :date, Date.parse('20120103')
|
90
155
|
end
|
91
156
|
|
92
|
-
def
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
157
|
+
def patent_file(args)
|
158
|
+
if args[:source] == :full_text
|
159
|
+
date = patent_date(args).strftime "%y%m%d"
|
160
|
+
file = "ipg#{date}"
|
161
|
+
else
|
162
|
+
date = patent_date(args).strftime "%Y%m%d_wk%V"
|
163
|
+
file = "ipgb#{date}"
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
def patent_year(args)
|
168
|
+
patent_date(args).year
|
169
|
+
end
|
170
|
+
|
171
|
+
def download_patent_grant(args)
|
172
|
+
location = url(args)
|
173
|
+
result = path(args)
|
174
|
+
unless File.exists? result
|
175
|
+
if location
|
176
|
+
puts "Downloading a sample xml file from"
|
177
|
+
puts "http://www.google.com/googlebooks/uspto-patents-grants-biblio.html"
|
178
|
+
puts location
|
179
|
+
Dir.chdir '/tmp' do
|
180
|
+
system "curl #{location} > #{result}.zip"
|
181
|
+
system "unzip #{result}.zip"
|
182
|
+
end
|
183
|
+
else
|
184
|
+
throw "File not found"
|
101
185
|
end
|
102
186
|
end
|
103
187
|
result
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Pacer
|
2
2
|
module Core
|
3
3
|
module StringRoute
|
4
|
-
def xml_stream(enter = nil, leave = nil)
|
4
|
+
def xml_stream(enter = nil, leave = nil, &block)
|
5
5
|
enter ||= /<\?xml/
|
6
6
|
leave ||= enter
|
7
7
|
enter = build_rule :enter, enter
|
@@ -10,6 +10,7 @@ module Pacer
|
|
10
10
|
lines << s
|
11
11
|
end.route
|
12
12
|
joined = r.map(element_type: :string, info: 'join', &:join).route
|
13
|
+
joined = block.call joined if block
|
13
14
|
joined.xml
|
14
15
|
end
|
15
16
|
|
data/lib/pacer-xml/version.rb
CHANGED
data/lib/pacer-xml.rb
CHANGED
@@ -37,12 +37,12 @@ require_relative 'pacer-xml/sample'
|
|
37
37
|
|
38
38
|
module Pacer
|
39
39
|
class << self
|
40
|
-
def xml(file, enter = nil, leave = nil)
|
40
|
+
def xml(file, enter = nil, leave = nil, &block)
|
41
41
|
if file.is_a? String
|
42
|
-
file = File.open
|
42
|
+
file = File.open file
|
43
43
|
end
|
44
44
|
lines = file.each_line.to_route(element_type: :string, info: 'lines').route
|
45
|
-
lines.xml_stream(enter, leave).route
|
45
|
+
lines.xml_stream(enter, leave, &block).route
|
46
46
|
end
|
47
47
|
end
|
48
48
|
end
|
data/pacer-xml.gemspec
CHANGED
@@ -15,6 +15,8 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.add_dependency 'pacer', PacerXml::PACER_VERSION
|
16
16
|
s.add_dependency 'pacer-neo4j', ">= 2.1"
|
17
17
|
s.add_dependency 'nokogiri'
|
18
|
+
s.add_development_dependency 'awesome_print', '0.4.0'
|
19
|
+
|
18
20
|
|
19
21
|
s.rubyforge_project = "pacer-xml"
|
20
22
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: pacer-xml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.2.
|
5
|
+
version: 0.2.3
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Darrick Wiebe
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-02-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: pacer
|
@@ -61,6 +61,22 @@ dependencies:
|
|
61
61
|
none: false
|
62
62
|
prerelease: false
|
63
63
|
type: :runtime
|
64
|
+
- !ruby/object:Gem::Dependency
|
65
|
+
name: awesome_print
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - '='
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: 0.4.0
|
71
|
+
none: false
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - '='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 0.4.0
|
77
|
+
none: false
|
78
|
+
prerelease: false
|
79
|
+
type: :development
|
64
80
|
description: XML streaming and graph import for Pacer
|
65
81
|
email:
|
66
82
|
- dw@xnlogic.com
|
@@ -76,6 +92,7 @@ files:
|
|
76
92
|
- lib/pacer-xml/build_graph.rb
|
77
93
|
- lib/pacer-xml/nokogiri_node.rb
|
78
94
|
- lib/pacer-xml/sample.rb
|
95
|
+
- lib/pacer-xml/sample/patent_text.rb
|
79
96
|
- lib/pacer-xml/string_route.rb
|
80
97
|
- lib/pacer-xml/version.rb
|
81
98
|
- lib/pacer-xml/xml_route.rb
|