pdf-extract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/assign.rb +72 -0
- data/bin/config.json +4 -0
- data/bin/fac_v19n11_s5.mask.pdf +0 -0
- data/bin/margins.mask.pdf +0 -0
- data/bin/one-column.mask.pdf +24110 -39
- data/bin/pdf-extract +146 -0
- data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
- data/bin/some3.mask.pdf +0 -0
- data/bin/some5.mask.pdf +0 -0
- data/bin/some6.mask.pdf +0 -0
- data/bin/train.rb +48 -0
- data/bin/two-column.mask.pdf +0 -0
- data/data/familynames.db +0 -0
- data/data/stopwords.txt +1 -0
- data/lib/analysis/columns.rb +75 -0
- data/lib/analysis/margins.rb +84 -0
- data/lib/analysis/sections.rb +156 -0
- data/lib/analysis/titles.rb +53 -0
- data/lib/analysis/zones.rb +128 -0
- data/lib/font_metrics.rb +240 -0
- data/lib/kmeans.rb +114 -0
- data/lib/language.rb +58 -0
- data/lib/model/characters.rb +320 -0
- data/lib/model/chunks.rb +103 -0
- data/lib/model/regions.rb +112 -0
- data/lib/multi_range.rb +69 -0
- data/lib/names.rb +85 -0
- data/lib/pdf-extract.rb +77 -0
- data/lib/pdf.rb +255 -0
- data/lib/references/references.rb +184 -0
- data/lib/references/resolve.rb +113 -0
- data/lib/references/resolved_references.rb +37 -0
- data/lib/spatial.rb +188 -0
- data/lib/view/abstract_view.rb +32 -0
- data/lib/view/pdf_view.rb +43 -0
- data/lib/view/png_view.rb +30 -0
- data/lib/view/xml_view.rb +113 -0
- metadata +208 -0
data/lib/pdf-extract.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require_relative 'pdf'
|
2
|
+
require_relative 'model/characters'
|
3
|
+
require_relative 'model/chunks'
|
4
|
+
require_relative 'model/regions'
|
5
|
+
require_relative 'analysis/titles'
|
6
|
+
require_relative 'analysis/margins'
|
7
|
+
require_relative 'analysis/zones'
|
8
|
+
require_relative 'analysis/columns'
|
9
|
+
require_relative 'analysis/sections'
|
10
|
+
require_relative 'references/references'
|
11
|
+
require_relative 'references/resolved_references'
|
12
|
+
require_relative 'view/png_view'
|
13
|
+
require_relative 'view/pdf_view'
|
14
|
+
require_relative 'view/xml_view'
|
15
|
+
|
16
|
+
# Library entry point: keeps the registry of parser modules and view classes
# and drives a parse of a single PDF file.
module PdfExtract

  # View classes keyed by the short name given to add_view (e.g. :pdf).
  @views = {}

  # Parser modules in registration order. Each must respond to
  # `include_in(pdf)`.
  @parsers = []

  # Registers +view_class+ under the short name +name+.
  def self.add_view name, view_class
    @views[name] = view_class
  end

  # Appends +parser_class+ to the list of parsers applied to every new Pdf.
  def self.add_parser parser_class
    @parsers << parser_class
  end

  # Builds a Pdf, lets every registered parser install itself on it, hands
  # the Pdf to the optional block, then runs each recorded spatial call
  # against +filename+.
  #
  # Returns the populated Pdf.
  def self.parse filename, &block
    pdf = Pdf.new

    @parsers.each { |parser| parser.include_in pdf }

    # Calling parse without a block is harmless (no spatial calls get
    # recorded), so do not fail with LocalJumpError in that case.
    block.call pdf unless block.nil?

    pdf.spatial_calls.each do |spatial_call|
      name = spatial_call[:name]
      # A fresh Receiver per spatial call keeps listeners of different
      # spatial types apart.
      receiver = Receiver.new pdf
      pdf.spatial_builders[name].call receiver
      receiver.invoke_calls filename, pdf.spatial_options[name]
    end

    pdf
  end

  # Returns the view class registered as +short_name+, or nil when unknown.
  def self.view_class short_name
    @views[short_name]
  end

  # Parses +filename+ (see parse) and renders the result with the view named
  # by options[:as]. The full +options+ hash is passed on to the view's
  # `render`.
  #
  # Raises ArgumentError when options[:as] does not name a registered view.
  def self.view filename, options = {}, &block
    renderer = view_class options[:as]
    if renderer.nil?
      # Fail before the parse instead of with NoMethodError on nil after it.
      raise ArgumentError,
            "Unknown view #{options[:as].inspect} (registered: #{@views.keys.inspect})"
    end

    pdf = parse filename, &block
    renderer.new(pdf, filename).render options
  end

  # Registers the built-in parsers and views.
  def self.init
    [Characters, Chunks, Regions, Titles, Margins, Zones, Columns,
     Sections, References, ResolvedReferences].each do |parser|
      add_parser parser
    end

    add_view :pdf, PdfView
    add_view :png, PngView
    add_view :xml, XmlView
  end

  init

end
|
data/lib/pdf.rb
ADDED
@@ -0,0 +1,255 @@
|
|
1
|
+
require 'pdf-reader'
|
2
|
+
|
3
|
+
module PdfExtract
|
4
|
+
|
5
|
+
# Named settings with class-wide defaults and per-instance overrides.
#
# Defaults are declared once through Settings.default and are shared by all
# instances. Overrides are stored per instance by #set, together with the
# name of the agent that made the change.
class Settings

  # Declared defaults, shared by every Settings instance.
  @@defaults = {}

  # Declares +key+ as a known setting with the given default value.
  def self.default key, default_value
    @@defaults[key] = default_value
  end

  def initialize
    @settings = {}
    @agents = {}
  end

  # Returns the override for +key+ if one was set, otherwise the declared
  # default. Raises RuntimeError for a setting that was never declared.
  #
  # Presence is tested with key? rather than truthiness so that a declared
  # default of false or nil is returned instead of being reported as
  # undeclared.
  def [] key
    return @settings[key] if @settings.key? key
    return @@defaults[key] if @@defaults.key? key

    raise "Attempt to use undeclared setting \"#{key}\""
  end

  # Overrides +key+ with +value+ (coerced with to_f) and records +agent+ as
  # the origin of the change. Raises RuntimeError when +key+ was never
  # declared.
  def set key, value, agent=""
    unless @@defaults.key? key
      raise "Attempt to set an undefined setting \"#{key}\""
    end

    @settings[key] = value.to_f
    @agents[key] = agent
  end

  # Returns the declared defaults that have not been overridden on this
  # instance.
  def unmodified
    @@defaults.reject { |key, _| @settings.key? key }
  end

  # Returns the overrides made on this instance.
  def modified
    @settings
  end

  # Returns the agent recorded for the override of +key+, or nil if +key+ was
  # never overridden.
  def agent key
    @agents[key]
  end

end
|
45
|
+
|
46
|
+
# Collects the listeners that a spatial builder registers and runs them.
#
# A builder can register:
# * `for`     - a handler installed as a method of this object; the receiver
#               is then handed to PDF::Reader.file (presumably pdf-reader
#               invokes the methods named after its callbacks - confirm
#               against the pdf-reader version in use),
# * `objects` - a listener called for every stored spatial object of a type,
# * `before` / `after` - hooks run around the object listeners.
#
# Objects returned by `for` handlers and by the `after` hook are stored in
# pdf.spatial_objects under the type that was being built when the listener
# was registered, unless the object names another type in its :group key.
class Receiver

  def initialize pdf
    @pdf = pdf
    @listeners = {}
    @object_listeners = {}
  end

  # Registers +block+ as the handler for +callback_name+. A later
  # registration under the same name replaces the earlier one.
  def for callback_name, &block
    @listeners[callback_name] = {:type => @pdf.operating_type, :fn => block}
  end

  # Registers +block+ to be called with each spatial object of +type_name+.
  def objects type_name, &block
    (@object_listeners[type_name] ||= []) << block
  end

  # Registers a hook run before the object listeners.
  def before &block
    @before = block
  end

  # Registers a hook run after the object listeners; its return value is
  # stored as spatial objects.
  def after &block
    @after = {:type => @pdf.operating_type, :fn => block}
  end

  # Turns every `for` listener into a method on this receiver.
  #
  # The methods are defined on this object only. Defining them on the class
  # made every later Receiver respond to the callbacks of earlier ones and
  # re-run their handlers.
  def expand_listeners_to_callback_methods
    # TODO merge on callback_name
    @listeners.each_pair do |callback_name, callback_handler|
      define_singleton_method callback_name do |*args|
        # The handler receives all callback arguments as one array.
        spatial_objects = callback_handler[:fn].call args
        add_spatial_objects callback_handler[:type], spatial_objects
      end
    end
  end

  # Calls every object listener with each object of its type taken from
  # +spatial_objects+ (a hash of type => array of objects).
  #
  # Raises RuntimeError when no entry exists for a listened-to type.
  def call_object_listeners spatial_objects
    @object_listeners.each_pair do |type, listeners|
      objs = spatial_objects[type]
      if objs.nil?
        raise "#{@pdf.operating_type} is missing a dependency on #{type}"
      end

      listeners.each do |listener|
        objs.each { |obj| listener.call obj }
      end
    end
  end

  # Runs the `after` hook, if any, and stores what it returns.
  def call_after
    add_spatial_objects @after[:type], @after[:fn].call unless @after.nil?
  end

  # Runs the `before` hook, if any.
  def call_before
    @before.call unless @before.nil?
  end

  # True when at least one `for` listener is registered.
  def for_calls?
    @listeners.size > 0
  end

  # True when at least one `objects` listener is registered.
  def object_calls?
    @object_listeners.size > 0
  end

  # Stores +objs+ (nil, a single object, or an array of objects) in
  # pdf.spatial_objects. Each object is filed under its :group value, which
  # is removed from the object, or under +default_type+ when it has none.
  def add_spatial_objects default_type, objs
    objs = objs.nil? ? [] : [objs] unless objs.is_a? Array

    objs.each do |obj|
      type = obj.delete(:group) || default_type
      (@pdf.spatial_objects[type] ||= []) << obj
    end
  end

  # Runs the registered listeners for one spatial type.
  #
  # With spatial_options[:paged] the before/object/after cycle runs once per
  # page that holds objects of a listened-to type; otherwise it runs once
  # over all stored objects. Afterwards, if `for` listeners exist, they are
  # installed as methods and +filename+ is read through PDF::Reader with this
  # object as the receiver.
  def invoke_calls filename, spatial_options
    if spatial_options[:paged]
      invoke_paged_calls
    else
      call_before
      call_object_listeners @pdf.spatial_objects if object_calls?
      call_after
    end

    if for_calls?
      expand_listeners_to_callback_methods
      PDF::Reader.file filename, self, :raw_text => true
    end
  end

  private

  # Groups the objects of every listened-to type by page and runs the
  # before/object/after cycle once per page.
  def invoke_paged_calls
    paged_objs = {}
    @object_listeners.each_key do |type|
      @pdf.paged_objects(type).each_pair do |page, objs|
        (paged_objs[page] ||= {})[type] = objs
      end
    end

    paged_objs.each_value do |objs|
      call_before
      call_object_listeners objs
      call_after
    end
  end

end
|
165
|
+
|
166
|
+
# Holds everything known about one document while it is being parsed: the
# declared spatial types, the order in which they must be built, the objects
# extracted so far and the settings in force.
#
# Parsers declare a spatial type with #spatials. That defines a method of the
# same name on this object; calling it requests the type (and, first, its
# dependencies) by appending to spatial_calls.
class Pdf

  attr_accessor :operating_type, :spatial_calls, :spatial_builders, :spatial_objects
  attr_accessor :spatial_options, :settings

  # Any unknown method is reported as an undeclared spatial type.
  def method_missing name, *args
    raise "No such spatial type #{name}"
  end

  # Declares the spatial type +name+.
  #
  # options[:depends_on] lists the types that must be built first. +block+
  # is the builder; it is stored and later called with a receiver.
  def spatials name, options = {}, &block
    add_spatials_method name, options, &block
  end

  def initialize
    @spatial_builders = {}
    @spatial_calls = []
    @spatial_objects = {}
    @spatial_options = {}
    @settings = Settings.new
  end

  # True when +name+ was requested directly rather than only pulled in as a
  # dependency.
  def explicit_call? name
    @spatial_calls.any? { |call| call[:name] == name && call[:explicit] }
  end

  # Returns the stored objects of +type+ grouped by their :page value, as a
  # hash of page => array. Empty when nothing is stored for +type+.
  def paged_objects type
    (@spatial_objects[type] || []).group_by { |obj| obj[:page] }
  end

  # Returns the stored objects of +type+, or nil when the type is unknown.
  def [](type)
    @spatial_objects[type]
  end

  # Overrides +setting+ on this document's Settings.
  def set setting, value, agent=""
    @settings.set setting, value, agent
  end

  private

  # Appends every type in +deps_list+ (depth first, so dependencies come
  # before their dependents) to spatial_calls as a non-explicit call, unless
  # a call for that type is already recorded.
  #
  # Raises RuntimeError for a dependency that was never declared.
  def append_deps deps_list
    deps_list.each do |dep|
      dep_options = @spatial_options[dep]
      raise "No such spatial type #{dep}" if dep_options.nil?

      append_deps dep_options.fetch(:depends_on, [])

      unless @spatial_calls.any? { |call| call[:name] == dep }
        @spatial_calls << {:name => dep, :explicit => false}
      end
    end
  end

  # Records the builder and options for +name+ and defines the request
  # method +name+ on this object.
  def add_spatials_method name, options={}, &block
    options = {:depends_on => [], :defined_by => []}.merge options

    @spatial_objects[name] = []
    @spatial_builders[name] = proc { |receiver|
      # Listeners registered by the builder are tagged with this type.
      @operating_type = name
      block.call receiver unless block.nil?
    }
    @spatial_options[name] = options

    # Defined on this instance only, so one Pdf never picks up (or rewrites)
    # the spatial methods of another.
    define_singleton_method name do |&visitor|
      append_deps options[:depends_on]

      # A type already queued as a dependency is promoted to explicit rather
      # than queued - and therefore built - a second time.
      queued = @spatial_calls.find { |call| call[:name] == name }
      if queued.nil?
        @spatial_calls << {:name => name, :explicit => true}
      else
        queued[:explicit] = true
      end

      # Pass already-extracted objects to the caller's own block. A bare
      # `yield` here would target the builder block instead.
      objs = @spatial_objects[name]
      objs.each(&visitor) unless visitor.nil?
      objs
    end
  end

end
|
253
|
+
|
254
|
+
end
|
255
|
+
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require_relative "../spatial"
|
2
|
+
|
3
|
+
module PdfExtract
|
4
|
+
module References
|
5
|
+
|
6
|
+
# Lowest :reference_score a section needs before include_in tries to split
# it into references.
Settings.default(:min_score, 6.4)

# Number of consecutive increments numeric_sequence? must count before the
# text is treated as a numbered list.
Settings.default(:min_sequence_count, 3)

# Numbers at or above this value are ignored when looking for reference
# numbers (the code notes this avoids mistaking years for a sequence).
Settings.default(:max_reference_order, 1000)
|
9
|
+
|
10
|
+
# Splits +ary+ into consecutive groups, starting a new group at every item
# for which the block returns a truthy value. Items before the first such
# item form a group of their own. Empty groups are never returned.
#
# The group that is still open when the input ends is included. Without that
# the last reference of every list would be lost.
#
#   partition_by([1, 2, 1, 3]) { |x| x == 1 }  # => [[1, 2], [1, 3]]
def self.partition_by ary, &block
  ary.slice_before(&block).to_a
end
|
22
|
+
|
23
|
+
# Counts how often each floored value of line[delimit_key] occurs in +lines+.
#
# Returns an array of {:value => floored_value, :count => occurrences}
# hashes, highest count first.
def self.frequencies lines, delimit_key
  counts = Hash.new(0)
  lines.each { |line| counts[line[delimit_key].floor] += 1 }

  tallied = counts.map { |value, count| {:value => value, :count => count} }
  tallied.sort_by { |entry| entry[:count] }.reverse
end
|
38
|
+
|
39
|
+
# Returns the second most frequent floored value of line[delimit_key] among
# +lines+; callers use it as the value that marks the first line of a
# reference.
#
# Returns nil when +lines+ holds fewer than two distinct values, instead of
# failing with NoMethodError on nil.
def self.select_delimiter lines, delimit_key
  runner_up = frequencies(lines, delimit_key)[1]
  runner_up && runner_up[:value]
end
|
42
|
+
|
43
|
+
# Shared implementation of split_by_margin and split_by_line_spacing.
#
# A line whose floored +key+ value equals the delimiter chosen by
# select_delimiter starts a new reference. Lines before the first such line
# are discarded. Each resulting group is joined with single spaces.
#
# Returns an array of {:content => text} hashes.
def self.split_lines_on lines, key
  delimiter = select_delimiter lines, key
  starts_reference = lambda { |line| line[key].floor == delimiter }

  body = lines.drop_while { |line| !starts_reference.call(line) }

  partition_by(body, &starts_reference).map do |part|
    {:content => part.map { |line| line[:content] }.join(" ")}
  end
end

# Splits +lines+ into references using the :x_offset of each line.
def self.split_by_margin lines
  split_lines_on lines, :x_offset
end

# Splits +lines+ into references using the :spacing of each line.
def self.split_by_line_spacing lines
  split_lines_on lines, :spacing
end
|
56
|
+
|
57
|
+
# Splits the text +s+ into references at sequential numbers ("1.", "[2]").
#
# Pass 1 walks the numbers below pdf.settings[:max_reference_order] that
# continue a +1 sequence and tallies the character right before and right
# after each. The most common pair is taken as the delimiter style.
#
# Pass 2 cuts +s+ at each number in that style that continues the sequence.
# Numbers that do not continue it stay inside the current reference. Text
# before the first number is discarded.
#
# Returns an array of {:content => text, :order => number} hashes. Returns []
# when no qualifying number is found or the delimiter style is not one of the
# accepted ones.
def self.split_by_delimiter pdf, s
  max_order = pdf.settings[:max_reference_order]

  # Determine the characters that are most likely part of numeric
  # delimiters.
  before = Hash.new(0)
  after = Hash.new(0)
  last_n = -1

  # The leading character is optional; when it is absent the capture is ""
  # so that a digit of the number is never mistaken for a delimiter.
  s.scan(/([^\d]?)(\d+)([^\d])/) do |lead, digits, trail|
    n = digits.to_i
    next unless n < max_order
    next unless last_n == -1 || n == last_n.next

    before[lead] += 1
    after[trail] += 1
    last_n = n
  end

  # No qualifying number at all: nothing to split on. Carrying on would
  # build the invalid pattern "?\d+".
  return [] if before.empty?

  lead_char = before.max_by { |_, count| count }[0]
  b_s = lead_char.empty? ? "" : "\\" + lead_char
  a_s = "\\" + after.max_by { |_, count| count }[0]

  # TODO Turn into settings. Needs typed settings
  unless ["", "\\[", "\\ "].include?(b_s) && ["", "\\.", "\\]", "\\ "].include?(a_s)
    return []
  end

  # Built once. The "?" is only valid when there is a character to apply to.
  delimiter = Regexp.new(b_s.empty? ? "\\d+#{a_s}" : "#{b_s}?\\d+#{a_s}")

  # Split by the delimiters and record separate refs.
  last_n = -1
  current_ref = ""
  refs = []
  head, match, tail = s.partition delimiter

  until match.empty?
    n = match[/\d+/].to_i

    if n < max_order && last_n == -1
      # First number of the sequence; whatever precedes it is dropped.
      last_n = n
    elsif n == last_n.next
      # Next number in sequence: the text collected so far is one reference.
      refs << {:content => (current_ref + head).strip, :order => last_n}
      current_ref = ""
      last_n = n
    else
      # Some other number (a year, a page range): part of the current ref.
      current_ref += head + match
    end

    head, match, tail = tail.partition delimiter
  end

  # Whatever follows the last delimiter is the final reference.
  refs << {:content => (current_ref + head).strip, :order => last_n}
  refs
end
|
131
|
+
|
132
|
+
# True when +lines+ contain more than one distinct floored :x_offset.
def self.multi_margin? lines
  distinct_offsets = lines.map { |line| line[:x_offset].floor }.uniq
  distinct_offsets.size > 1
end
|
135
|
+
|
136
|
+
# True when +lines+ contain more than one distinct floored :spacing.
def self.multi_spacing? lines
  distinct_spacings = lines.map { |line| line[:spacing].floor }.uniq
  distinct_spacings.size > 1
end
|
139
|
+
|
140
|
+
# True when +content+ contains a run of numbers that increase by one at
# least pdf.settings[:min_sequence_count] times.
#
# The first number below pdf.settings[:max_reference_order] starts the run.
# After that only a number exactly one above the previous one extends it;
# any other number is skipped without resetting the run.
def self.numeric_sequence? pdf, content
  previous = -1
  run_length = 0

  content.scan(/\d+/).each do |digits|
    number = digits.to_i
    # Avoid misinterpreting years as sequence
    next unless number < pdf.settings[:max_reference_order]

    if previous == -1
      previous = number
    elsif number == previous.next
      previous = number
      run_length += 1
    end
  end

  run_length >= pdf.settings[:min_sequence_count]
end
|
157
|
+
|
158
|
+
# Declares the :references spatial type on +pdf+, built from :sections.
#
# Every section whose :reference_score reaches pdf.settings[:min_score] is
# split into references by the first strategy that applies: numbered
# delimiters, then differing margins, then differing line spacing. A section
# that matches none contributes nothing. The collected references are handed
# back from the `after` hook.
def self.include_in pdf
  pdf.spatials :references, :depends_on => [:sections] do |parser|

    refs = []

    parser.objects :sections do |section|
      # TODO Take top x%, fix Infinity coming back from score.
      next unless section[:reference_score] >= pdf.settings[:min_score]

      found =
        if numeric_sequence? pdf, Spatial.get_text_content(section)
          split_by_delimiter pdf, Spatial.get_text_content(section)
        elsif multi_margin? section[:lines]
          split_by_margin section[:lines]
        elsif multi_spacing? section[:lines]
          split_by_line_spacing section[:lines]
        else
          []
        end

      refs += found
    end

    parser.after { refs }

  end
end
|
182
|
+
|
183
|
+
end
|
184
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'net/http'
|
5
|
+
|
6
|
+
module PdfExtract::Resolve
|
7
|
+
|
8
|
+
# Resolves a citation string to a DOI through the CrossRef Labs search page.
class Sigg

  # Lowest "cr_score" at which the first search result is accepted.
  MIN_SCORE = 90

  # Searches for +ref+ and returns {:doi => "..."} when the first result
  # scores at least MIN_SCORE, otherwise {}.
  #
  # Resolution is best effort: any StandardError (network failure, unexpected
  # markup) yields {} rather than an exception.
  def self.find ref
    url = "http://api.labs.crossref.org/search?q=#{CGI.escape(ref)}"
    resolved = {}

    begin
      # URI#open (open-uri) is used instead of Kernel#open, which no longer
      # follows URLs on Ruby 3.0+. The block form closes the handle.
      doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

      result = doc.at_css "div.result"
      unless result.nil?
        score = result.at_css "span.cr_score"
        doi = result.at_css "span.doi"

        if !score.nil? && !doi.nil? && score.content.to_i >= MIN_SCORE
          resolved[:doi] = doi.content.sub "http://dx.doi.org/", ""
        end
      end
    rescue StandardError
      # Deliberately ignored: an unresolved reference is not an error.
    end

    resolved
  end

end
|
30
|
+
|
31
|
+
# Parses a citation string into fields using the FreeCite service.
class FreeCite

  # Elements copied from the service's XML response into the result.
  FIELDS = [:title, :journal, :pages, :year]

  # Posts +ref+ to FreeCite and returns a hash with the :title, :journal,
  # :pages and :year found in the response. A field that is absent from the
  # response is left out of the hash instead of raising NoMethodError.
  #
  # Network errors are not rescued here.
  def self.find ref
    Net::HTTP.start "freecite.library.brown.edu" do |http|
      # The citation is free text, so it must be form-encoded; otherwise any
      # "&", "=" or "+" in it corrupts the request body.
      r = http.post "/citations/create", "citation=#{CGI.escape(ref)}",
                    "Accept" => "text/xml"
      doc = Nokogiri::XML r.body

      parsed = {}
      FIELDS.each do |field|
        node = doc.at_xpath "//#{field}"
        parsed[field] = node.content unless node.nil?
      end
      parsed
    end
  end

end
|
49
|
+
|
50
|
+
# Resolves a citation string to a DOI through CrossRef's Simple Text Query
# web form.
class SimpleTextQuery

  # Value of the Set-Cookie header from the first visit; reused afterwards.
  @@cookie = nil

  # Submits +ref+ to the form and returns {:doi => "..."} taken from the
  # first "td.resultB > a" link of the response, or {} when there is none.
  def self.find ref
    create_session

    form = {
      "command" => "Submit",
      "freetext" => ref,
      "doiField" => ""
    }

    request = Net::HTTP::Post.new "/SimpleTextQuery"
    request.add_field "Cookie", @@cookie
    request.add_field "Referer", "http://www.crossref.org/SimpleTextQuery"
    request.set_form_data form

    response = Net::HTTP.start("www.crossref.org") { |http| http.request request }

    link = Nokogiri::HTML(response.body).at_css "td.resultB > a"
    return {} if link.nil?

    {:doi => link.content.sub("doi:", "")}
  end

  # Fetches the form page once and remembers its Set-Cookie header. Does
  # nothing when a cookie is already stored.
  def self.create_session
    return unless @@cookie.nil?

    Net::HTTP.start "www.crossref.org" do |http|
      @@cookie = http.get("/SimpleTextQuery")["Set-Cookie"]
    end
  end

end
|
92
|
+
|
93
|
+
# Resolver classes consulted by find, in order. Each must respond to
# `find(text)` and return a hash.
@@resolvers = [Sigg]

# Replaces the whole list of resolvers.
def self.resolvers= resolver
  @@resolvers = resolver
end

# Appends +resolver+ unless it is already registered.
def self.add_resolver resolver
  # Array has no `contains?`; the membership test is `include?`.
  @@resolvers << resolver unless @@resolvers.include? resolver
end

# Returns a copy of the reference hash +ref+ with the results of every
# resolver merged in. Each resolver is given ref[:content]; on conflicting
# keys a later resolver overwrites an earlier one. +ref+ is not modified.
def self.find ref
  ref = ref.dup
  @@resolvers.each do |resolver|
    ref.merge! resolver.find(ref[:content])
  end
  ref
end
|
112
|
+
|
113
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require_relative 'resolve'
|
2
|
+
|
3
|
+
module PdfExtract
|
4
|
+
# Parser that enriches extracted references with resolver results.
module ResolvedReferences

  # Declares the :resolved_references spatial type on +pdf+, built from
  # :references. Each reference is merged with what Resolve.find returns for
  # it; the merged hashes are handed back from the `after` hook.
  def self.include_in pdf
    pdf.spatials :resolved_references, :depends_on => [:references] do |parser|

      resolved_refs = []

      parser.objects :references do |ref|
        resolved_refs << ref.merge(Resolve.find(ref))
      end

      parser.after { resolved_refs }

    end
  end

  # Searches the CrossRef Labs search page for the citation text +ref+.
  #
  # Returns the DOI of the first result when its "cr_score" is at least 90.
  # Returns "" when the score is lower or when the page has no result, score
  # or DOI element (previously a NoMethodError on nil). Network errors are
  # not rescued.
  def self.reverse_resolve ref
    url = "http://api.labs.crossref.org/search?q=#{CGI.escape(ref)}"

    # URI#open (open-uri) is used instead of Kernel#open, which no longer
    # follows URLs on Ruby 3.0+. The block form closes the handle.
    doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

    result = doc.at_css "div.result"
    return "" if result.nil?

    score = result.at_css "span.cr_score"
    doi = result.at_css "span.doi"
    return "" if score.nil? || doi.nil?
    return "" unless score.content.to_i >= 90

    doi.content.sub("http://dx.doi.org/", "")
  end

end
|
37
|
+
end
|