pdf-extract 0.0.10 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/8630-31489-1-PB.mask.pdf +0 -0
- data/bin/pdf-extract +1 -2
- data/bin/test2.mask.pdf +0 -0
- data/bin/test3.mask.pdf +0 -0
- data/bin/test4.mask.pdf +0 -0
- data/bin/test5.mask.pdf +0 -0
- data/bin/test6.mask.pdf +0 -0
- data/bin/tmp.txt +368 -0
- data/lib/analysis/columns.rb +9 -5
- data/lib/analysis/sections.rb +50 -32
- data/lib/font_metrics.rb +11 -3
- data/lib/language.rb +9 -9
- data/lib/model/chunks.rb +8 -4
- data/lib/model/regions.rb +7 -7
- data/lib/multi_range.rb +13 -3
- data/lib/pdf-extract.rb +0 -2
- data/lib/references/references.rb +16 -15
- data/lib/references/resolve.rb +15 -15
- data/lib/references/score.rb +1 -1
- data/lib/spatial.rb +13 -13
- metadata +77 -134
- data/lib/view/png_view.rb +0 -30
data/lib/font_metrics.rb
CHANGED
@@ -29,7 +29,7 @@ module PdfExtract
|
|
29
29
|
@ascent = 0
|
30
30
|
@descent = 0
|
31
31
|
@bbox = [0, 0, 0, 0]
|
32
|
-
|
32
|
+
|
33
33
|
base_font = font.basefont.to_s
|
34
34
|
if @@base_fonts.key? base_font
|
35
35
|
@ascent = @@base_fonts[base_font][:Ascent]
|
@@ -37,12 +37,20 @@ module PdfExtract
|
|
37
37
|
@bbox = @@base_fonts[base_font][:FontBBox]
|
38
38
|
@glyph_width_lookup = proc { |c|
|
39
39
|
@@base_fonts[base_font][:Widths].fetch(c.codepoints.first, 0)
|
40
|
-
}
|
40
|
+
}
|
41
41
|
else
|
42
42
|
@ascent = font.ascent
|
43
43
|
@descent = font.descent
|
44
44
|
@bbox = font.bbox
|
45
|
-
@glyph_width_lookup = proc
|
45
|
+
@glyph_width_lookup = proc do |c|
|
46
|
+
begin
|
47
|
+
font.glyph_width c.codepoints.first
|
48
|
+
rescue TypeError => e
|
49
|
+
# It seems some fonts don't have a first char attribute in their
|
50
|
+
# descriptor and this causes problems for pdf-reader.
|
51
|
+
0
|
52
|
+
end
|
53
|
+
end
|
46
54
|
end
|
47
55
|
|
48
56
|
if not @bbox.nil?
|
data/lib/language.rb
CHANGED
@@ -17,7 +17,7 @@ module PdfExtract::Language
|
|
17
17
|
when "\ufb05" then r << "ft"
|
18
18
|
when "\ufb06" then r << "st"
|
19
19
|
when "\u1d6b" then r << "ue"
|
20
|
-
|
20
|
+
|
21
21
|
# Normalise some punctuation.
|
22
22
|
when "\u2018" then r << "'"
|
23
23
|
when "\u2019" then r << "'"
|
@@ -33,19 +33,19 @@ module PdfExtract::Language
|
|
33
33
|
r << c
|
34
34
|
end
|
35
35
|
end
|
36
|
-
|
36
|
+
|
37
37
|
r.gsub /\s+/, " "
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
def self.letter_ratio s
|
41
41
|
s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
|
42
42
|
end
|
43
|
-
|
43
|
+
|
44
44
|
# TODO Ignore caps in middle of words
|
45
45
|
def self.cap_ratio s
|
46
46
|
sentence_end = true
|
47
47
|
cap_count = 0
|
48
|
-
|
48
|
+
|
49
49
|
s.each_char do |c|
|
50
50
|
if c =~ /\./
|
51
51
|
sentence_end = true
|
@@ -56,13 +56,13 @@ module PdfExtract::Language
|
|
56
56
|
sentence_end = false
|
57
57
|
end
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
cap_count / s.split.length.to_f
|
61
61
|
end
|
62
|
-
|
62
|
+
|
63
63
|
def self.year_ratio s
|
64
64
|
words = s.split
|
65
|
-
|
65
|
+
|
66
66
|
year_words = words.map do |word|
|
67
67
|
word =~ /[^\d]\d{4}[^\d]/
|
68
68
|
end
|
@@ -77,6 +77,6 @@ module PdfExtract::Language
|
|
77
77
|
def self.word_count s
|
78
78
|
s.split.count
|
79
79
|
end
|
80
|
-
|
80
|
+
|
81
81
|
end
|
82
82
|
|
data/lib/model/chunks.rb
CHANGED
@@ -24,14 +24,14 @@ module PdfExtract
|
|
24
24
|
}
|
25
25
|
|
26
26
|
def self.include_in pdf
|
27
|
-
|
27
|
+
|
28
28
|
pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
|
29
29
|
rows = {}
|
30
30
|
|
31
31
|
parser.before do
|
32
32
|
rows = {}
|
33
33
|
end
|
34
|
-
|
34
|
+
|
35
35
|
parser.objects :characters do |chars|
|
36
36
|
y = chars[:y]
|
37
37
|
rows[y] = [] if rows[y].nil?
|
@@ -48,7 +48,7 @@ module PdfExtract
|
|
48
48
|
char_slop = pdf.settings[:char_slop]
|
49
49
|
word_slop = pdf.settings[:word_slop]
|
50
50
|
overlap_slop = pdf.settings[:overlap_slop]
|
51
|
-
|
51
|
+
|
52
52
|
text_chunks = []
|
53
53
|
|
54
54
|
rows.each_pair do |y, row|
|
@@ -105,7 +105,11 @@ module PdfExtract
|
|
105
105
|
end
|
106
106
|
|
107
107
|
merged_text_chunks << text_chunks.first
|
108
|
-
|
108
|
+
|
109
|
+
# Remove empty lines - they mess up region detection by
|
110
|
+
# making them join together.
|
111
|
+
merged_text_chunks.reject { |chunk| chunk[:content].strip == "" }
|
112
|
+
end
|
109
113
|
end
|
110
114
|
end
|
111
115
|
|
data/lib/model/regions.rb
CHANGED
@@ -41,7 +41,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig
|
|
41
41
|
height_taken = from_top + line[:height]
|
42
42
|
end
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
def self.include_in pdf
|
46
46
|
pdf.spatials :regions, :paged => true, :depends_on => [:chunks] do |parser|
|
47
47
|
chunks = []
|
@@ -51,7 +51,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig
|
|
51
51
|
chunks = []
|
52
52
|
regions = []
|
53
53
|
end
|
54
|
-
|
54
|
+
|
55
55
|
parser.objects :chunks do |chunk|
|
56
56
|
y = chunk[:y].floor
|
57
57
|
|
@@ -72,16 +72,16 @@ to be part of the same region. :line_slop is multiplied by the average line heig
|
|
72
72
|
chunk[:lines] = [Spatial.as_line(chunk)]
|
73
73
|
chunk.delete :content
|
74
74
|
end
|
75
|
-
|
75
|
+
|
76
76
|
compare_index = 1
|
77
77
|
while chunks.count > compare_index
|
78
78
|
b = chunks.first
|
79
79
|
t = chunks[compare_index]
|
80
|
-
|
80
|
+
|
81
81
|
line_height = b[:line_height]
|
82
82
|
line_slop = [line_height, t[:height]].min * pdf.settings[:line_slop]
|
83
83
|
incident_y = (b[:y] + b[:height] + line_slop) >= t[:y]
|
84
|
-
|
84
|
+
|
85
85
|
if incident_y && incident(t, b)
|
86
86
|
chunks[0] = Spatial.merge t, b, :lines => true
|
87
87
|
chunks.delete_at compare_index
|
@@ -96,7 +96,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig
|
|
96
96
|
compare_index = 1
|
97
97
|
end
|
98
98
|
end
|
99
|
-
|
99
|
+
|
100
100
|
regions << chunks.first unless chunks.first.nil?
|
101
101
|
|
102
102
|
regions.each do |region|
|
@@ -112,6 +112,6 @@ to be part of the same region. :line_slop is multiplied by the average line heig
|
|
112
112
|
end
|
113
113
|
end
|
114
114
|
end
|
115
|
-
|
115
|
+
|
116
116
|
end
|
117
117
|
end
|
data/lib/multi_range.rb
CHANGED
@@ -15,7 +15,7 @@ module PdfExtract
|
|
15
15
|
r.include?(range.min) || r.include?(range.max) ||
|
16
16
|
range.include?(r.min) || range.include?(r.max)
|
17
17
|
end
|
18
|
-
|
18
|
+
|
19
19
|
incident << range
|
20
20
|
|
21
21
|
non_incident = @ranges - incident
|
@@ -46,11 +46,21 @@ module PdfExtract
|
|
46
46
|
end
|
47
47
|
|
48
48
|
def max
|
49
|
-
@
|
49
|
+
@ranges.sort_by { |r| -r.max }.first.max
|
50
50
|
end
|
51
51
|
|
52
52
|
def min
|
53
|
-
@
|
53
|
+
@ranges.sort_by { |r| r.min }.first.min
|
54
|
+
end
|
55
|
+
|
56
|
+
def widest
|
57
|
+
widest = @ranges.sort_by { |r| r.max - r.min }.last
|
58
|
+
widest.max - widest.min
|
59
|
+
end
|
60
|
+
|
61
|
+
def narrowest
|
62
|
+
narrowest = @ranges.sort_by { |r| r.max - r.min }.first
|
63
|
+
narrowest.max - narrowest.min
|
54
64
|
end
|
55
65
|
|
56
66
|
def avg
|
data/lib/pdf-extract.rb
CHANGED
@@ -9,7 +9,6 @@ require_relative 'analysis/columns'
|
|
9
9
|
require_relative 'analysis/sections'
|
10
10
|
require_relative 'references/references'
|
11
11
|
require_relative 'references/resolved_references'
|
12
|
-
require_relative 'view/png_view'
|
13
12
|
require_relative 'view/pdf_view'
|
14
13
|
require_relative 'view/xml_view'
|
15
14
|
|
@@ -68,7 +67,6 @@ module PdfExtract
|
|
68
67
|
add_parser ResolvedReferences
|
69
68
|
|
70
69
|
add_view :pdf, PdfView
|
71
|
-
add_view :png, PngView
|
72
70
|
add_view :xml, XmlView
|
73
71
|
end
|
74
72
|
|
@@ -6,7 +6,7 @@ module PdfExtract
|
|
6
6
|
module References
|
7
7
|
|
8
8
|
Settings.declare :reference_flex, {
|
9
|
-
:default => 0.
|
9
|
+
:default => 0.2,
|
10
10
|
:module => self.name,
|
11
11
|
:description => "Article sections are given a score as potential reference sections. Their score is based on article section features, such as the number of family names that appear, the ratio of uppercase letters to lowercase, and so on. Any article section that has a score that is more than 1 - :reference_flex percent of the best score will be parsed as a reference section."
|
12
12
|
}
|
@@ -16,7 +16,7 @@ module PdfExtract
|
|
16
16
|
:module => self.name,
|
17
17
|
:description => "There must be :min_sequence_count or more numbered references within a candidate reference section for them to be parsed as number-delimited references."
|
18
18
|
}
|
19
|
-
|
19
|
+
|
20
20
|
Settings.declare :max_reference_order, {
|
21
21
|
:default => 1000,
|
22
22
|
:module => self.name,
|
@@ -82,11 +82,11 @@ module PdfExtract
|
|
82
82
|
|
83
83
|
# Determine the charcaters that are most likely part of numeric
|
84
84
|
# delimiters.
|
85
|
-
|
85
|
+
|
86
86
|
after = {}
|
87
87
|
before = {}
|
88
88
|
last_n = -1
|
89
|
-
|
89
|
+
|
90
90
|
s.scan /[^\d]?\d+[^\d]/ do |m|
|
91
91
|
n = m[/\d+/].to_i
|
92
92
|
if n < pdf.settings[:max_reference_order]
|
@@ -115,14 +115,14 @@ module PdfExtract
|
|
115
115
|
if ["", "\\[", "\\ "].include?(b_s) && ["", "\\.", "\\]", "\\ "].include?(a_s)
|
116
116
|
|
117
117
|
# Split by the delimiters and record separate refs.
|
118
|
-
|
118
|
+
|
119
119
|
last_n = -1
|
120
120
|
current_ref = ""
|
121
121
|
refs = []
|
122
122
|
parts = s.partition(Regexp.new "#{b_s}?\\d+#{a_s}")
|
123
|
-
|
123
|
+
|
124
124
|
while not parts[1].length.zero?
|
125
|
-
n = parts[1][/\d+/].to_i
|
125
|
+
n = parts[1][/\d+/].to_i
|
126
126
|
if n < pdf.settings[:max_reference_order] && last_n == -1
|
127
127
|
last_n = n
|
128
128
|
elsif n == last_n.next
|
@@ -139,12 +139,12 @@ module PdfExtract
|
|
139
139
|
|
140
140
|
parts = parts[2].partition(Regexp.new "#{b_s}?\\d+#{a_s}")
|
141
141
|
end
|
142
|
-
|
142
|
+
|
143
143
|
refs << {
|
144
144
|
:content => (current_ref + parts[0]).strip,
|
145
145
|
:order => last_n
|
146
146
|
}
|
147
|
-
|
147
|
+
|
148
148
|
refs
|
149
149
|
|
150
150
|
else
|
@@ -177,7 +177,7 @@ module PdfExtract
|
|
177
177
|
|
178
178
|
seq_count >= pdf.settings[:min_sequence_count]
|
179
179
|
end
|
180
|
-
|
180
|
+
|
181
181
|
def self.include_in pdf
|
182
182
|
pdf.spatials :references, :depends_on => [:sections] do |parser|
|
183
183
|
|
@@ -190,7 +190,7 @@ module PdfExtract
|
|
190
190
|
parser.after do
|
191
191
|
max_score = sections.map {|s| s[:reference_score]}.max
|
192
192
|
min_permittable = max_score - (max_score * pdf.settings[:reference_flex])
|
193
|
-
|
193
|
+
|
194
194
|
refs = []
|
195
195
|
|
196
196
|
sections = sections.reject do |s|
|
@@ -199,13 +199,14 @@ module PdfExtract
|
|
199
199
|
# half of an article.
|
200
200
|
s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
|
201
201
|
end
|
202
|
-
|
202
|
+
|
203
203
|
sections.each do |section|
|
204
204
|
if section[:reference_score] >= min_permittable
|
205
205
|
# TODO Enable classification once we have a reasonable model.
|
206
206
|
#if Score.reference?(section)
|
207
|
-
|
208
|
-
|
207
|
+
content = Spatial.get_text_content(section)
|
208
|
+
if numeric_sequence? pdf, content
|
209
|
+
refs += split_by_delimiter pdf, content
|
209
210
|
elsif multi_margin? section[:lines]
|
210
211
|
refs += split_by_margin section[:lines]
|
211
212
|
elsif multi_spacing? section[:lines]
|
@@ -213,7 +214,7 @@ module PdfExtract
|
|
213
214
|
end
|
214
215
|
end
|
215
216
|
end
|
216
|
-
|
217
|
+
|
217
218
|
# TODO Ideally we wouldn't see the ref headers here.
|
218
219
|
# Unfortunately publication details can look a lot like references.
|
219
220
|
refs.reject do |ref|
|
data/lib/references/resolve.rb
CHANGED
@@ -12,12 +12,12 @@ module PdfExtract::Resolve
|
|
12
12
|
resolved = {}
|
13
13
|
begin
|
14
14
|
doc = Nokogiri::HTML(open url)
|
15
|
-
|
15
|
+
|
16
16
|
result = doc.at_css "div.result"
|
17
17
|
unless result.nil?
|
18
18
|
score = result.at_css("span.cr_score").content.to_s
|
19
19
|
if score.to_i >= 90
|
20
|
-
doi = result.at_css "span.doi"
|
20
|
+
doi = result.at_css "span.doi"
|
21
21
|
resolved[:doi] = doi.content.sub "http://dx.doi.org/", ""
|
22
22
|
end
|
23
23
|
end
|
@@ -25,17 +25,17 @@ module PdfExtract::Resolve
|
|
25
25
|
end
|
26
26
|
resolved
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
31
|
class FreeCite
|
32
|
-
|
32
|
+
|
33
33
|
def self.find ref
|
34
34
|
Net::HTTP.start "freecite.library.brown.edu" do |http|
|
35
35
|
r = http.post "/citations/create", "citation=#{ref}",
|
36
36
|
"Accept" => "text/xml"
|
37
37
|
doc = Nokogiri::XML r.body
|
38
|
-
|
38
|
+
|
39
39
|
{
|
40
40
|
:title => doc.at_xpath("//title").content,
|
41
41
|
:journal => doc.at_xpath("//journal").content,
|
@@ -44,13 +44,13 @@ module PdfExtract::Resolve
|
|
44
44
|
}
|
45
45
|
end
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
class SimpleTextQuery
|
51
51
|
|
52
52
|
@@cookie = nil
|
53
|
-
|
53
|
+
|
54
54
|
def self.find ref
|
55
55
|
create_session
|
56
56
|
|
@@ -68,10 +68,10 @@ module PdfExtract::Resolve
|
|
68
68
|
response = Net::HTTP.start "www.crossref.org" do |http|
|
69
69
|
http.request post
|
70
70
|
end
|
71
|
-
|
71
|
+
|
72
72
|
doc = Nokogiri::HTML response.body
|
73
73
|
doi = doc.at_css "td.resultB > a"
|
74
|
-
|
74
|
+
|
75
75
|
if doi.nil?
|
76
76
|
{}
|
77
77
|
else
|
@@ -87,11 +87,11 @@ module PdfExtract::Resolve
|
|
87
87
|
end
|
88
88
|
end
|
89
89
|
end
|
90
|
-
|
90
|
+
|
91
91
|
end
|
92
|
-
|
92
|
+
|
93
93
|
@@resolvers = [Sigg]
|
94
|
-
|
94
|
+
|
95
95
|
def self.resolvers= resolver
|
96
96
|
@@resolvers = resolver
|
97
97
|
end
|
@@ -109,5 +109,5 @@ module PdfExtract::Resolve
|
|
109
109
|
end
|
110
110
|
ref
|
111
111
|
end
|
112
|
-
|
112
|
+
|
113
113
|
end
|
data/lib/references/score.rb
CHANGED
data/lib/spatial.rb
CHANGED
@@ -24,13 +24,13 @@ module PdfExtract
|
|
24
24
|
|
25
25
|
def self.merge_lines a, b, so
|
26
26
|
so[:lines] = []
|
27
|
-
|
27
|
+
|
28
28
|
if a.key? :lines
|
29
29
|
so[:lines] += a[:lines]
|
30
30
|
else
|
31
31
|
so[:lines] << as_line(a)
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
34
|
if b.key? :lines
|
35
35
|
so[:lines] += b[:lines]
|
36
36
|
else
|
@@ -60,7 +60,7 @@ module PdfExtract
|
|
60
60
|
so[:content] = (a[:content] + options[:separator] + b[:content])
|
61
61
|
so[:content] = so[:content].gsub /\s+/, " "
|
62
62
|
end
|
63
|
-
|
63
|
+
|
64
64
|
if get_text_content(a).length > get_text_content(b).length
|
65
65
|
so[:font] = a[:font]
|
66
66
|
so[:line_height] = a[:line_height]
|
@@ -115,12 +115,12 @@ module PdfExtract
|
|
115
115
|
# correct write order, specified by write_mode.
|
116
116
|
def self.collapse objs, options={}
|
117
117
|
options = @@default_options.merge options
|
118
|
-
|
118
|
+
|
119
119
|
sorted = case write_mode
|
120
120
|
when :left_to_right
|
121
121
|
objs.sort_by { |obj| -(obj[:y].floor * 100) + (obj[:x] / 100.0) }
|
122
122
|
end
|
123
|
-
|
123
|
+
|
124
124
|
if sorted.count == 1
|
125
125
|
sorted.first.dup
|
126
126
|
else
|
@@ -132,18 +132,18 @@ module PdfExtract
|
|
132
132
|
end
|
133
133
|
end
|
134
134
|
|
135
|
-
def self.contains? a, b
|
136
|
-
a_x1 = a[:x]
|
137
|
-
a_x2 = a[:x] + a[:width]
|
138
|
-
a_y1 = a[:y]
|
139
|
-
a_y2 = a[:y] + a[:height]
|
135
|
+
def self.contains? a, b, padding=0
|
136
|
+
a_x1 = a[:x] - padding
|
137
|
+
a_x2 = a[:x] + a[:width] + (padding * 2)
|
138
|
+
a_y1 = a[:y] - padding
|
139
|
+
a_y2 = a[:y] + a[:height] + (padding * 2)
|
140
140
|
|
141
141
|
b_x1 = b[:x]
|
142
142
|
b_x2 = b[:x] + b[:width]
|
143
143
|
b_y1 = b[:y]
|
144
144
|
b_y2 = b[:y] + b[:height]
|
145
145
|
|
146
|
-
b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
|
146
|
+
b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
|
147
147
|
end
|
148
148
|
|
149
149
|
def self.overlap? from, by, a, b
|
@@ -158,7 +158,7 @@ module PdfExtract
|
|
158
158
|
diffs = items.map {|item| (item[f] - ideals[f][0]).abs}
|
159
159
|
diffs.map! {|d| d.nan? ? 1 : d}
|
160
160
|
max_diff = diffs.max
|
161
|
-
|
161
|
+
|
162
162
|
scores = diffs.map do |d|
|
163
163
|
if d == 0
|
164
164
|
ideals[f][1]
|
@@ -173,6 +173,6 @@ module PdfExtract
|
|
173
173
|
end
|
174
174
|
end
|
175
175
|
end
|
176
|
-
|
176
|
+
|
177
177
|
end
|
178
178
|
end
|