pdf-extract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/assign.rb +72 -0
- data/bin/config.json +4 -0
- data/bin/fac_v19n11_s5.mask.pdf +0 -0
- data/bin/margins.mask.pdf +0 -0
- data/bin/one-column.mask.pdf +24110 -39
- data/bin/pdf-extract +146 -0
- data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
- data/bin/some3.mask.pdf +0 -0
- data/bin/some5.mask.pdf +0 -0
- data/bin/some6.mask.pdf +0 -0
- data/bin/train.rb +48 -0
- data/bin/two-column.mask.pdf +0 -0
- data/data/familynames.db +0 -0
- data/data/stopwords.txt +1 -0
- data/lib/analysis/columns.rb +75 -0
- data/lib/analysis/margins.rb +84 -0
- data/lib/analysis/sections.rb +156 -0
- data/lib/analysis/titles.rb +53 -0
- data/lib/analysis/zones.rb +128 -0
- data/lib/font_metrics.rb +240 -0
- data/lib/kmeans.rb +114 -0
- data/lib/language.rb +58 -0
- data/lib/model/characters.rb +320 -0
- data/lib/model/chunks.rb +103 -0
- data/lib/model/regions.rb +112 -0
- data/lib/multi_range.rb +69 -0
- data/lib/names.rb +85 -0
- data/lib/pdf-extract.rb +77 -0
- data/lib/pdf.rb +255 -0
- data/lib/references/references.rb +184 -0
- data/lib/references/resolve.rb +113 -0
- data/lib/references/resolved_references.rb +37 -0
- data/lib/spatial.rb +188 -0
- data/lib/view/abstract_view.rb +32 -0
- data/lib/view/pdf_view.rb +43 -0
- data/lib/view/png_view.rb +30 -0
- data/lib/view/xml_view.rb +113 -0
- metadata +208 -0
@@ -0,0 +1,320 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
|
3
|
+
require_relative '../font_metrics'
|
4
|
+
|
5
|
+
module PdfExtract
|
6
|
+
module Characters
|
7
|
+
|
8
|
+
# TODO Implement writing mode and :FontMatrix.
|
9
|
+
|
10
|
+
# Descent of the font selected in the innermost graphics state.
#
# c     - the character being measured (accepted for symmetry with the
#         other glyph helpers; it is not consulted here).
# state - the graphics state stack; only state.last[:font_metrics] is read.
#
# Returns the metrics' descent divided by 1000.0, or 0 when the state has
# no font metrics or the metrics report no descent.
def self.glyph_descent(c, state)
  metrics = state.last[:font_metrics]
  return 0 if metrics.nil?

  descent = metrics.descent
  descent.nil? ? 0 : descent / 1000.0
end
|
17
|
+
|
18
|
+
# Ascent of the font selected in the innermost graphics state.
#
# c     - the character being measured (not consulted here).
# state - the graphics state stack; only state.last[:font_metrics] is read.
#
# Returns the metrics' ascent divided by 1000.0, or 0 when the state has
# no font metrics or the metrics report no ascent.
def self.glyph_ascent(c, state)
  metrics = state.last[:font_metrics]
  return 0 if metrics.nil?

  ascent = metrics.ascent
  ascent.nil? ? 0 : ascent / 1000.0
end
|
25
|
+
|
26
|
+
# Width of glyph +c+ under the current state.
#
# The width is taken to be the horizontal component (first element) of
# the displacement reported by glyph_displacement.
def self.glyph_width(c, state)
  horizontal, _vertical = glyph_displacement(c, state)
  horizontal
end
|
31
|
+
|
32
|
+
# Maximum glyph height for the current font: the distance between the
# font's ascent and its descent, as reported by glyph_ascent and
# glyph_descent.
def self.glyph_height(c, state)
  top = glyph_ascent(c, state)
  bottom = glyph_descent(c, state)
  top - bottom
end
|
37
|
+
|
38
|
+
# Displacement of glyph +c+ as a [horizontal, vertical] pair.
#
# The horizontal component is the glyph width reported by the current
# font metrics, which is given in 1000ths of text units and is scaled
# down here; the vertical component is always 0.
#
# When the innermost state has no font metrics, [0, 0] is returned.
# XXX Why are some font resources not reported via resource_font?
# Bug in pdf-reader? Possibly because of :Font entry in graphics
# state set.
def self.glyph_displacement(c, state)
  metrics = state.last[:font_metrics]
  return [0, 0] if metrics.nil?

  [metrics.glyph_width(c) / 1000.0, 0]
end
|
51
|
+
|
52
|
+
# Builds one character object per character of +text+ and advances the
# text matrix in +render_state+ past each glyph.
#
# text         - the string to lay out, one object per character.
# tj           - positioning adjustment in 1000ths of a text unit; it is
#                applied once, before the first character only.
# state        - graphics state stack; the innermost (last) entry supplies
#                :h_scale, :font_size, :rise, :char_spacing, :word_spacing,
#                :font and :ctm.
# render_state - hash whose :tm (text matrix) is replaced in place as the
#                glyphs advance.
# page         - page object; its :MediaBox gives the page dimensions.
# page_number  - page number recorded on every object.
#
# Returns an Array of Hashes with :x, :y, :width, :height, :line_height,
# :content, :page, :font, :page_width and :page_height.
#
# The advance after each glyph follows the PDF text-space formula
#   tx = ((w0 - Tj/1000) * Tfs + Tc + Tw) * Th
# so character spacing applies to every glyph and word spacing is added
# on top of it for the space character.
def self.make_text_runs(text, tj, state, render_state, page, page_number)
  # TODO Ignore chars outside the page :MediaBox.
  # TODO Mul UserUnit if specified by page.
  # TODO Include writing mode, so that runs can be joined either
  #      vertically or horizontally in the join stage.

  s = state.last
  h_scale_mod = s[:h_scale] / 100.0

  # TODO Should advance vertically instead, depending on writing mode.
  advance = lambda do |tx|
    render_state[:tm] = Matrix[[1, 0, 0], [0, 1, 0], [tx, 0, 1]] * render_state[:tm]
  end

  # tj applies only to the first char of the Tj op.
  advance.call(((0 - (tj / 1000.0)) * s[:font_size] + 0) * h_scale_mod)

  objs = []

  text.each_char do |c|
    # Text rendering matrix: text space -> device space.
    trm = Matrix[[s[:font_size] * h_scale_mod, 0, 0],
                 [0, s[:font_size], 0],
                 [0, s[:rise], 1]]
    trm = trm * render_state[:tm] * s[:ctm]

    descent = glyph_descent(c, state)
    width = glyph_width(c, state)
    height = descent + glyph_height(c, state)

    # Bottom-left and top-right corners of the glyph box in device space.
    bottom_left = (Matrix.rows([[0, descent, 1]]) * trm).row(0)
    top_right = (Matrix.rows([[width, height, 1]]) * trm).row(0)

    px = bottom_left[0]
    py = bottom_left[1]

    objs << {
      :x => px,
      :y => py,
      :width => top_right[0] - px,
      :height => top_right[1] - py,
      :line_height => top_right[1] - py,
      :content => s[:font].to_utf8(c),
      :page => page_number,
      :font => s[:font].basefont,
      :page_width => page[:MediaBox][2] - page[:MediaBox][0],
      :page_height => page[:MediaBox][3] - page[:MediaBox][1]
    }

    disp_x, = glyph_displacement(c, state)

    # Character spacing follows every glyph; word spacing is added to it
    # (not substituted for it) after a space.
    spacing = s[:char_spacing]
    spacing += s[:word_spacing] if c == ' '

    advance.call((disp_x * s[:font_size] + spacing) * h_scale_mod)
  end

  objs
end
|
116
|
+
|
117
|
+
# Registers the :characters spatial on +pdf+.
#
# A handler is installed on the parser for each content-stream callback
# used below. The handlers maintain:
#
# * state        - a stack of graphics/text state hashes (scaling,
#                  spacing, leading, rise, font, font size, CTM); a fresh
#                  state is pushed at the start of each page.
# * render_state - the text matrix (:tm) and text line matrix (:tlm),
#                  reset to identity at the start of each text object.
#
# State-changing handlers return nil. Text-showing handlers return the
# character objects built by make_text_runs.
def self.include_in(pdf)
  pdf.spatials :characters do |parser|
    state = []
    page = nil
    fonts = {}
    font_metrics = {}
    page_n = 0
    render_state = {
      :tm => Matrix.identity(3),
      :tlm => Matrix.identity(3)
    }

    # Offsets the text line matrix by (tx, ty) and restarts the text
    # matrix from the resulting position.
    start_line = lambda do |tx, ty|
      render_state[:tm] = Matrix[
        [1, 0, 0], [0, 1, 0], [tx, ty, 1]
      ] * render_state[:tlm]
      render_state[:tlm] = render_state[:tm]
    end

    parser.for :resource_font do |data|
      fonts[data[0]] = data[1]
      font_metrics[data[0]] = FontMetrics.new data[1]
      nil
    end

    parser.for :begin_page do |data|
      page = data[0]
      page_n = page_n.next
      state << {
        :h_scale => 100,
        :char_spacing => 0,
        :word_spacing => 0,
        :leading => 0,
        :rise => 0,
        :font => nil,
        :font_metrics => nil,
        :font_size => 0,
        :ctm => Matrix.identity(3)
      }
      nil
    end

    parser.for :end_page do |data|
      state.pop
      nil
    end

    parser.for :begin_text_object do |data|
      render_state = {
        :tm => Matrix.identity(3),
        :tlm => Matrix.identity(3)
      }
      nil
    end

    # Graphics state operators.

    parser.for :set_graphics_state_parameters do |data|
      # TODO Handle gs graphics state dictionary set operation for
      #      :Font dictionary entries. Probably why font is sometimes nil.
      nil
    end

    parser.for :save_graphics_state do |data|
      state.push state.last.dup
      nil
    end

    parser.for :restore_graphics_state do |data|
      state.pop
      nil
    end

    parser.for :concatenate_matrix do |data|
      a, b, c, d, e, f = data
      ctm = state.last[:ctm]
      state.last[:ctm] = Matrix[ [a, b, 0], [c, d, 0], [e, f, 1] ] * ctm
      nil
    end

    # State change operators.

    parser.for :set_text_leading do |data|
      state.last[:leading] = data.first
      nil
    end

    parser.for :set_text_rise do |data|
      state.last[:rise] = data.first
      nil
    end

    parser.for :set_character_spacing do |data|
      state.last[:char_spacing] = data.first
      nil
    end

    parser.for :set_word_spacing do |data|
      state.last[:word_spacing] = data.first
      nil
    end

    parser.for :set_horizontal_text_scaling do |data|
      state.last[:h_scale] = data.first
      nil
    end

    # Position change operators.

    parser.for :move_text_position do |data|
      start_line.call(data[0], data[1])
      nil
    end

    parser.for :move_text_position_and_set_leading do |data|
      state.last[:leading] = -data[1]
      start_line.call(data[0], data[1])
      nil
    end

    # Font change operators.

    parser.for :set_text_font_and_size do |data|
      state.last[:font] = fonts[data[0]]
      state.last[:font_metrics] = font_metrics[data[0]]
      state.last[:font_size] = data[1]
      nil
    end

    # Text matrix change operators.

    parser.for :set_text_matrix_and_text_line_matrix do |data|
      # --     --
      # | a b 0 |
      # | c d 0 |
      # | e f 1 |
      # --     --
      a, b, c, d, e, f = data
      render_state[:tm] = Matrix[ [a, b, 0], [c, d, 0], [e, f, 1] ]
      render_state[:tlm] = Matrix[ [a, b, 0], [c, d, 0], [e, f, 1] ]
      nil
    end

    # New line operators.

    parser.for :move_to_start_of_next_line do |data|
      start_line.call(0, -state.last[:leading])
      nil
    end

    # Show text operators.

    parser.for :set_spacing_next_line_show_text_raw do |data|
      state.last[:word_spacing] = data[0]
      state.last[:char_spacing] = data[1]
      start_line.call(0, -state.last[:leading])

      make_text_runs data[2], 0, state, render_state, page, page_n
    end

    parser.for :move_to_next_line_and_show_text_raw do |data|
      start_line.call(0, -state.last[:leading])

      make_text_runs data.first, 0, state, render_state, page, page_n
    end

    parser.for :show_text_raw do |data|
      make_text_runs data.first, 0, state, render_state, page, page_n
    end

    parser.for :show_text_with_positioning_raw do |data|
      runs = []
      tj = 0

      data.first.each do |item|
        # Match on the class itself rather than its name: integers are
        # "Integer" (not "Fixnum") on Ruby >= 2.4, so a name comparison
        # would silently drop integer adjustments.
        case item
        when Numeric
          tj = item
        when String
          runs << make_text_runs(item, tj, state, render_state, page, page_n)
          # An adjustment only affects the string that follows it.
          tj = 0
        end
      end

      runs.flatten
    end

  end
end
|
318
|
+
|
319
|
+
end
|
320
|
+
end
|
data/lib/model/chunks.rb
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
require_relative '../spatial'
|
2
|
+
|
3
|
+
module PdfExtract
|
4
|
+
module Chunks
|
5
|
+
|
6
|
+
# TODO Look for obj[:writing_mode] == :vertical or :horizontal
|
7
|
+
|
8
|
+
Settings.default :char_slop, 0.2
|
9
|
+
Settings.default :word_slop, 4.0
|
10
|
+
Settings.default :overlap_slop, 0.9
|
11
|
+
|
12
|
+
# Registers the paged :chunks spatial on +pdf+, built from :characters.
#
# Characters are collected into rows keyed by their exact :y value and
# kept sorted by :x. Once a page's characters are in, neighbours in each
# row are joined left to right:
#
# * a gap of at most char_width * :char_slop joins them directly;
# * a gap of at most char_width * :word_slop joins them with a space;
# * anything wider starts a new chunk.
#
# Finally chunks (sorted by :x) whose vertical extents overlap by at
# least :overlap_slop of the smaller height are merged as well.
#
# The after-handler returns the Array of chunks, which is empty for a
# page without characters.
def self.include_in(pdf)
  pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
    rows = {}

    parser.before do
      rows = {}
    end

    parser.objects :characters do |chars|
      row = (rows[chars[:y]] ||= [])

      # Insert before the first object at or to the right of this one,
      # so each row stays ordered by :x.
      idx = row.index { |obj| chars[:x] <= obj[:x] }
      row.insert(idx || row.length, chars.dup)
    end

    parser.after do
      char_slop = pdf.settings[:char_slop]
      word_slop = pdf.settings[:word_slop]
      overlap_slop = pdf.settings[:overlap_slop]

      text_chunks = []

      rows.each_pair do |y, row|
        char_width = row.first[:width]

        while row.length > 1
          left = row.first
          right = row[1]
          left_edge = left[:x] + left[:width]

          if (left_edge + (char_width * char_slop)) >= right[:x]
            # join as adjacent chars
            row[0] = Spatial.merge left, right
            row.delete_at 1
            # Only alphanumerics update the reference width.
            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
          elsif (left_edge + (char_width * word_slop)) >= right[:x]
            # join with a ' ' in the middle.
            row[0] = Spatial.merge left, right, :separator => ' '
            row.delete_at 1
            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
          else
            # leave 'em be.
            text_chunks << left
            row.delete_at 0
            char_width = row.first[:width]
          end
        end

        text_chunks << row.first
      end

      # Merge chunks that have slightly different :y positions but which
      # mostly overlap in the y dimension.

      text_chunks.sort_by! { |obj| obj[:x] }
      merged_text_chunks = []

      while text_chunks.count > 1
        left = text_chunks.first
        right = text_chunks[1]

        min_height = [left[:height], right[:height]].min
        overlap = min_height - (left[:y] - right[:y]).abs
        # Zero-height chunks cannot overlap; guarding here also avoids a
        # ZeroDivisionError when the heights are integers. to_f keeps the
        # ratio fractional for integer heights.
        overlap = min_height.zero? ? 0.0 : overlap / min_height.to_f

        if overlap >= overlap_slop
          # TODO follow char / word slop rules.
          # join
          text_chunks[0] = Spatial.merge left, right
          text_chunks.delete_at 1
        else
          # no join
          merged_text_chunks << text_chunks.first
          text_chunks.delete_at 0
        end
      end

      # A page without characters leaves nothing to append; never emit nil.
      merged_text_chunks << text_chunks.first unless text_chunks.empty?
      merged_text_chunks
    end
  end
end
|
100
|
+
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require_relative '../spatial'
|
2
|
+
|
3
|
+
module PdfExtract
|
4
|
+
module Regions
|
5
|
+
|
6
|
+
Settings.default :line_slop, 1.0
|
7
|
+
|
8
|
+
# TODO Handle :writing_mode once present in characters and text_chunks.
|
9
|
+
|
10
|
+
# Tells whether the horizontal extents of +l+ and +r+ touch or overlap.
#
# Each argument supplies :x and :width; its extent is the closed range
# x .. x + width. The extents are incident when either end of one lies
# inside the other.
def self.incident(l, r)
  l_span = (l[:x]..(l[:x] + l[:width]))
  r_span = (r[:x]..(r[:x] + r[:width]))

  [r_span.first, r_span.last].any? { |edge| l_span.include?(edge) } ||
    [l_span.first, l_span.last].any? { |edge| r_span.include?(edge) }
end
|
21
|
+
|
22
|
+
# Stores on every line of +region+ its position relative to the region:
# :x_offset and :y_offset are the line's :x / :y minus the region's.
#
# region[:lines] is initialised to an empty Array when absent.
# Returns the region's lines.
def self.append_line_offsets(region)
  lines = (region[:lines] ||= [])

  lines.each do |line|
    line.update(
      :x_offset => line[:x] - region[:x],
      :y_offset => line[:y] - region[:y]
    )
  end
end
|
29
|
+
|
30
|
+
# Stores on every line of +region+ a :spacing value: the gap between the
# top of the line and the space already consumed by the lines before it,
# measured down from the top of the region.
#
# Relies on each line already carrying :y_offset and :height.
# region[:lines] is initialised to an empty Array when absent.
# Returns the region's lines.
def self.append_line_spacing(region)
  consumed = 0

  (region[:lines] ||= []).each do |line|
    top_gap = region[:height] - (line[:y_offset] + line[:height])
    line[:spacing] = top_gap - consumed
    consumed = top_gap + line[:height]
  end
end
|
39
|
+
|
40
|
+
# Registers the paged :regions spatial on +pdf+, built from :chunks.
#
# Chunks are gathered in ascending :y order. After the page is read each
# chunk is turned into a single-line region, and the lowest region is
# repeatedly merged with any region above it that is both within
# :line_slop (scaled by the smaller of the line height and the other
# region's height) vertically and horizontally incident. Line offsets
# and spacing are then attached to every region's lines.
#
# The after-handler returns the regions ordered by descending :y.
def self.include_in(pdf)
  pdf.spatials :regions, :paged => true, :depends_on => [:chunks] do |parser|
    chunks = []
    regions = []

    parser.before do
      chunks = []
      regions = []
    end

    parser.objects :chunks do |chunk|
      # Keep the list ordered by :y, lowest first.
      position = chunks.index { |obj| chunk[:y] <= obj[:y] }
      chunks.insert(position || chunks.length, chunk.dup)
    end

    # TODO Rewrite to use Spatial::collapse so that text is in proper
    #      order.

    parser.after do
      # Convert chunks to have line content.
      chunks.each do |chunk|
        chunk[:lines] = [Spatial.as_line(chunk)]
        chunk.delete :content
      end

      compare_index = 1
      while chunks.count > compare_index
        bottom = chunks.first
        top = chunks[compare_index]

        line_slop = [bottom[:line_height], top[:height]].min * pdf.settings[:line_slop]
        near_enough = (bottom[:y] + bottom[:height] + line_slop) >= top[:y]

        if near_enough && incident(top, bottom)
          chunks[0] = Spatial.merge top, bottom, :lines => true
          chunks.delete_at compare_index
          compare_index = 1
        elsif compare_index < chunks.count - 1
          # Could be more chunks within range.
          compare_index += 1
        else
          # Finished region.
          regions << chunks.first
          chunks.delete_at 0
          compare_index = 1
        end
      end

      leftover = chunks.first
      regions << leftover unless leftover.nil?

      regions.each do |region|
        append_line_offsets region
        append_line_spacing region
        region[:lines].map! { |line| Spatial.drop_spatial line }
      end

      regions.sort_by { |obj| -obj[:y] }
    end
  end
end
|
110
|
+
|
111
|
+
end
|
112
|
+
end
|
data/lib/multi_range.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
|
2
|
+
module PdfExtract
  # A set of mutually disjoint ranges. Appending a range that touches or
  # overlaps existing ranges fuses them all into a single range.
  #
  # The summary accessors (#max, #min, #max_excluded, #min_excluded) are
  # memoised and reset on every #append. They return nil while no range
  # has been appended.
  class MultiRange

    attr_accessor :ranges

    def initialize
      @ranges = []
    end

    # Adds +range+, fusing it with every stored range it is incident with.
    # Ranges whose min or max is nil (e.g. empty ranges) are ignored.
    #
    # Returns nil.
    def append(range)
      return if range.max.nil? || range.min.nil?

      incident, disjoint = @ranges.partition do |r|
        r.include?(range.min) || r.include?(range.max) ||
          range.include?(r.min) || range.include?(r.max)
      end

      incident << range

      fused = (incident.map { |r| r.min }.min..incident.map { |r| r.max }.max)
      @ranges = disjoint << fused

      # Invalidate the memoised summaries.
      @max_excluded = nil
      @min_excluded = nil
      @max = nil
      @min = nil
    end

    # With a single range: that range's max. With several: the largest
    # range start. nil when there are no ranges.
    def max_excluded
      @max_excluded ||= (count == 1 ? @ranges.first.max : @ranges.map { |r| r.min }.max)
    end

    # With a single range: that range's min. With several: the smallest
    # range end. nil when there are no ranges.
    def min_excluded
      @min_excluded ||= (count == 1 ? @ranges.first.min : @ranges.map { |r| r.max }.min)
    end

    # Largest value covered by any range, or nil when there are none.
    def max
      @max ||= @ranges.map { |r| r.max }.max
    end

    # Smallest value covered by any range, or nil when there are none.
    def min
      @min ||= @ranges.map { |r| r.min }.min
    end

    # Mean length (max - min) of the stored ranges as a Float; NaN when
    # there are no ranges.
    def avg
      covered / @ranges.count.to_f
    end

    # Sum of the lengths (max - min) of the stored ranges.
    def covered
      @ranges.inject(0) { |total, r| total + (r.max - r.min) }
    end

    # Number of disjoint ranges currently stored.
    def count
      @ranges.count
    end

  end
end
|
data/lib/names.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require "net/http"
|
2
|
+
require "json"
|
3
|
+
require "sqlite3"
|
4
|
+
|
5
|
+
require_relative "pdf-extract"
|
6
|
+
|
7
|
+
module PdfExtract::Names
|
8
|
+
|
9
|
+
# Name detector backed by the bundled SQLite family-name database.
#
# The database and the stop-word list are loaded once, when the class is
# defined.
class NamesDatabase
  # Weight added per matching row; rows whose third column equals 1 get
  # the (lower) ambiguous weight.
  @@ambiguous_weighting = 0.1
  @@unambiguous_weighting = 1.0

  # Absolute path of +data_filename+ inside the data directory that sits
  # next to this file's directory.
  def self.path_to_data(data_filename)
    File.join(File.dirname(File.expand_path(__FILE__)), "../data/" + data_filename)
  end

  @@db = SQLite3::Database.new(path_to_data("familynames.db"), {:readonly => true})
  # File.read closes the file; entries are stripped so surrounding
  # whitespace or a trailing newline cannot prevent a match.
  @@stop_words = File.read(path_to_data("stopwords.txt")).split(",").map(&:strip)

  # Scores how name-like +content+ is.
  #
  # Every whitespace-separated word that is longer than one character and
  # is not a stop word is looked up (capitalised, including the letter
  # after each hyphen) in the names table; each matching row adds its
  # weight to the score.
  #
  # Returns {:name_frequency => score / number of words}, or
  # {:name_frequency => 0} when nothing matched.
  def self.detect_names(content)
    words = content.split
    sum = 0.0

    words.each do |word|
      word = word.downcase

      # Parenthesised on purpose: `include? word && ...` would pass the
      # whole boolean expression to include? and never filter anything.
      next if word.length <= 1 || @@stop_words.include?(word)

      query_word = word.capitalize.gsub(/-(.)/) { |s|
        "-" + s[1].capitalize
      }

      @@db.execute("select * from names where name = ?", query_word) do |row|
        sum += (row[2] == 1 ? @@ambiguous_weighting : @@unambiguous_weighting)
      end
    end

    if sum == 0
      {:name_frequency => 0}
    else
      {:name_frequency => (sum / words.length.to_f)}
    end
  end
end
|
50
|
+
|
51
|
+
# Name detector that delegates to a remote HTTP service.
class NamesService
  # Posts +content+ to the service's /detect endpoint.
  #
  # Returns the parsed JSON body of a 200 response, with keys symbolised
  # so the result has the same shape ({:name_frequency => ...}) as the
  # fallback below and the other detectors.
  # NOTE(review): assumes the service answers with a JSON object holding
  # a name_frequency entry - confirm against the service.
  #
  # Detection is best effort: any non-200 response or StandardError
  # (network failure, malformed JSON, ...) yields {:name_frequency => 0.0}.
  def self.detect_names(content)
    data = {:name_frequency => 0.0}

    begin
      response = Net::HTTP.start "names.crrd.dyndns.org" do |http|
        http.post "/detect", content
      end

      if response.code == "200"
        data = JSON.parse response.body, :symbolize_names => true
      end
    rescue
      # Deliberately swallowed; the zero-frequency default is returned.
    end

    data
  end
end
|
67
|
+
|
68
|
+
# Detector that performs no detection at all.
class NoDetection
  class << self
    # Ignores +content+ and always reports a name frequency of 0.0.
    def detect_names(content)
      { :name_frequency => 0.0 }
    end
  end
end
|
73
|
+
|
74
|
+
# Detector class used by detect_names; the database-backed detector is
# the default.
@@detector = NamesDatabase

# Replaces the active detector. +detector_class+ must respond to
# detect_names(content).
def self.detector=(detector_class)
  @@detector = detector_class
end

# Runs the active detector on +content+ and returns its result.
def self.detect_names(content)
  @@detector.detect_names(content)
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|