tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
@@ -1,370 +1,72 @@
|
|
1
|
-
require 'csv'
|
2
|
-
|
3
1
|
module Tabula
|
4
|
-
class TableExtractor
|
5
|
-
attr_accessor :text_elements, :options
|
6
|
-
|
7
|
-
DEFAULT_OPTIONS = {
|
8
|
-
:horizontal_rulings => [],
|
9
|
-
:vertical_rulings => [],
|
10
|
-
:merge_words => true,
|
11
|
-
:split_multiline_cells => false
|
12
|
-
}
|
13
|
-
|
14
|
-
def initialize(text_elements, options = {})
|
15
|
-
self.text_elements = text_elements
|
16
|
-
self.options = DEFAULT_OPTIONS.merge(options)
|
17
|
-
|
18
|
-
if self.options[:merge_words]
|
19
|
-
if self.options[:vertical_rulings]
|
20
|
-
merge_words_in_a_vertical_rulings_aware_manner!(self.options[:vertical_rulings])
|
21
|
-
else
|
22
|
-
merge_words!
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
|
28
|
-
def get_rows
|
29
|
-
hg = self.get_line_boundaries
|
30
|
-
hg.sort_by(&:top).map { |r| {'top' => r.top, 'bottom' => r.bottom, 'text' => r.texts} }
|
31
|
-
end
|
32
|
-
|
33
|
-
# TODO finish writing this method
|
34
|
-
# it should be analogous to get_line_boundaries
|
35
|
-
# (ie, take into account vertical ruling lines if available)
|
36
|
-
def group_by_columns
|
37
|
-
columns = []
|
38
|
-
tes = self.text_elements.sort_by &:left
|
39
|
-
|
40
|
-
# we don't have vertical rulings
|
41
|
-
if self.options[:vertical_rulings].empty?
|
42
|
-
tes.each do |te|
|
43
|
-
if column = columns.detect { |c| te.horizontally_overlaps?(c) }
|
44
|
-
column << te
|
45
|
-
else
|
46
|
-
columns << Column.new(te.left, te.width, [te])
|
47
|
-
end
|
48
|
-
end
|
49
|
-
else
|
50
|
-
self.options[:vertical_rulings].sort_by! &:left
|
51
|
-
1.upto(self.options[:vertical_rulings].size - 1) do |i|
|
52
|
-
left_ruling_line = self.options[:vertical_rulings][i - 1]
|
53
|
-
right_ruling_line = self.options[:vertical_rulings][i]
|
54
|
-
columns << Column.new(left_ruling_line.left, right_ruling_line.left - left_ruling_line.left, []) if (right_ruling_line.left - left_ruling_line.left > 10)
|
55
|
-
end
|
56
|
-
tes.each do |te|
|
57
|
-
if column = columns.detect { |c| te.horizontally_overlaps?(c) }
|
58
|
-
column << te
|
59
|
-
else
|
60
|
-
#puts "couldn't find a place for #{te.inspect}"
|
61
|
-
#columns << Column.new(te.left, te.width, [te])
|
62
|
-
end
|
63
|
-
end
|
64
|
-
end
|
65
|
-
columns
|
66
|
-
end
|
67
|
-
|
68
|
-
def get_columns
|
69
|
-
TableExtractor.new(text_elements).group_by_columns.map do |c|
|
70
|
-
{'left' => c.left, 'right' => c.right, 'width' => c.width}
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def get_line_boundaries
|
75
|
-
boundaries = []
|
76
|
-
|
77
|
-
if self.options[:horizontal_rulings].empty?
|
78
|
-
# we don't have rulings
|
79
|
-
# iteratively grow boundaries to construct lines
|
80
|
-
self.text_elements.each do |te|
|
81
|
-
row = boundaries.detect { |l| l.vertically_overlaps?(te) }
|
82
|
-
ze = ZoneEntity.new(te.top, te.left, te.width, te.height)
|
83
|
-
if row.nil?
|
84
|
-
boundaries << ze
|
85
|
-
ze.texts << te.text
|
86
|
-
else
|
87
|
-
row.merge!(ze)
|
88
|
-
row.texts << te.text
|
89
|
-
end
|
90
|
-
end
|
91
|
-
else
|
92
|
-
self.options[:horizontal_rulings].sort_by!(&:top)
|
93
|
-
1.upto(self.options[:horizontal_rulings].size - 1) do |i|
|
94
|
-
above = self.options[:horizontal_rulings][i - 1]
|
95
|
-
below = self.options[:horizontal_rulings][i]
|
96
|
-
|
97
|
-
# construct zone between a horizontal ruling and the next
|
98
|
-
ze = ZoneEntity.new(above.top,
|
99
|
-
[above.left, below.left].min,
|
100
|
-
[above.width, below.width].max,
|
101
|
-
below.top - above.top)
|
102
|
-
|
103
|
-
# skip areas shorter than some threshold
|
104
|
-
# TODO: this should be the height of the shortest character, or something like that
|
105
|
-
next if ze.height < 2
|
106
|
-
|
107
|
-
boundaries << ze
|
108
|
-
end
|
109
|
-
end
|
110
|
-
boundaries
|
111
|
-
end
|
112
|
-
|
113
|
-
private
|
114
|
-
|
115
|
-
#this is where spaces come from!
|
116
|
-
def merge_words!
|
117
|
-
return self.text_elements if @merged # only merge once. awful hack.
|
118
|
-
@merged = true
|
119
|
-
current_word_index = i = 0
|
120
|
-
char1 = self.text_elements[i]
|
121
|
-
|
122
|
-
while i < self.text_elements.size-1 do
|
123
|
-
|
124
|
-
char2 = self.text_elements[i+1]
|
125
|
-
|
126
|
-
next if char2.nil? or char1.nil?
|
127
|
-
|
128
|
-
if self.text_elements[current_word_index].should_merge?(char2)
|
129
|
-
self.text_elements[current_word_index].merge!(char2)
|
130
|
-
char1 = char2
|
131
|
-
self.text_elements[i+1] = nil
|
132
|
-
else
|
133
|
-
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
134
|
-
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
135
|
-
self.text_elements[current_word_index].text += " "
|
136
|
-
#self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
137
|
-
end
|
138
|
-
current_word_index = i+1
|
139
|
-
end
|
140
|
-
i += 1
|
141
|
-
end
|
142
|
-
self.text_elements.compact!
|
143
|
-
return self.text_elements
|
144
|
-
end
|
145
|
-
|
146
|
-
#this is where spaces come from!
|
147
|
-
def merge_words_in_a_vertical_rulings_aware_manner!(vertical_rulings)
|
148
|
-
#don't merge words across a ruling.
|
149
|
-
|
150
|
-
return self.text_elements if @merged # only merge once. awful hack.
|
151
|
-
@merged = true
|
152
|
-
current_word_index = i = 0
|
153
|
-
char1 = self.text_elements[i]
|
154
|
-
vertical_ruling_locations = vertical_rulings.map &:left
|
155
|
-
|
156
|
-
while i < self.text_elements.size-1 do
|
157
|
-
|
158
|
-
char2 = self.text_elements[i+1]
|
159
|
-
|
160
|
-
next if char2.nil? or char1.nil?
|
161
|
-
|
162
|
-
if self.text_elements[current_word_index].should_merge?(char2)
|
163
|
-
unless vertical_ruling_locations.map{|loc| self.text_elements[current_word_index].left < loc && char2.left > loc}.include?(true)
|
164
|
-
self.text_elements[current_word_index].merge!(char2)
|
165
|
-
end
|
166
|
-
|
167
|
-
char1 = char2
|
168
|
-
self.text_elements[i+1] = nil
|
169
|
-
else
|
170
|
-
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
171
|
-
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
172
|
-
self.text_elements[current_word_index].text += " "
|
173
|
-
#self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
174
|
-
end
|
175
|
-
current_word_index = i+1
|
176
|
-
end
|
177
|
-
i += 1
|
178
|
-
end
|
179
|
-
self.text_elements.compact!
|
180
|
-
return self.text_elements
|
181
|
-
end
|
182
|
-
end
|
183
2
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
def Tabula.group_by_columns(text_elements, merge_words=false)
|
188
|
-
TableExtractor.new(text_elements, :merge_words => merge_words).group_by_columns
|
3
|
+
def Tabula.merge_words(text_elements, options={})
|
4
|
+
warn 'Tabula.merge_words is DEPRECATED. Use Tabula::TextElement.merge_words instead'
|
5
|
+
TextElement.merge_words(text_elements, options)
|
189
6
|
end
|
190
7
|
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
def Tabula.get_line_boundaries(text_elements)
|
195
|
-
TableExtractor.new(text_elements).get_line_boundaries
|
196
|
-
end
|
197
|
-
|
198
|
-
##
|
199
|
-
# Deprecated.
|
200
|
-
##
|
201
|
-
def Tabula.get_columns(text_elements, merge_words=true)
|
202
|
-
TableExtractor.new(text_elements, :merge_words => merge_words).get_columns
|
203
|
-
end
|
204
|
-
|
205
|
-
##
|
206
|
-
# Deprecated.
|
207
|
-
##
|
208
|
-
def Tabula.get_rows(text_elements, merge_words=true)
|
209
|
-
TableExtractor.new(text_elements, :merge_words => merge_words).get_rows
|
210
|
-
end
|
211
|
-
|
212
|
-
def Tabula.lines_to_csv(lines)
|
213
|
-
CSV.generate do |csv|
|
214
|
-
lines.each do |l|
|
215
|
-
csv << l.map { |c| c.text.strip }
|
216
|
-
end
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
221
|
-
|
222
|
-
def Tabula.group_by_lines(text_elements)
|
223
|
-
lines = []
|
224
|
-
text_elements.each do |te|
|
225
|
-
next if te.text =~ ONLY_SPACES_RE
|
226
|
-
l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
|
227
|
-
if l.nil?
|
228
|
-
l = Line.new
|
229
|
-
lines << l
|
230
|
-
end
|
231
|
-
l << te
|
232
|
-
end
|
233
|
-
lines
|
8
|
+
def Tabula.group_by_lines(text_chunks)
|
9
|
+
warn 'Tabula.group_by_lines is DEPRECATED. Use Tabula::TextChunk.group_by_lines instead.'
|
10
|
+
TextChunk.group_by_lines(text_chunks)
|
234
11
|
end
|
235
12
|
|
236
13
|
# Returns an array of Tabula::Line
|
237
|
-
def Tabula.make_table(
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
if text_elements.empty?
|
242
|
-
return []
|
243
|
-
end
|
244
|
-
|
245
|
-
extractor = TableExtractor.new(text_elements, options).text_elements
|
246
|
-
lines = group_by_lines(text_elements)
|
247
|
-
top = lines[0].text_elements.map(&:top).min
|
248
|
-
right = 0
|
249
|
-
columns = []
|
250
|
-
|
251
|
-
text_elements.sort_by(&:left).each do |te|
|
252
|
-
next if te.text =~ ONLY_SPACES_RE
|
253
|
-
if te.top >= top
|
254
|
-
left = te.left
|
255
|
-
if (left > right)
|
256
|
-
columns << right
|
257
|
-
right = te.right
|
258
|
-
elsif te.right > right
|
259
|
-
right = te.right
|
260
|
-
end
|
261
|
-
end
|
262
|
-
end
|
263
|
-
|
264
|
-
separators = columns[1..-1].sort.reverse
|
265
|
-
|
266
|
-
table = Table.new(lines.count, separators)
|
267
|
-
lines.each_with_index do |line, i|
|
268
|
-
line.text_elements.each do |te|
|
269
|
-
j = separators.find_index { |s| te.left > s } || separators.count
|
270
|
-
table.add_text_element(te, i, separators.count - j)
|
271
|
-
end
|
272
|
-
end
|
273
|
-
|
274
|
-
table.lines.map { |l|
|
275
|
-
l.text_elements.map! { |te|
|
276
|
-
te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
|
277
|
-
}
|
278
|
-
}.sort_by { |l| l.map { |te| te.top or 0 }.max }
|
279
|
-
|
14
|
+
def Tabula.make_table(page, area, options={})
|
15
|
+
warn 'Tabula.make_table is DEPRECATED. Use Tabula::Page#make_table instead.'
|
16
|
+
page.get_area(area).make_table(options)
|
280
17
|
end
|
281
18
|
|
19
|
+
# extract a table from file +pdf_path+, +pages+ and +area+
|
20
|
+
#
|
21
|
+
# +pages+ can be a single integer (1-based) or an array of integers
|
22
|
+
#
|
23
|
+
# ==== Options
|
24
|
+
# +:password+ - Password if encrypted PDF (default: empty)
|
25
|
+
# +:detect_ruling_lines+ - Try to detect vertical (default: true)
|
26
|
+
# +:vertical_rulings+ - List of positions for vertical rulings. Overrides +:detect_ruling_lines+. (default: [])
|
27
|
+
def Tabula.extract_table(pdf_path, page, area, options={})
|
28
|
+
options = {
|
29
|
+
:password => '',
|
30
|
+
:detect_ruling_lines => true,
|
31
|
+
:vertical_rulings => []
|
32
|
+
}.merge(options)
|
282
33
|
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
lines = []
|
288
|
-
line_boundaries = extractor.get_line_boundaries
|
289
|
-
|
290
|
-
# find all the text elements
|
291
|
-
# contained within each detected line (table row) boundary
|
292
|
-
line_boundaries.each do |lb|
|
293
|
-
line = Line.new
|
294
|
-
|
295
|
-
line_members = text_elements.find_all do |te|
|
296
|
-
te.vertically_overlaps?(lb)
|
297
|
-
end
|
298
|
-
|
299
|
-
text_elements -= line_members
|
300
|
-
|
301
|
-
line_members.sort_by(&:left).each do |te|
|
302
|
-
# skip text_elements that only contain spaces
|
303
|
-
next if te.text =~ ONLY_SPACES_RE
|
304
|
-
line << te
|
305
|
-
end
|
306
|
-
|
307
|
-
lines << line if line.text_elements.size > 0
|
34
|
+
if area.instance_of?(Array)
|
35
|
+
top, left, bottom, right = area
|
36
|
+
area = Tabula::ZoneEntity.new(top, left,
|
37
|
+
right - left, bottom - top)
|
308
38
|
end
|
309
39
|
|
310
|
-
|
311
|
-
|
312
|
-
vertical_rulings = options[:vertical_rulings]
|
313
|
-
columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
|
314
|
-
|
315
|
-
# insert an empty cell in a given column if there's no text elements within that column's boundaries
|
316
|
-
lines.each_with_index do |l, line_index|
|
317
|
-
next if l.text_elements.nil?
|
318
|
-
l.text_elements.compact! # TODO WHY do I have to do this?
|
319
|
-
l.text_elements.uniq! # TODO WHY do I have to do this?
|
320
|
-
l.text_elements.sort_by!(&:left)
|
321
|
-
|
322
|
-
columns.each_with_index do |c, i|
|
323
|
-
if (l.text_elements.select{|te| te && te.left >= c.left && te.right <= (c.left + c.width)}.empty?)
|
324
|
-
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
|
325
|
-
end
|
326
|
-
end
|
40
|
+
if page.is_a?(Integer)
|
41
|
+
page = [page]
|
327
42
|
end
|
328
43
|
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
44
|
+
page_obj = Extraction::ObjectExtractor.new(pdf_path,
|
45
|
+
page,
|
46
|
+
options[:password]) \
|
47
|
+
.extract.next
|
333
48
|
|
334
|
-
|
335
|
-
|
49
|
+
use_detected_lines = false
|
50
|
+
if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
|
51
|
+
detected_vertical_rulings = Ruling.crop_rulings_to_area(page_obj.vertical_ruling_lines,
|
52
|
+
area)
|
336
53
|
|
337
|
-
|
338
|
-
|
339
|
-
== columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
|
340
|
-
if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
|
341
|
-
l.text_elements[t1].merge!(l.text_elements[t2])
|
342
|
-
l.text_elements[t2] = nil
|
343
|
-
else
|
344
|
-
l.text_elements[t2].merge!(l.text_elements[t1])
|
345
|
-
l.text_elements[t1] = nil
|
346
|
-
end
|
347
|
-
end
|
348
|
-
end
|
54
|
+
# only use lines if at least 80% of them cover at least 90%
|
55
|
+
# of the height of area of interest
|
349
56
|
|
350
|
-
|
351
|
-
|
352
|
-
|
57
|
+
# TODO this heuristic SUCKS
|
58
|
+
# what if only a couple columns is delimited with vertical rulings?
|
59
|
+
# ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
|
60
|
+
# idea: detect columns without considering rulings, detect vertical rulings
|
61
|
+
# calculate ratio and try to come up with a threshold
|
62
|
+
use_detected_lines = detected_vertical_rulings.size > 2 \
|
63
|
+
&& (detected_vertical_rulings.count { |vl|
|
64
|
+
vl.height / area.height > 0.9
|
65
|
+
} / detected_vertical_rulings.size.to_f) >= 0.8
|
353
66
|
|
354
|
-
# remove duplicate lines
|
355
|
-
# TODO this shouldn't have happened here, check why we have to do
|
356
|
-
# this (maybe duplication is happening in the column merging phase?)
|
357
|
-
(0..lines.size - 2).each do |i|
|
358
|
-
next if lines[i].nil?
|
359
|
-
# if any of the elements on the next line is duplicated, kill
|
360
|
-
# the next line
|
361
|
-
if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
|
362
|
-
lines[i+1] = nil
|
363
|
-
end
|
364
67
|
end
|
365
68
|
|
366
|
-
|
367
|
-
|
368
|
-
end
|
69
|
+
page_obj.get_area(area).make_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
|
70
|
+
|
369
71
|
end
|
370
72
|
end
|
data/lib/tabula/table_guesser.rb
CHANGED
@@ -1,11 +1,6 @@
|
|
1
|
-
require 'java'
|
2
1
|
require 'json'
|
3
|
-
require_relative '../geom/point'
|
4
|
-
require_relative '../geom/segment'
|
5
|
-
require_relative '../geom/rectangle'
|
6
|
-
require_relative './pdf_render'
|
7
|
-
#CLASSPATH=:./target/javacpp.jar:./target/javacv.jar:./target/javacv-macosx-x86_64.jar:./target/PDFRenderer-0.9.1.jar
|
8
2
|
|
3
|
+
warn 'Tabula::TableGuesser is DEPRECATED and will be removed'
|
9
4
|
|
10
5
|
module Tabula
|
11
6
|
module TableGuesser
|
@@ -13,7 +8,7 @@ module Tabula
|
|
13
8
|
def TableGuesser.find_and_write_rects(filename, output_dir)
|
14
9
|
#writes to JSON the rectangles on each page in the specified PDF.
|
15
10
|
open(File.join(output_dir, "tables.json"), 'w') do |f|
|
16
|
-
f.write( JSON.dump(find_rects(filename).map{|a| a.map{|r| r.dims.map
|
11
|
+
f.write( JSON.dump(find_rects(filename).map{|a| a.map{|r| r.dims.map(&:to_i) }} ))
|
17
12
|
end
|
18
13
|
end
|
19
14
|
|
@@ -50,8 +45,8 @@ module Tabula
|
|
50
45
|
lines
|
51
46
|
end
|
52
47
|
|
53
|
-
def TableGuesser.find_lines_on_page(pdf,
|
54
|
-
Tabula::
|
48
|
+
def TableGuesser.find_lines_on_page(pdf, page_number_zero_indexed)
|
49
|
+
Tabula::Extraction::LineExtractor.lines_in_pdf_page(pdf, page_number_zero_indexed, {:render_pdf => false})
|
55
50
|
end
|
56
51
|
|
57
52
|
def TableGuesser.find_rects_on_page(pdf, page_index)
|
@@ -59,9 +54,11 @@ module Tabula
|
|
59
54
|
end
|
60
55
|
|
61
56
|
def TableGuesser.find_rects_from_lines(lines)
|
62
|
-
horizontal_lines = lines.select
|
63
|
-
vertical_lines = lines.select
|
64
|
-
find_tables(vertical_lines, horizontal_lines).inject([])
|
57
|
+
horizontal_lines = lines.select(&:horizontal?)
|
58
|
+
vertical_lines = lines.select(&:vertical?)
|
59
|
+
find_tables(vertical_lines, horizontal_lines).inject([]) do |memo, next_rect|
|
60
|
+
java.awt.geom.Rectangle2D::Float.unionize( memo, next_rect )
|
61
|
+
end.compact.reject{|r| r.area == 0 }.sort_by(&:area).reverse
|
65
62
|
end
|
66
63
|
|
67
64
|
|
@@ -75,14 +72,14 @@ module Tabula
|
|
75
72
|
end
|
76
73
|
|
77
74
|
def TableGuesser.find_tables(verticals, horizontals)
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
85
|
-
corner_proximity_threshold = 0.
|
75
|
+
#
|
76
|
+
# Find all the rectangles in the vertical and horizontal lines given.
|
77
|
+
#
|
78
|
+
# Rectangles are deduped with hashRectangle, which considers two rectangles identical if each point rounds to the same tens place as the other.
|
79
|
+
#
|
80
|
+
# TODO: generalize this.
|
81
|
+
#
|
82
|
+
corner_proximity_threshold = 0.005;
|
86
83
|
|
87
84
|
rectangles = []
|
88
85
|
#find rectangles with one horizontal line and two vertical lines that end within $threshold to the ends of the horizontal line.
|
@@ -137,9 +134,10 @@ module Tabula
|
|
137
134
|
#in case we eventually tolerate not-quite-vertical lines, this computers the distance in Y directly, rather than depending on the vertical lines' lengths.
|
138
135
|
height = [left_vertical_line.bottom - left_vertical_line.top, right_vertical_line.bottom - right_vertical_line.top].max
|
139
136
|
|
140
|
-
|
137
|
+
top = [left_vertical_line.top, right_vertical_line.top].min
|
141
138
|
width = horizontal_line.right - horizontal_line.left
|
142
|
-
|
139
|
+
left = horizontal_line.left
|
140
|
+
r = java.awt.geom.Rectangle2D::Float.new( left, top, width, height ) #x, y, w, h
|
143
141
|
#rectangles.put(hashRectangle(r), r); #TODO: I dont' think I need this now that I'm in Rubyland
|
144
142
|
rectangles << r
|
145
143
|
end
|
@@ -187,7 +185,7 @@ module Tabula
|
|
187
185
|
y = vertical_line.top
|
188
186
|
width = [top_horizontal_line.right - top_horizontal_line.left, bottom_horizontal_line.right - bottom_horizontal_line.right].max
|
189
187
|
height = vertical_line.bottom - vertical_line.top
|
190
|
-
r =
|
188
|
+
r = java.awt.geom.Rectangle2D::Float.new( x, y, width, height ) #x, y, w, h
|
191
189
|
#rectangles.put(hashRectangle(r), r);
|
192
190
|
rectangles << r
|
193
191
|
end
|