tabula-extractor 0.6.5-java → 0.6.6-java
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/tabula/entities.rb +4 -0
- data/lib/tabula/table_extractor.rb +79 -21
- data/lib/tabula/version.rb +1 -1
- data/vertical_rulings_bug.pdf +0 -0
- data/vertical_rulings_bug.rb +29 -0
- metadata +4 -2
data/lib/tabula/entities.rb
CHANGED
@@ -232,6 +232,10 @@ module Tabula
|
|
232
232
|
self.height = t.height
|
233
233
|
else
|
234
234
|
if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
|
235
|
+
#sometimes a space needs to be added here
|
236
|
+
unless in_same_column.vertically_overlaps?(t)
|
237
|
+
t.text = " " + t.text
|
238
|
+
end
|
235
239
|
in_same_column.merge!(t)
|
236
240
|
else
|
237
241
|
self.text_elements << t
|
data/lib/tabula/table_extractor.rb
CHANGED
@@ -14,7 +14,15 @@ module Tabula
|
|
14
14
|
def initialize(text_elements, options = {})
|
15
15
|
self.text_elements = text_elements
|
16
16
|
self.options = DEFAULT_OPTIONS.merge(options)
|
17
|
-
|
17
|
+
|
18
|
+
if self.options[:merge_words]
|
19
|
+
if self.options[:vertical_rulings]
|
20
|
+
merge_words_in_a_vertical_rulings_aware_manner!(self.options[:vertical_rulings])
|
21
|
+
else
|
22
|
+
merge_words!
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
18
26
|
end
|
19
27
|
|
20
28
|
def get_rows
|
@@ -49,7 +57,7 @@ module Tabula
|
|
49
57
|
if column = columns.detect { |c| te.horizontally_overlaps?(c) }
|
50
58
|
column << te
|
51
59
|
else
|
52
|
-
puts "couldn't find a place for #{te.inspect}"
|
60
|
+
#puts "couldn't find a place for #{te.inspect}"
|
53
61
|
#columns << Column.new(te.left, te.width, [te])
|
54
62
|
end
|
55
63
|
end
|
@@ -134,21 +142,69 @@ module Tabula
|
|
134
142
|
self.text_elements.compact!
|
135
143
|
return self.text_elements
|
136
144
|
end
|
145
|
+
|
146
|
+
#this is where spaces come from!
|
147
|
+
def merge_words_in_a_vertical_rulings_aware_manner!(vertical_rulings)
|
148
|
+
#don't merge words across a ruling.
|
149
|
+
|
150
|
+
return self.text_elements if @merged # only merge once. awful hack.
|
151
|
+
@merged = true
|
152
|
+
current_word_index = i = 0
|
153
|
+
char1 = self.text_elements[i]
|
154
|
+
vertical_ruling_locations = vertical_rulings.map &:left
|
155
|
+
|
156
|
+
while i < self.text_elements.size-1 do
|
157
|
+
|
158
|
+
char2 = self.text_elements[i+1]
|
159
|
+
|
160
|
+
next if char2.nil? or char1.nil?
|
161
|
+
|
162
|
+
if self.text_elements[current_word_index].should_merge?(char2)
|
163
|
+
unless vertical_ruling_locations.map{|loc| self.text_elements[current_word_index].left < loc && char2.left > loc}.include?(true)
|
164
|
+
self.text_elements[current_word_index].merge!(char2)
|
165
|
+
end
|
166
|
+
|
167
|
+
char1 = char2
|
168
|
+
self.text_elements[i+1] = nil
|
169
|
+
else
|
170
|
+
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
171
|
+
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
172
|
+
self.text_elements[current_word_index].text += " "
|
173
|
+
#self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
174
|
+
end
|
175
|
+
current_word_index = i+1
|
176
|
+
end
|
177
|
+
i += 1
|
178
|
+
end
|
179
|
+
self.text_elements.compact!
|
180
|
+
return self.text_elements
|
181
|
+
end
|
137
182
|
end
|
138
183
|
|
139
|
-
|
184
|
+
##
|
185
|
+
# Deprecated.
|
186
|
+
##
|
140
187
|
def Tabula.group_by_columns(text_elements, merge_words=false)
|
141
188
|
TableExtractor.new(text_elements, :merge_words => merge_words).group_by_columns
|
142
189
|
end
|
143
190
|
|
191
|
+
##
|
192
|
+
# Deprecated.
|
193
|
+
##
|
144
194
|
def Tabula.get_line_boundaries(text_elements)
|
145
195
|
TableExtractor.new(text_elements).get_line_boundaries
|
146
196
|
end
|
147
197
|
|
198
|
+
##
|
199
|
+
# Deprecated.
|
200
|
+
##
|
148
201
|
def Tabula.get_columns(text_elements, merge_words=true)
|
149
202
|
TableExtractor.new(text_elements, :merge_words => merge_words).get_columns
|
150
203
|
end
|
151
204
|
|
205
|
+
##
|
206
|
+
# Deprecated.
|
207
|
+
##
|
152
208
|
def Tabula.get_rows(text_elements, merge_words=true)
|
153
209
|
TableExtractor.new(text_elements, :merge_words => merge_words).get_rows
|
154
210
|
end
|
@@ -256,7 +312,7 @@ module Tabula
|
|
256
312
|
vertical_rulings = options[:vertical_rulings]
|
257
313
|
columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
|
258
314
|
|
259
|
-
# insert empty
|
315
|
+
# insert an empty cell in a given column if there's no text elements within that column's boundaries
|
260
316
|
lines.each_with_index do |l, line_index|
|
261
317
|
next if l.text_elements.nil?
|
262
318
|
l.text_elements.compact! # TODO WHY do I have to do this?
|
@@ -271,26 +327,28 @@ module Tabula
|
|
271
327
|
end
|
272
328
|
|
273
329
|
# merge elements that are in the same column
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
l.text_elements[t1].merge!(l.text_elements[t2])
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
330
|
+
unless options[:dontmerge]
|
331
|
+
lines.each_with_index do |l, line_index|
|
332
|
+
next if l.text_elements.nil?
|
333
|
+
|
334
|
+
(0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
|
335
|
+
next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
|
336
|
+
|
337
|
+
# if same column...
|
338
|
+
if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
|
339
|
+
== columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
|
340
|
+
if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
|
341
|
+
l.text_elements[t1].merge!(l.text_elements[t2])
|
342
|
+
l.text_elements[t2] = nil
|
343
|
+
else
|
344
|
+
l.text_elements[t2].merge!(l.text_elements[t1])
|
345
|
+
l.text_elements[t1] = nil
|
346
|
+
end
|
289
347
|
end
|
290
348
|
end
|
291
|
-
end
|
292
349
|
|
293
|
-
|
350
|
+
l.text_elements.compact!
|
351
|
+
end
|
294
352
|
end
|
295
353
|
|
296
354
|
# remove duplicate lines
|
data/lib/tabula/version.rb
CHANGED
data/vertical_rulings_bug.pdf
Binary file
data/vertical_rulings_bug.rb
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require './lib/tabula'
|
2
|
+
|
3
|
+
input_filename = "vertical_rulings_bug.pdf"
|
4
|
+
out = File.new("output.xls", 'w')
|
5
|
+
|
6
|
+
extractor = Tabula::Extraction::CharacterExtractor.new(input_filename, :all) #:all ) # 1..2643
|
7
|
+
extractor.extract.each_with_index do |pdf_page, page_index|
|
8
|
+
|
9
|
+
lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(input_filename, page_index))
|
10
|
+
page_areas = [[0, 0, 1000, 1700]]
|
11
|
+
|
12
|
+
scale_factor = pdf_page.width / 1700
|
13
|
+
puts scale_factor
|
14
|
+
|
15
|
+
vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Geometry::Segment.new_by_arrays([n * scale_factor, 0], [n * scale_factor, 1000])}
|
16
|
+
|
17
|
+
page_areas.each do |page_area|
|
18
|
+
text = pdf_page.get_text( page_area ) #all the characters within the given area.
|
19
|
+
|
20
|
+
Tabula::Writers.send(:TSV,
|
21
|
+
Tabula.make_table_with_vertical_rulings(text, {:vertical_rulings => vertical_rulings, :merge_words => true, :dontmerge => true}),
|
22
|
+
out)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
out.close
|
26
|
+
|
27
|
+
|
28
|
+
#with dontmerge false (i.e. if we merge) we get crap. STCITY and no spaces in any cities.
|
29
|
+
#with dontmerge true (or commented out), MORGANTOWNWV, and some spaces (e.g. BRYN MAWR, but not FRESHMEADOWS)
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.5
|
5
|
+
version: 0.6.6
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-
|
14
|
+
date: 2013-08-23 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
@@ -132,6 +132,8 @@ files:
|
|
132
132
|
- test/data/gre.pdf
|
133
133
|
- test/data/tabla_subsidios.pdf
|
134
134
|
- test/tests.rb
|
135
|
+
- vertical_rulings_bug.pdf
|
136
|
+
- vertical_rulings_bug.rb
|
135
137
|
homepage: https://github.com/jazzido/tabula-extractor
|
136
138
|
licenses:
|
137
139
|
- MIT
|