tabula-extractor 0.6.5-java → 0.6.6-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/tabula/entities.rb +4 -0
- data/lib/tabula/table_extractor.rb +79 -21
- data/lib/tabula/version.rb +1 -1
- data/vertical_rulings_bug.pdf +0 -0
- data/vertical_rulings_bug.rb +29 -0
- metadata +4 -2
data/lib/tabula/entities.rb
CHANGED
@@ -232,6 +232,10 @@ module Tabula
|
|
232
232
|
self.height = t.height
|
233
233
|
else
|
234
234
|
if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
|
235
|
+
#sometimes a space needs to be added here
|
236
|
+
unless in_same_column.vertically_overlaps?(t)
|
237
|
+
t.text = " " + t.text
|
238
|
+
end
|
235
239
|
in_same_column.merge!(t)
|
236
240
|
else
|
237
241
|
self.text_elements << t
|
data/lib/tabula/table_extractor.rb
CHANGED
@@ -14,7 +14,15 @@ module Tabula
|
|
14
14
|
def initialize(text_elements, options = {})
|
15
15
|
self.text_elements = text_elements
|
16
16
|
self.options = DEFAULT_OPTIONS.merge(options)
|
17
|
-
|
17
|
+
|
18
|
+
if self.options[:merge_words]
|
19
|
+
if self.options[:vertical_rulings]
|
20
|
+
merge_words_in_a_vertical_rulings_aware_manner!(self.options[:vertical_rulings])
|
21
|
+
else
|
22
|
+
merge_words!
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
18
26
|
end
|
19
27
|
|
20
28
|
def get_rows
|
@@ -49,7 +57,7 @@ module Tabula
|
|
49
57
|
if column = columns.detect { |c| te.horizontally_overlaps?(c) }
|
50
58
|
column << te
|
51
59
|
else
|
52
|
-
puts "couldn't find a place for #{te.inspect}"
|
60
|
+
#puts "couldn't find a place for #{te.inspect}"
|
53
61
|
#columns << Column.new(te.left, te.width, [te])
|
54
62
|
end
|
55
63
|
end
|
@@ -134,21 +142,69 @@ module Tabula
|
|
134
142
|
self.text_elements.compact!
|
135
143
|
return self.text_elements
|
136
144
|
end
|
145
|
+
|
146
|
+
#this is where spaces come from!
|
147
|
+
def merge_words_in_a_vertical_rulings_aware_manner!(vertical_rulings)
|
148
|
+
#don't merge words across a ruling.
|
149
|
+
|
150
|
+
return self.text_elements if @merged # only merge once. awful hack.
|
151
|
+
@merged = true
|
152
|
+
current_word_index = i = 0
|
153
|
+
char1 = self.text_elements[i]
|
154
|
+
vertical_ruling_locations = vertical_rulings.map &:left
|
155
|
+
|
156
|
+
while i < self.text_elements.size-1 do
|
157
|
+
|
158
|
+
char2 = self.text_elements[i+1]
|
159
|
+
|
160
|
+
next if char2.nil? or char1.nil?
|
161
|
+
|
162
|
+
if self.text_elements[current_word_index].should_merge?(char2)
|
163
|
+
unless vertical_ruling_locations.map{|loc| self.text_elements[current_word_index].left < loc && char2.left > loc}.include?(true)
|
164
|
+
self.text_elements[current_word_index].merge!(char2)
|
165
|
+
end
|
166
|
+
|
167
|
+
char1 = char2
|
168
|
+
self.text_elements[i+1] = nil
|
169
|
+
else
|
170
|
+
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
171
|
+
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
172
|
+
self.text_elements[current_word_index].text += " "
|
173
|
+
#self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
174
|
+
end
|
175
|
+
current_word_index = i+1
|
176
|
+
end
|
177
|
+
i += 1
|
178
|
+
end
|
179
|
+
self.text_elements.compact!
|
180
|
+
return self.text_elements
|
181
|
+
end
|
137
182
|
end
|
138
183
|
|
139
|
-
|
184
|
+
##
|
185
|
+
# Deprecated.
|
186
|
+
##
|
140
187
|
def Tabula.group_by_columns(text_elements, merge_words=false)
|
141
188
|
TableExtractor.new(text_elements, :merge_words => merge_words).group_by_columns
|
142
189
|
end
|
143
190
|
|
191
|
+
##
|
192
|
+
# Deprecated.
|
193
|
+
##
|
144
194
|
def Tabula.get_line_boundaries(text_elements)
|
145
195
|
TableExtractor.new(text_elements).get_line_boundaries
|
146
196
|
end
|
147
197
|
|
198
|
+
##
|
199
|
+
# Deprecated.
|
200
|
+
##
|
148
201
|
def Tabula.get_columns(text_elements, merge_words=true)
|
149
202
|
TableExtractor.new(text_elements, :merge_words => merge_words).get_columns
|
150
203
|
end
|
151
204
|
|
205
|
+
##
|
206
|
+
# Deprecated.
|
207
|
+
##
|
152
208
|
def Tabula.get_rows(text_elements, merge_words=true)
|
153
209
|
TableExtractor.new(text_elements, :merge_words => merge_words).get_rows
|
154
210
|
end
|
@@ -256,7 +312,7 @@ module Tabula
|
|
256
312
|
vertical_rulings = options[:vertical_rulings]
|
257
313
|
columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
|
258
314
|
|
259
|
-
# insert empty
|
315
|
+
# insert an empty cell in a given column if there's no text elements within that column's boundaries
|
260
316
|
lines.each_with_index do |l, line_index|
|
261
317
|
next if l.text_elements.nil?
|
262
318
|
l.text_elements.compact! # TODO WHY do I have to do this?
|
@@ -271,26 +327,28 @@ module Tabula
|
|
271
327
|
end
|
272
328
|
|
273
329
|
# merge elements that are in the same column
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
l.text_elements[t1].
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
330
|
+
unless options[:dontmerge]
|
331
|
+
lines.each_with_index do |l, line_index|
|
332
|
+
next if l.text_elements.nil?
|
333
|
+
|
334
|
+
(0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
|
335
|
+
next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
|
336
|
+
|
337
|
+
# if same column...
|
338
|
+
if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
|
339
|
+
== columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
|
340
|
+
if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
|
341
|
+
l.text_elements[t1].merge!(l.text_elements[t2])
|
342
|
+
l.text_elements[t2] = nil
|
343
|
+
else
|
344
|
+
l.text_elements[t2].merge!(l.text_elements[t1])
|
345
|
+
l.text_elements[t1] = nil
|
346
|
+
end
|
289
347
|
end
|
290
348
|
end
|
291
|
-
end
|
292
349
|
|
293
|
-
|
350
|
+
l.text_elements.compact!
|
351
|
+
end
|
294
352
|
end
|
295
353
|
|
296
354
|
# remove duplicate lines
|
data/lib/tabula/version.rb
CHANGED
data/vertical_rulings_bug.pdf
Binary file
data/vertical_rulings_bug.rb
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require './lib/tabula'
|
2
|
+
|
3
|
+
input_filename = "vertical_rulings_bug.pdf"
|
4
|
+
out = File.new("output.xls", 'w')
|
5
|
+
|
6
|
+
extractor = Tabula::Extraction::CharacterExtractor.new(input_filename, :all) #:all ) # 1..2643
|
7
|
+
extractor.extract.each_with_index do |pdf_page, page_index|
|
8
|
+
|
9
|
+
lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(input_filename, page_index))
|
10
|
+
page_areas = [[0, 0, 1000, 1700]]
|
11
|
+
|
12
|
+
scale_factor = pdf_page.width / 1700
|
13
|
+
puts scale_factor
|
14
|
+
|
15
|
+
vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Geometry::Segment.new_by_arrays([n * scale_factor, 0], [n * scale_factor, 1000])}
|
16
|
+
|
17
|
+
page_areas.each do |page_area|
|
18
|
+
text = pdf_page.get_text( page_area ) #all the characters within the given area.
|
19
|
+
|
20
|
+
Tabula::Writers.send(:TSV,
|
21
|
+
Tabula.make_table_with_vertical_rulings(text, {:vertical_rulings => vertical_rulings, :merge_words => true, :dontmerge => true}),
|
22
|
+
out)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
out.close
|
26
|
+
|
27
|
+
|
28
|
+
#with dontmerge false (i.e. if we merge) we get crap. STCITY and no spaces in any cities.
|
29
|
+
#with dontmerge true (or commented out), MORGANTOWNWV, and some spaces (e.g. BRYN MAWR, but not FRESHMEADOWS)
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.5
|
5
|
+
version: 0.6.6
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-
|
14
|
+
date: 2013-08-23 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
@@ -132,6 +132,8 @@ files:
|
|
132
132
|
- test/data/gre.pdf
|
133
133
|
- test/data/tabla_subsidios.pdf
|
134
134
|
- test/tests.rb
|
135
|
+
- vertical_rulings_bug.pdf
|
136
|
+
- vertical_rulings_bug.rb
|
135
137
|
homepage: https://github.com/jazzido/tabula-extractor
|
136
138
|
licenses:
|
137
139
|
- MIT
|