tabula-extractor 0.6.3-java → 0.6.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/bin/tabula +15 -9
- data/ext/Makefile.OSX +4 -1
- data/ext/liblsd.dylib +0 -0
- data/lib/tabula.rb +1 -0
- data/lib/tabula/core_ext.rb +6 -2
- data/lib/tabula/entities.rb +39 -3
- data/lib/tabula/line_segment_detector.rb +8 -2
- data/lib/tabula/pdf_dump.rb +21 -8
- data/lib/tabula/table_extractor.rb +40 -80
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +1 -1
- data/tabula-extractor.gemspec +2 -0
- data/test/tests.rb +0 -1
- metadata +24 -2
data/Rakefile
CHANGED
data/bin/tabula
CHANGED
@@ -34,11 +34,13 @@ EOS
|
|
34
34
|
|
35
35
|
opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
|
36
36
|
opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
|
37
|
+
opt :password, 'Password to decrypt document. Default is empty', :default => ''
|
37
38
|
opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
|
39
|
+
opt :debug, 'Print detected table areas instead of processing.'
|
38
40
|
opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
|
39
41
|
opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
|
40
42
|
end
|
41
|
-
|
43
|
+
|
42
44
|
if !opts[:area].nil?
|
43
45
|
unless opts[:area].split(',').size == 4 \
|
44
46
|
&& opts[:area].split(',').all? { |x| x.strip =~ /(\d+\.?\d*)/ }
|
@@ -59,25 +61,29 @@ def main
|
|
59
61
|
|
60
62
|
area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
|
61
63
|
out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
|
62
|
-
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
|
64
|
+
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
|
63
65
|
extractor.extract.each_with_index do |page, page_index|
|
64
|
-
if opts[:guess]
|
66
|
+
if opts[:guess]
|
65
67
|
lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
|
66
|
-
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
68
|
+
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
67
69
|
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
68
70
|
else
|
69
71
|
page_areas = [area]
|
70
72
|
end
|
71
73
|
|
72
74
|
page_areas.each do |page_area|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
75
|
+
if opts[:guess] && opts[:debug]
|
76
|
+
puts (page_index + 1).to_s + ', ' + page_area.to_s
|
77
|
+
else
|
78
|
+
text = page.get_text( page_area )
|
79
|
+
Tabula::Writers.send(opts[:format].to_sym,
|
80
|
+
Tabula.make_table(text),
|
81
|
+
out)
|
82
|
+
end
|
83
|
+
|
77
84
|
end
|
78
85
|
end
|
79
86
|
out.close
|
80
87
|
end
|
81
88
|
|
82
89
|
main
|
83
|
-
|
data/ext/Makefile.OSX
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
include Makefile.defaults
|
2
2
|
|
3
|
+
|
4
|
+
CFLAGS := -arch i386 -arch x86_64 -fPIC -O3 -g -Wall -Werror
|
5
|
+
|
3
6
|
lib: lib$(NAME).$(VERSION).dylib
|
4
7
|
|
5
8
|
lib$(NAME).$(VERSION).dylib: $(NAME).o
|
6
|
-
$(CC) -dynamiclib -lm -o lib$(NAME).dylib $^
|
9
|
+
$(CC) -arch i386 -arch x86_64 -dynamiclib -lm -o lib$(NAME).dylib $^
|
7
10
|
|
8
11
|
clean:
|
9
12
|
$(RM) *.o
|
data/ext/liblsd.dylib
CHANGED
Binary file
|
data/lib/tabula.rb
CHANGED
data/lib/tabula/core_ext.rb
CHANGED
@@ -10,7 +10,7 @@ module Enumerable
|
|
10
10
|
|
11
11
|
def sample_variance
|
12
12
|
m = self.mean
|
13
|
-
sum = self.inject(0){|accum, i| accum +(i-m)**2 }
|
13
|
+
sum = self.inject(0) {|accum, i| accum + (i-m)**2 }
|
14
14
|
sum/(self.length - 1).to_f
|
15
15
|
end
|
16
16
|
|
@@ -18,4 +18,8 @@ module Enumerable
|
|
18
18
|
return Math.sqrt(self.sample_variance)
|
19
19
|
end
|
20
20
|
|
21
|
-
|
21
|
+
def sorted?
|
22
|
+
each_cons(2).all? { |a, b| (a <=> b) <= 0 }
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
data/lib/tabula/entities.rb
CHANGED
@@ -70,6 +70,22 @@ module Tabula
|
|
70
70
|
intersection_area / union_area
|
71
71
|
end
|
72
72
|
|
73
|
+
# as defined by PDF-TREX paper
|
74
|
+
def horizontal_overlap_ratio(other)
|
75
|
+
delta = [self.bottom - self.top, other.bottom - other.top].min
|
76
|
+
if [other.top, self.top, other.bottom, self.bottom].sorted?
|
77
|
+
(other.bottom - self.top) / delta
|
78
|
+
elsif [self.top, other.top, self.bottom, other.bottom].sorted?
|
79
|
+
(self.bottom - other.top) / delta
|
80
|
+
elsif [self.top, other.top, other.bottom, self.bottom].sorted?
|
81
|
+
(other.bottom - other.top) / delta
|
82
|
+
elsif [other.top, self.top, self.bottom, other.bottom].sorted?
|
83
|
+
(self.bottom - self.top) / delta
|
84
|
+
else
|
85
|
+
0
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
73
89
|
def to_h
|
74
90
|
hash = {}
|
75
91
|
[:top, :left, :width, :height].each do |m|
|
@@ -99,8 +115,8 @@ module Tabula
|
|
99
115
|
|
100
116
|
# spaces are not detected, b/c they have height == 0
|
101
117
|
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
102
|
-
# self.texts.select { |t| t.overlaps? ze }
|
103
|
-
self.texts.select do |t|
|
118
|
+
# self.texts.select { |t| t.overlaps? ze }
|
119
|
+
self.texts.select do |t|
|
104
120
|
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
105
121
|
end
|
106
122
|
end
|
@@ -179,12 +195,32 @@ module Tabula
|
|
179
195
|
end
|
180
196
|
end
|
181
197
|
|
198
|
+
class Table
|
199
|
+
attr_reader :lines
|
200
|
+
def initialize(line_count, separators)
|
201
|
+
@separators = separators
|
202
|
+
@lines = (0...line_count).inject([]) { |m| m << Line.new }
|
203
|
+
end
|
204
|
+
|
205
|
+
def add_text_element(text_element, i, j)
|
206
|
+
if @lines.size <= i
|
207
|
+
@lines[i] = Line.new
|
208
|
+
end
|
209
|
+
if @lines[i].text_elements[j]
|
210
|
+
@lines[i].text_elements[j].merge!(text_element)
|
211
|
+
else
|
212
|
+
@lines[i].text_elements[j] = text_element
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
182
216
|
|
183
217
|
class Line < ZoneEntity
|
184
218
|
attr_accessor :text_elements
|
219
|
+
attr_reader :index
|
185
220
|
|
186
|
-
def initialize
|
221
|
+
def initialize(index=nil)
|
187
222
|
self.text_elements = []
|
223
|
+
@index = index
|
188
224
|
end
|
189
225
|
|
190
226
|
def <<(t)
|
data/lib/tabula/line_segment_detector.rb
CHANGED
@@ -5,6 +5,7 @@ require 'ffi'
|
|
5
5
|
|
6
6
|
require_relative './entities'
|
7
7
|
require_relative './pdf_render'
|
8
|
+
require_relative './pdf_dump'
|
8
9
|
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
9
10
|
|
10
11
|
java_import javax.imageio.ImageIO
|
@@ -45,7 +46,7 @@ module Tabula
|
|
45
46
|
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
46
47
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
47
48
|
|
48
|
-
pdf_file =
|
49
|
+
pdf_file = Extraction.openPDF(pdf_path)
|
49
50
|
page = pdf_file.getDocumentCatalog.getAllPages[page_number]
|
50
51
|
bi = Tabula::Render.pageToBufferedImage(page,
|
51
52
|
options[:image_size])
|
@@ -62,9 +63,14 @@ module Tabula
|
|
62
63
|
image
|
63
64
|
elsif image.class == String
|
64
65
|
ImageIO.read(java.io.File.new(image))
|
65
|
-
|
66
|
+
else
|
66
67
|
raise ArgumentError, 'image must be a string or a BufferedImage'
|
67
68
|
end
|
69
|
+
|
70
|
+
ImageIO.write(bimage,
|
71
|
+
'png',
|
72
|
+
java.io.File.new("/tmp/white.png"))
|
73
|
+
|
68
74
|
image = LSD.image_to_image_double(bimage)
|
69
75
|
|
70
76
|
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
data/lib/tabula/pdf_dump.rb
CHANGED
@@ -7,9 +7,22 @@ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
|
7
7
|
java_import org.apache.pdfbox.pdfparser.PDFParser
|
8
8
|
java_import org.apache.pdfbox.pdmodel.PDDocument
|
9
9
|
java_import org.apache.pdfbox.util.PDFTextStripper
|
10
|
+
java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
|
10
11
|
|
11
12
|
module Tabula
|
13
|
+
|
12
14
|
module Extraction
|
15
|
+
|
16
|
+
def Extraction.openPDF(pdf_filename, password='')
|
17
|
+
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
18
|
+
document = PDDocument.load(pdf_filename)
|
19
|
+
if document.isEncrypted
|
20
|
+
sdm = StandardDecryptionMaterial.new(password)
|
21
|
+
document.openProtection(sdm)
|
22
|
+
end
|
23
|
+
document
|
24
|
+
end
|
25
|
+
|
13
26
|
class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
|
14
27
|
|
15
28
|
attr_accessor :characters, :fonts
|
@@ -28,8 +41,9 @@ module Tabula
|
|
28
41
|
end
|
29
42
|
|
30
43
|
|
44
|
+
|
31
45
|
def processTextPosition(text)
|
32
|
-
#
|
46
|
+
# return if text.getCharacter == ' '
|
33
47
|
|
34
48
|
# text_font = text.getFont
|
35
49
|
# text_size = text.getFontSize
|
@@ -49,9 +63,8 @@ module Tabula
|
|
49
63
|
end
|
50
64
|
|
51
65
|
class PagesInfoExtractor
|
52
|
-
def initialize(pdf_filename)
|
53
|
-
|
54
|
-
@pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
|
66
|
+
def initialize(pdf_filename, password='')
|
67
|
+
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
55
68
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
56
69
|
end
|
57
70
|
|
@@ -60,7 +73,7 @@ module Tabula
|
|
60
73
|
begin
|
61
74
|
@all_pages.each_with_index do |page, i|
|
62
75
|
contents = page.getContents
|
63
|
-
next if contents.nil?
|
76
|
+
# next if contents.nil?
|
64
77
|
y.yield Tabula::Page.new(page.findCropBox.width,
|
65
78
|
page.findCropBox.height,
|
66
79
|
page.getRotation.to_i,
|
@@ -78,9 +91,9 @@ module Tabula
|
|
78
91
|
include Observable
|
79
92
|
|
80
93
|
#N.B. pages can be :all, a list of pages or a range.
|
81
|
-
def initialize(pdf_filename, pages=[1])
|
94
|
+
def initialize(pdf_filename, pages=[1], password='')
|
82
95
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
83
|
-
@pdf_file =
|
96
|
+
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
84
97
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
85
98
|
@pages = pages == :all ? (1..@all_pages.size) : pages
|
86
99
|
@extractor = TextExtractor.new
|
@@ -105,7 +118,7 @@ module Tabula
|
|
105
118
|
char.getXDirAdj.round(2),
|
106
119
|
char.getWidthDirAdj.round(2),
|
107
120
|
char.getHeightDir.round(2),
|
108
|
-
|
121
|
+
char.getFont,
|
109
122
|
char.getFontSize.round(2),
|
110
123
|
char.getCharacter,
|
111
124
|
char.getWidthOfSpace)
|
data/lib/tabula/table_extractor.rb
CHANGED
@@ -115,12 +115,9 @@ module Tabula
|
|
115
115
|
|
116
116
|
char2 = self.text_elements[i+1]
|
117
117
|
|
118
|
-
|
119
|
-
|
120
118
|
next if char2.nil? or char1.nil?
|
121
119
|
|
122
120
|
if self.text_elements[current_word_index].should_merge?(char2)
|
123
|
-
#puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
|
124
121
|
self.text_elements[current_word_index].merge!(char2)
|
125
122
|
char1 = char2
|
126
123
|
self.text_elements[i+1] = nil
|
@@ -166,92 +163,60 @@ module Tabula
|
|
166
163
|
|
167
164
|
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
168
165
|
|
169
|
-
|
170
|
-
def Tabula.make_table(text_elements, options={})
|
171
|
-
extractor = TableExtractor.new(text_elements, options)
|
172
|
-
|
173
|
-
# group by lines
|
166
|
+
def Tabula.group_by_lines(text_elements)
|
174
167
|
lines = []
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
line_members = text_elements.find_all do |te|
|
183
|
-
te.vertically_overlaps?(lb)
|
184
|
-
end
|
185
|
-
|
186
|
-
text_elements -= line_members
|
187
|
-
|
188
|
-
line_members.sort_by(&:left).each do |te|
|
189
|
-
# skip text_elements that only contain spaces
|
190
|
-
next if te.text =~ ONLY_SPACES_RE
|
191
|
-
line << te
|
168
|
+
text_elements.each do |te|
|
169
|
+
next if te.text =~ ONLY_SPACES_RE
|
170
|
+
l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
|
171
|
+
if l.nil?
|
172
|
+
l = Line.new
|
173
|
+
lines << l
|
192
174
|
end
|
193
|
-
|
194
|
-
lines << line if line.text_elements.size > 0
|
175
|
+
l << te
|
195
176
|
end
|
177
|
+
lines
|
178
|
+
end
|
196
179
|
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
180
|
+
# Returns an array of Tabula::Line
|
181
|
+
def Tabula.make_table(text_elements, options={})
|
182
|
+
default_options = {:separators => []}
|
183
|
+
options = default_options.merge(options)
|
184
|
+
|
185
|
+
extractor = TableExtractor.new(text_elements, options).text_elements
|
186
|
+
lines = group_by_lines(text_elements)
|
187
|
+
top = lines[0].text_elements.map(&:top).min
|
188
|
+
right = 0
|
189
|
+
columns = []
|
190
|
+
|
191
|
+
text_elements.sort_by(&:left).each do |te|
|
192
|
+
next if te.text =~ ONLY_SPACES_RE
|
193
|
+
if te.top >= top
|
194
|
+
left = te.left
|
195
|
+
if (left > right)
|
196
|
+
columns << right
|
197
|
+
right = te.right
|
198
|
+
elsif te.right > right
|
199
|
+
right = te.right
|
213
200
|
end
|
214
201
|
end
|
215
202
|
end
|
216
203
|
|
217
|
-
|
218
|
-
lines.each_with_index do |l, line_index|
|
219
|
-
next if l.text_elements.nil?
|
220
|
-
|
221
|
-
(0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
|
222
|
-
next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
|
204
|
+
separators = columns[1..-1].sort.reverse
|
223
205
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
l.text_elements[t2] = nil
|
230
|
-
else
|
231
|
-
l.text_elements[t2].merge!(l.text_elements[t1])
|
232
|
-
l.text_elements[t1] = nil
|
233
|
-
end
|
234
|
-
end
|
206
|
+
table = Table.new(lines.count, separators)
|
207
|
+
lines.each_with_index do |line, i|
|
208
|
+
line.text_elements.each do |te|
|
209
|
+
j = separators.find_index { |s| te.left > s } || separators.count
|
210
|
+
table.add_text_element(te, i, separators.count - j)
|
235
211
|
end
|
236
|
-
|
237
|
-
l.text_elements.compact!
|
238
212
|
end
|
239
213
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
next if lines[i].nil?
|
245
|
-
# if any of the elements on the next line is duplicated, kill
|
246
|
-
# the next line
|
247
|
-
if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
|
248
|
-
lines[i+1] = nil
|
249
|
-
end
|
214
|
+
table.lines.map do |l|
|
215
|
+
l.text_elements.map! { |te|
|
216
|
+
te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
|
217
|
+
}
|
250
218
|
end
|
251
219
|
|
252
|
-
lines.compact.map do |line|
|
253
|
-
line.text_elements.sort_by(&:left)
|
254
|
-
end
|
255
220
|
end
|
256
221
|
|
257
222
|
|
@@ -340,9 +305,4 @@ module Tabula
|
|
340
305
|
line.text_elements.sort_by(&:left)
|
341
306
|
end
|
342
307
|
end
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
308
|
end
|
data/lib/tabula/version.rb
CHANGED
data/lib/tabula/writers.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -22,6 +22,8 @@ Gem::Specification.new do |s|
|
|
22
22
|
|
23
23
|
s.add_development_dependency 'minitest'
|
24
24
|
s.add_development_dependency 'bundler', '>= 1.3.4'
|
25
|
+
s.add_development_dependency 'ruby-debug'
|
25
26
|
|
26
27
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
28
|
+
# s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
|
27
29
|
end
|
data/test/tests.rb
CHANGED
@@ -114,7 +114,6 @@ class TestExtractor < Minitest::Test
|
|
114
114
|
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
115
115
|
vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
|
116
116
|
|
117
|
-
|
118
117
|
characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
|
119
118
|
#top left bottom right
|
120
119
|
expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.3
|
5
|
+
version: 0.6.4
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-
|
14
|
+
date: 2013-07-09 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
@@ -45,6 +45,22 @@ dependencies:
|
|
45
45
|
none: false
|
46
46
|
prerelease: false
|
47
47
|
type: :development
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: ruby-debug
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
none: false
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
none: false
|
62
|
+
prerelease: false
|
63
|
+
type: :development
|
48
64
|
- !ruby/object:Gem::Dependency
|
49
65
|
name: trollop
|
50
66
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -126,12 +142,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
142
|
requirements:
|
127
143
|
- - '>='
|
128
144
|
- !ruby/object:Gem::Version
|
145
|
+
segments:
|
146
|
+
- 0
|
147
|
+
hash: 2
|
129
148
|
version: '0'
|
130
149
|
none: false
|
131
150
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
151
|
requirements:
|
133
152
|
- - '>='
|
134
153
|
- !ruby/object:Gem::Version
|
154
|
+
segments:
|
155
|
+
- 0
|
156
|
+
hash: 2
|
135
157
|
version: '0'
|
136
158
|
none: false
|
137
159
|
requirements: []
|