tabula-extractor 0.6.3-java → 0.6.4-java
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/bin/tabula +15 -9
- data/ext/Makefile.OSX +4 -1
- data/ext/liblsd.dylib +0 -0
- data/lib/tabula.rb +1 -0
- data/lib/tabula/core_ext.rb +6 -2
- data/lib/tabula/entities.rb +39 -3
- data/lib/tabula/line_segment_detector.rb +8 -2
- data/lib/tabula/pdf_dump.rb +21 -8
- data/lib/tabula/table_extractor.rb +40 -80
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +1 -1
- data/tabula-extractor.gemspec +2 -0
- data/test/tests.rb +0 -1
- metadata +24 -2
data/Rakefile
CHANGED
data/bin/tabula
CHANGED
@@ -34,11 +34,13 @@ EOS
|
|
34
34
|
|
35
35
|
opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
|
36
36
|
opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
|
37
|
+
opt :password, 'Password to decrypt document. Default is empty', :default => ''
|
37
38
|
opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
|
39
|
+
opt :debug, 'Print detected table areas instead of processing.'
|
38
40
|
opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
|
39
41
|
opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
|
40
42
|
end
|
41
|
-
|
43
|
+
|
42
44
|
if !opts[:area].nil?
|
43
45
|
unless opts[:area].split(',').size == 4 \
|
44
46
|
&& opts[:area].split(',').all? { |x| x.strip =~ /(\d+\.?\d*)/ }
|
@@ -59,25 +61,29 @@ def main
|
|
59
61
|
|
60
62
|
area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
|
61
63
|
out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
|
62
|
-
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
|
64
|
+
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
|
63
65
|
extractor.extract.each_with_index do |page, page_index|
|
64
|
-
if opts[:guess]
|
66
|
+
if opts[:guess]
|
65
67
|
lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
|
66
|
-
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
68
|
+
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
67
69
|
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
68
70
|
else
|
69
71
|
page_areas = [area]
|
70
72
|
end
|
71
73
|
|
72
74
|
page_areas.each do |page_area|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
75
|
+
if opts[:guess] && opts[:debug]
|
76
|
+
puts (page_index + 1).to_s + ', ' + page_area.to_s
|
77
|
+
else
|
78
|
+
text = page.get_text( page_area )
|
79
|
+
Tabula::Writers.send(opts[:format].to_sym,
|
80
|
+
Tabula.make_table(text),
|
81
|
+
out)
|
82
|
+
end
|
83
|
+
|
77
84
|
end
|
78
85
|
end
|
79
86
|
out.close
|
80
87
|
end
|
81
88
|
|
82
89
|
main
|
83
|
-
|
data/ext/Makefile.OSX
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
include Makefile.defaults
|
2
2
|
|
3
|
+
|
4
|
+
CFLAGS := -arch i386 -arch x86_64 -fPIC -O3 -g -Wall -Werror
|
5
|
+
|
3
6
|
lib: lib$(NAME).$(VERSION).dylib
|
4
7
|
|
5
8
|
lib$(NAME).$(VERSION).dylib: $(NAME).o
|
6
|
-
$(CC) -dynamiclib -lm -o lib$(NAME).dylib $^
|
9
|
+
$(CC) -arch i386 -arch x86_64 -dynamiclib -lm -o lib$(NAME).dylib $^
|
7
10
|
|
8
11
|
clean:
|
9
12
|
$(RM) *.o
|
data/ext/liblsd.dylib
CHANGED
Binary file
|
data/lib/tabula.rb
CHANGED
data/lib/tabula/core_ext.rb
CHANGED
@@ -10,7 +10,7 @@ module Enumerable
|
|
10
10
|
|
11
11
|
def sample_variance
|
12
12
|
m = self.mean
|
13
|
-
sum = self.inject(0){|accum, i| accum +(i-m)**2 }
|
13
|
+
sum = self.inject(0) {|accum, i| accum + (i-m)**2 }
|
14
14
|
sum/(self.length - 1).to_f
|
15
15
|
end
|
16
16
|
|
@@ -18,4 +18,8 @@ module Enumerable
|
|
18
18
|
return Math.sqrt(self.sample_variance)
|
19
19
|
end
|
20
20
|
|
21
|
-
|
21
|
+
def sorted?
|
22
|
+
each_cons(2).all? { |a, b| (a <=> b) <= 0 }
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
data/lib/tabula/entities.rb
CHANGED
@@ -70,6 +70,22 @@ module Tabula
|
|
70
70
|
intersection_area / union_area
|
71
71
|
end
|
72
72
|
|
73
|
+
# as defined by PDF-TREX paper
|
74
|
+
def horizontal_overlap_ratio(other)
|
75
|
+
delta = [self.bottom - self.top, other.bottom - other.top].min
|
76
|
+
if [other.top, self.top, other.bottom, self.bottom].sorted?
|
77
|
+
(other.bottom - self.top) / delta
|
78
|
+
elsif [self.top, other.top, self.bottom, other.bottom].sorted?
|
79
|
+
(self.bottom - other.top) / delta
|
80
|
+
elsif [self.top, other.top, other.bottom, self.bottom].sorted?
|
81
|
+
(other.bottom - other.top) / delta
|
82
|
+
elsif [other.top, self.top, self.bottom, other.bottom].sorted?
|
83
|
+
(self.bottom - self.top) / delta
|
84
|
+
else
|
85
|
+
0
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
73
89
|
def to_h
|
74
90
|
hash = {}
|
75
91
|
[:top, :left, :width, :height].each do |m|
|
@@ -99,8 +115,8 @@ module Tabula
|
|
99
115
|
|
100
116
|
# spaces are not detected, b/c they have height == 0
|
101
117
|
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
102
|
-
# self.texts.select { |t| t.overlaps? ze }
|
103
|
-
self.texts.select do |t|
|
118
|
+
# self.texts.select { |t| t.overlaps? ze }
|
119
|
+
self.texts.select do |t|
|
104
120
|
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
105
121
|
end
|
106
122
|
end
|
@@ -179,12 +195,32 @@ module Tabula
|
|
179
195
|
end
|
180
196
|
end
|
181
197
|
|
198
|
+
class Table
|
199
|
+
attr_reader :lines
|
200
|
+
def initialize(line_count, separators)
|
201
|
+
@separators = separators
|
202
|
+
@lines = (0...line_count).inject([]) { |m| m << Line.new }
|
203
|
+
end
|
204
|
+
|
205
|
+
def add_text_element(text_element, i, j)
|
206
|
+
if @lines.size <= i
|
207
|
+
@lines[i] = Line.new
|
208
|
+
end
|
209
|
+
if @lines[i].text_elements[j]
|
210
|
+
@lines[i].text_elements[j].merge!(text_element)
|
211
|
+
else
|
212
|
+
@lines[i].text_elements[j] = text_element
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
182
216
|
|
183
217
|
class Line < ZoneEntity
|
184
218
|
attr_accessor :text_elements
|
219
|
+
attr_reader :index
|
185
220
|
|
186
|
-
def initialize
|
221
|
+
def initialize(index=nil)
|
187
222
|
self.text_elements = []
|
223
|
+
@index = index
|
188
224
|
end
|
189
225
|
|
190
226
|
def <<(t)
|
@@ -5,6 +5,7 @@ require 'ffi'
|
|
5
5
|
|
6
6
|
require_relative './entities'
|
7
7
|
require_relative './pdf_render'
|
8
|
+
require_relative './pdf_dump'
|
8
9
|
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
9
10
|
|
10
11
|
java_import javax.imageio.ImageIO
|
@@ -45,7 +46,7 @@ module Tabula
|
|
45
46
|
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
46
47
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
47
48
|
|
48
|
-
pdf_file =
|
49
|
+
pdf_file = Extraction.openPDF(pdf_path)
|
49
50
|
page = pdf_file.getDocumentCatalog.getAllPages[page_number]
|
50
51
|
bi = Tabula::Render.pageToBufferedImage(page,
|
51
52
|
options[:image_size])
|
@@ -62,9 +63,14 @@ module Tabula
|
|
62
63
|
image
|
63
64
|
elsif image.class == String
|
64
65
|
ImageIO.read(java.io.File.new(image))
|
65
|
-
|
66
|
+
else
|
66
67
|
raise ArgumentError, 'image must be a string or a BufferedImage'
|
67
68
|
end
|
69
|
+
|
70
|
+
ImageIO.write(bimage,
|
71
|
+
'png',
|
72
|
+
java.io.File.new("/tmp/white.png"))
|
73
|
+
|
68
74
|
image = LSD.image_to_image_double(bimage)
|
69
75
|
|
70
76
|
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
data/lib/tabula/pdf_dump.rb
CHANGED
@@ -7,9 +7,22 @@ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
|
7
7
|
java_import org.apache.pdfbox.pdfparser.PDFParser
|
8
8
|
java_import org.apache.pdfbox.pdmodel.PDDocument
|
9
9
|
java_import org.apache.pdfbox.util.PDFTextStripper
|
10
|
+
java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
|
10
11
|
|
11
12
|
module Tabula
|
13
|
+
|
12
14
|
module Extraction
|
15
|
+
|
16
|
+
def Extraction.openPDF(pdf_filename, password='')
|
17
|
+
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
18
|
+
document = PDDocument.load(pdf_filename)
|
19
|
+
if document.isEncrypted
|
20
|
+
sdm = StandardDecryptionMaterial.new(password)
|
21
|
+
document.openProtection(sdm)
|
22
|
+
end
|
23
|
+
document
|
24
|
+
end
|
25
|
+
|
13
26
|
class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
|
14
27
|
|
15
28
|
attr_accessor :characters, :fonts
|
@@ -28,8 +41,9 @@ module Tabula
|
|
28
41
|
end
|
29
42
|
|
30
43
|
|
44
|
+
|
31
45
|
def processTextPosition(text)
|
32
|
-
#
|
46
|
+
# return if text.getCharacter == ' '
|
33
47
|
|
34
48
|
# text_font = text.getFont
|
35
49
|
# text_size = text.getFontSize
|
@@ -49,9 +63,8 @@ module Tabula
|
|
49
63
|
end
|
50
64
|
|
51
65
|
class PagesInfoExtractor
|
52
|
-
def initialize(pdf_filename)
|
53
|
-
|
54
|
-
@pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
|
66
|
+
def initialize(pdf_filename, password='')
|
67
|
+
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
55
68
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
56
69
|
end
|
57
70
|
|
@@ -60,7 +73,7 @@ module Tabula
|
|
60
73
|
begin
|
61
74
|
@all_pages.each_with_index do |page, i|
|
62
75
|
contents = page.getContents
|
63
|
-
next if contents.nil?
|
76
|
+
# next if contents.nil?
|
64
77
|
y.yield Tabula::Page.new(page.findCropBox.width,
|
65
78
|
page.findCropBox.height,
|
66
79
|
page.getRotation.to_i,
|
@@ -78,9 +91,9 @@ module Tabula
|
|
78
91
|
include Observable
|
79
92
|
|
80
93
|
#N.B. pages can be :all, a list of pages or a range.
|
81
|
-
def initialize(pdf_filename, pages=[1])
|
94
|
+
def initialize(pdf_filename, pages=[1], password='')
|
82
95
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
83
|
-
@pdf_file =
|
96
|
+
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
84
97
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
85
98
|
@pages = pages == :all ? (1..@all_pages.size) : pages
|
86
99
|
@extractor = TextExtractor.new
|
@@ -105,7 +118,7 @@ module Tabula
|
|
105
118
|
char.getXDirAdj.round(2),
|
106
119
|
char.getWidthDirAdj.round(2),
|
107
120
|
char.getHeightDir.round(2),
|
108
|
-
|
121
|
+
char.getFont,
|
109
122
|
char.getFontSize.round(2),
|
110
123
|
char.getCharacter,
|
111
124
|
char.getWidthOfSpace)
|
@@ -115,12 +115,9 @@ module Tabula
|
|
115
115
|
|
116
116
|
char2 = self.text_elements[i+1]
|
117
117
|
|
118
|
-
|
119
|
-
|
120
118
|
next if char2.nil? or char1.nil?
|
121
119
|
|
122
120
|
if self.text_elements[current_word_index].should_merge?(char2)
|
123
|
-
#puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
|
124
121
|
self.text_elements[current_word_index].merge!(char2)
|
125
122
|
char1 = char2
|
126
123
|
self.text_elements[i+1] = nil
|
@@ -166,92 +163,60 @@ module Tabula
|
|
166
163
|
|
167
164
|
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
168
165
|
|
169
|
-
|
170
|
-
def Tabula.make_table(text_elements, options={})
|
171
|
-
extractor = TableExtractor.new(text_elements, options)
|
172
|
-
|
173
|
-
# group by lines
|
166
|
+
def Tabula.group_by_lines(text_elements)
|
174
167
|
lines = []
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
line_members = text_elements.find_all do |te|
|
183
|
-
te.vertically_overlaps?(lb)
|
184
|
-
end
|
185
|
-
|
186
|
-
text_elements -= line_members
|
187
|
-
|
188
|
-
line_members.sort_by(&:left).each do |te|
|
189
|
-
# skip text_elements that only contain spaces
|
190
|
-
next if te.text =~ ONLY_SPACES_RE
|
191
|
-
line << te
|
168
|
+
text_elements.each do |te|
|
169
|
+
next if te.text =~ ONLY_SPACES_RE
|
170
|
+
l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
|
171
|
+
if l.nil?
|
172
|
+
l = Line.new
|
173
|
+
lines << l
|
192
174
|
end
|
193
|
-
|
194
|
-
lines << line if line.text_elements.size > 0
|
175
|
+
l << te
|
195
176
|
end
|
177
|
+
lines
|
178
|
+
end
|
196
179
|
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
180
|
+
# Returns an array of Tabula::Line
|
181
|
+
def Tabula.make_table(text_elements, options={})
|
182
|
+
default_options = {:separators => []}
|
183
|
+
options = default_options.merge(options)
|
184
|
+
|
185
|
+
extractor = TableExtractor.new(text_elements, options).text_elements
|
186
|
+
lines = group_by_lines(text_elements)
|
187
|
+
top = lines[0].text_elements.map(&:top).min
|
188
|
+
right = 0
|
189
|
+
columns = []
|
190
|
+
|
191
|
+
text_elements.sort_by(&:left).each do |te|
|
192
|
+
next if te.text =~ ONLY_SPACES_RE
|
193
|
+
if te.top >= top
|
194
|
+
left = te.left
|
195
|
+
if (left > right)
|
196
|
+
columns << right
|
197
|
+
right = te.right
|
198
|
+
elsif te.right > right
|
199
|
+
right = te.right
|
213
200
|
end
|
214
201
|
end
|
215
202
|
end
|
216
203
|
|
217
|
-
|
218
|
-
lines.each_with_index do |l, line_index|
|
219
|
-
next if l.text_elements.nil?
|
220
|
-
|
221
|
-
(0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
|
222
|
-
next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
|
204
|
+
separators = columns[1..-1].sort.reverse
|
223
205
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
l.text_elements[t2] = nil
|
230
|
-
else
|
231
|
-
l.text_elements[t2].merge!(l.text_elements[t1])
|
232
|
-
l.text_elements[t1] = nil
|
233
|
-
end
|
234
|
-
end
|
206
|
+
table = Table.new(lines.count, separators)
|
207
|
+
lines.each_with_index do |line, i|
|
208
|
+
line.text_elements.each do |te|
|
209
|
+
j = separators.find_index { |s| te.left > s } || separators.count
|
210
|
+
table.add_text_element(te, i, separators.count - j)
|
235
211
|
end
|
236
|
-
|
237
|
-
l.text_elements.compact!
|
238
212
|
end
|
239
213
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
next if lines[i].nil?
|
245
|
-
# if any of the elements on the next line is duplicated, kill
|
246
|
-
# the next line
|
247
|
-
if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
|
248
|
-
lines[i+1] = nil
|
249
|
-
end
|
214
|
+
table.lines.map do |l|
|
215
|
+
l.text_elements.map! { |te|
|
216
|
+
te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
|
217
|
+
}
|
250
218
|
end
|
251
219
|
|
252
|
-
lines.compact.map do |line|
|
253
|
-
line.text_elements.sort_by(&:left)
|
254
|
-
end
|
255
220
|
end
|
256
221
|
|
257
222
|
|
@@ -340,9 +305,4 @@ module Tabula
|
|
340
305
|
line.text_elements.sort_by(&:left)
|
341
306
|
end
|
342
307
|
end
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
308
|
end
|
data/lib/tabula/version.rb
CHANGED
data/lib/tabula/writers.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -22,6 +22,8 @@ Gem::Specification.new do |s|
|
|
22
22
|
|
23
23
|
s.add_development_dependency 'minitest'
|
24
24
|
s.add_development_dependency 'bundler', '>= 1.3.4'
|
25
|
+
s.add_development_dependency 'ruby-debug'
|
25
26
|
|
26
27
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
28
|
+
# s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
|
27
29
|
end
|
data/test/tests.rb
CHANGED
@@ -114,7 +114,6 @@ class TestExtractor < Minitest::Test
|
|
114
114
|
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
115
115
|
vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
|
116
116
|
|
117
|
-
|
118
117
|
characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
|
119
118
|
#top left bottom right
|
120
119
|
expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.
|
5
|
+
version: 0.6.4
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-
|
14
|
+
date: 2013-07-09 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
@@ -45,6 +45,22 @@ dependencies:
|
|
45
45
|
none: false
|
46
46
|
prerelease: false
|
47
47
|
type: :development
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: ruby-debug
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
none: false
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
none: false
|
62
|
+
prerelease: false
|
63
|
+
type: :development
|
48
64
|
- !ruby/object:Gem::Dependency
|
49
65
|
name: trollop
|
50
66
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -126,12 +142,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
142
|
requirements:
|
127
143
|
- - '>='
|
128
144
|
- !ruby/object:Gem::Version
|
145
|
+
segments:
|
146
|
+
- 0
|
147
|
+
hash: 2
|
129
148
|
version: '0'
|
130
149
|
none: false
|
131
150
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
151
|
requirements:
|
133
152
|
- - '>='
|
134
153
|
- !ruby/object:Gem::Version
|
154
|
+
segments:
|
155
|
+
- 0
|
156
|
+
hash: 2
|
135
157
|
version: '0'
|
136
158
|
none: false
|
137
159
|
requirements: []
|