tabula-extractor 0.0.1-java → 0.5.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/Gemfile +0 -3
- data/README.md +19 -2
- data/Rakefile +4 -5
- data/bin/tabula +27 -7
- data/ext/COPYING +661 -0
- data/ext/Makefile.OSX +15 -0
- data/ext/Makefile.defaults +9 -0
- data/ext/Makefile.linux32 +11 -0
- data/ext/Makefile.linux64 +12 -0
- data/ext/Makefile.mingw +10 -0
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +3 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/lsd.c +2270 -0
- data/ext/lsd.h +283 -0
- data/lib/tabula.rb +6 -0
- data/lib/tabula/core_ext.rb +21 -0
- data/lib/tabula/entities.rb +141 -20
- data/lib/tabula/line_segment_detector.rb +99 -0
- data/lib/tabula/pdf_dump.rb +10 -8
- data/lib/tabula/pdf_render.rb +64 -0
- data/lib/tabula/table_extractor.rb +19 -20
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +1 -1
- data/tabula-extractor.gemspec +3 -2
- data/target/{pdfbox-app-1.8.0.jar → pdfbox-app-2.0.0-SNAPSHOT.jar} +0 -0
- data/test/tests.rb +7 -6
- metadata +22 -5
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'java'
|
2
|
+
require 'rbconfig'
|
3
|
+
|
4
|
+
require 'ffi'
|
5
|
+
|
6
|
+
require_relative './entities'
|
7
|
+
require_relative './pdf_render'
|
8
|
+
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
9
|
+
|
10
|
+
java_import javax.imageio.ImageIO
|
11
|
+
java_import java.awt.image.BufferedImage
|
12
|
+
java_import org.apache.pdfbox.pdmodel.PDDocument
|
13
|
+
|
14
|
+
module Tabula
|
15
|
+
module LSD
|
16
|
+
extend FFI::Library
|
17
|
+
ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
|
18
|
+
when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
|
19
|
+
'liblsd.dll'
|
20
|
+
when /darwin|mac os/
|
21
|
+
'liblsd.dylib'
|
22
|
+
when /linux/
|
23
|
+
if RbConfig::CONFIG['target_cpu'] == 'x86_64'
|
24
|
+
'liblsd-linux64.so'
|
25
|
+
else
|
26
|
+
'liblsd-linux32.so'
|
27
|
+
end
|
28
|
+
else
|
29
|
+
raise "unknown os: #{RbConfig::CONFIG['host_os']}"
|
30
|
+
end,
|
31
|
+
File.dirname(__FILE__))
|
32
|
+
|
33
|
+
attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
|
34
|
+
attach_function :free_values, [ :pointer ], :void
|
35
|
+
|
36
|
+
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, scale_factor=1)
|
37
|
+
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
38
|
+
bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[page_number - 1])
|
39
|
+
pdf_file.close
|
40
|
+
detect_lines(bi,scale_factor)
|
41
|
+
end
|
42
|
+
|
43
|
+
# image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
|
44
|
+
# image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
|
45
|
+
def LSD.detect_lines(image, scale_factor=1)
|
46
|
+
bimage = if image.class == Java::JavaAwtImage::BufferedImage
|
47
|
+
image
|
48
|
+
elsif image.class == String
|
49
|
+
ImageIO.read(java.io.File.new(image))
|
50
|
+
else
|
51
|
+
raise ArgumentError, 'image must be a string or a BufferedImage'
|
52
|
+
end
|
53
|
+
image = LSD.image_to_image_double(bimage)
|
54
|
+
|
55
|
+
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
56
|
+
|
57
|
+
out = lsd(lines_found_ptr, image, bimage.getWidth, bimage.getHeight)
|
58
|
+
|
59
|
+
lines_found = lines_found_ptr.get_int
|
60
|
+
|
61
|
+
rv = []
|
62
|
+
lines_found.times do |i|
|
63
|
+
a = out[7*8*i].read_array_of_type(:double, 7)
|
64
|
+
|
65
|
+
a_round = a[0..3].map(&:round)
|
66
|
+
p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
|
67
|
+
|
68
|
+
rv << Tabula::Ruling.new(p1[1] * scale_factor,
|
69
|
+
p1[0] * scale_factor,
|
70
|
+
(p2[0] - p1[0]) * scale_factor,
|
71
|
+
(p2[1] - p1[1]) * scale_factor)
|
72
|
+
end
|
73
|
+
|
74
|
+
free_values(out)
|
75
|
+
bimage.flush
|
76
|
+
bimage.getGraphics.dispose
|
77
|
+
image = nil
|
78
|
+
|
79
|
+
return rv
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
def LSD.image_to_image_double(buffered_image)
|
84
|
+
width = buffered_image.getWidth; height = buffered_image.getHeight
|
85
|
+
raster_size = width * height
|
86
|
+
|
87
|
+
image_double = FFI::MemoryPointer.new(:double, raster_size)
|
88
|
+
pixels = Java::int[width * height].new
|
89
|
+
buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
|
90
|
+
|
91
|
+
image_double.put_array_of_double 0, pixels.to_a
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
if __FILE__ == $0
|
98
|
+
puts Tabula::LSD.detect_lines_in_pdf_page ARGV[0], ARGV[1].to_i
|
99
|
+
end
|
data/lib/tabula/pdf_dump.rb
CHANGED
@@ -3,7 +3,7 @@ require 'observer'
|
|
3
3
|
require_relative './entities.rb'
|
4
4
|
|
5
5
|
require 'java'
|
6
|
-
require File.join(File.dirname(__FILE__), '../../target/pdfbox-app-1.8.0.jar')
|
6
|
+
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
7
7
|
java_import org.apache.pdfbox.pdfparser.PDFParser
|
8
8
|
java_import org.apache.pdfbox.pdmodel.PDDocument
|
9
9
|
java_import org.apache.pdfbox.util.PDFTextStripper
|
@@ -44,13 +44,14 @@ module Tabula
|
|
44
44
|
c = text.getCharacter
|
45
45
|
# probably not the fastest way of detecting printable chars
|
46
46
|
self.characters << text if c =~ PRINTABLE_RE
|
47
|
+
|
47
48
|
end
|
48
49
|
end
|
49
50
|
|
50
51
|
class PagesInfoExtractor
|
51
52
|
def initialize(pdf_filename)
|
52
53
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
53
|
-
@pdf_file = PDDocument.
|
54
|
+
@pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
|
54
55
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
55
56
|
end
|
56
57
|
|
@@ -99,13 +100,14 @@ module Tabula
|
|
99
100
|
page.getRotation.to_i,
|
100
101
|
i+1,
|
101
102
|
@extractor.characters.map { |char|
|
102
|
-
Tabula::TextElement.new(char.getYDirAdj,
|
103
|
-
char.getXDirAdj,
|
104
|
-
char.getWidthDirAdj,
|
105
|
-
char.getHeightDir,
|
103
|
+
Tabula::TextElement.new(char.getYDirAdj.round(2),
|
104
|
+
char.getXDirAdj.round(2),
|
105
|
+
char.getWidthDirAdj.round(2),
|
106
|
+
char.getHeightDir.round(2),
|
106
107
|
nil,
|
107
|
-
char.getFontSize,
|
108
|
-
char.getCharacter
|
108
|
+
char.getFontSize.round(2),
|
109
|
+
char.getCharacter,
|
110
|
+
char.getWidthOfSpace)
|
109
111
|
})
|
110
112
|
end
|
111
113
|
ensure
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'java'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
4
|
+
|
5
|
+
java_import org.apache.pdfbox.pdmodel.PDDocument
|
6
|
+
java_import org.apache.pdfbox.pdfviewer.PageDrawer
|
7
|
+
java_import java.awt.image.BufferedImage
|
8
|
+
java_import javax.imageio.ImageIO
|
9
|
+
java_import java.awt.Dimension
|
10
|
+
java_import java.awt.Color
|
11
|
+
|
12
|
+
module Tabula
|
13
|
+
module Render
|
14
|
+
|
15
|
+
# render a PDF page to a graphics context, but skip rendering the text
|
16
|
+
# This is done to reduce 'noise' introduced by the text, we only
|
17
|
+
# care about lines.
|
18
|
+
class PageDrawerNoText < PageDrawer
|
19
|
+
def processTextPosition(text)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
|
24
|
+
|
25
|
+
def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
|
26
|
+
cropbox = page.findCropBox
|
27
|
+
widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
|
28
|
+
pageDimension = Dimension.new(widthPt, heightPt)
|
29
|
+
rotation = java.lang.Math.toRadians(page.findRotation)
|
30
|
+
|
31
|
+
scaling = width / (rotation == 0 ? widthPt : heightPt)
|
32
|
+
widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
33
|
+
|
34
|
+
retval = if rotation != 0
|
35
|
+
BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
|
36
|
+
else
|
37
|
+
BufferedImage.new(widthPx, heightPx, BufferedImage::TYPE_BYTE_GRAY)
|
38
|
+
end
|
39
|
+
graphics = retval.getGraphics()
|
40
|
+
graphics.setBackground(TRANSPARENT_WHITE)
|
41
|
+
graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
|
42
|
+
if rotation != 0
|
43
|
+
graphics.translate(retval.getWidth, 0.0)
|
44
|
+
graphics.rotate(rotation)
|
45
|
+
end
|
46
|
+
graphics.scale(scaling, scaling)
|
47
|
+
drawer = pageDrawerClass.new()
|
48
|
+
drawer.drawPage(graphics, page, pageDimension)
|
49
|
+
graphics.dispose
|
50
|
+
|
51
|
+
return retval
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
# testing
|
57
|
+
if __FILE__ == $0
|
58
|
+
pdf_file = PDDocument.loadNonSeq(java.io.File.new(ARGV[0]), nil)
|
59
|
+
bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[ARGV[1].to_i - 1])
|
60
|
+
puts bi.class
|
61
|
+
ImageIO.write(bi, 'png',
|
62
|
+
java.io.File.new('notext.png'))
|
63
|
+
end
|
64
|
+
|
@@ -14,7 +14,6 @@ module Tabula
|
|
14
14
|
def initialize(text_elements, options = {})
|
15
15
|
self.text_elements = text_elements
|
16
16
|
self.options = DEFAULT_OPTIONS.merge(options)
|
17
|
-
@merged = false
|
18
17
|
merge_words! if self.options[:merge_words]
|
19
18
|
end
|
20
19
|
|
@@ -42,9 +41,9 @@ module Tabula
|
|
42
41
|
end
|
43
42
|
|
44
43
|
def get_columns
|
45
|
-
Tabula.group_by_columns(text_elements).map
|
44
|
+
Tabula.group_by_columns(text_elements).map do |c|
|
46
45
|
{'left' => c.left, 'right' => c.right, 'width' => c.width}
|
47
|
-
|
46
|
+
end
|
48
47
|
end
|
49
48
|
|
50
49
|
def get_line_boundaries
|
@@ -108,6 +107,7 @@ module Tabula
|
|
108
107
|
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
109
108
|
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
110
109
|
self.text_elements[current_word_index].text += " "
|
110
|
+
self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
111
111
|
end
|
112
112
|
current_word_index = i+1
|
113
113
|
end
|
@@ -135,11 +135,11 @@ module Tabula
|
|
135
135
|
end
|
136
136
|
|
137
137
|
def Tabula.lines_to_csv(lines)
|
138
|
-
CSV.generate
|
139
|
-
lines.each
|
138
|
+
CSV.generate do |csv|
|
139
|
+
lines.each do |l|
|
140
140
|
csv << l.map { |c| c.text.strip }
|
141
|
-
|
142
|
-
|
141
|
+
end
|
142
|
+
end
|
143
143
|
end
|
144
144
|
|
145
145
|
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
@@ -154,45 +154,43 @@ module Tabula
|
|
154
154
|
|
155
155
|
# find all the text elements
|
156
156
|
# contained within each detected line (table row) boundary
|
157
|
-
line_boundaries.each
|
157
|
+
line_boundaries.each do |lb|
|
158
158
|
line = Line.new
|
159
159
|
|
160
|
-
line_members = text_elements.find_all
|
160
|
+
line_members = text_elements.find_all do |te|
|
161
161
|
te.vertically_overlaps?(lb)
|
162
|
-
|
162
|
+
end
|
163
163
|
|
164
164
|
text_elements -= line_members
|
165
165
|
|
166
|
-
line_members.sort_by(&:left).each
|
166
|
+
line_members.sort_by(&:left).each do |te|
|
167
167
|
# skip text_elements that only contain spaces
|
168
168
|
next if te.text =~ ONLY_SPACES_RE
|
169
169
|
line << te
|
170
|
-
|
170
|
+
end
|
171
171
|
|
172
172
|
lines << line if line.text_elements.size > 0
|
173
|
-
|
173
|
+
end
|
174
174
|
|
175
175
|
lines.sort_by!(&:top)
|
176
176
|
|
177
177
|
columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
|
178
178
|
|
179
179
|
# # insert empty cells if needed
|
180
|
-
lines.each_with_index
|
180
|
+
lines.each_with_index do |l, line_index|
|
181
181
|
next if l.text_elements.nil?
|
182
182
|
l.text_elements.compact! # TODO WHY do I have to do this?
|
183
183
|
l.text_elements.uniq! # TODO WHY do I have to do this?
|
184
184
|
l.text_elements.sort_by!(&:left)
|
185
185
|
|
186
|
-
# l.text_elements = Tabula.merge_words(l.text_elements)
|
187
|
-
|
188
186
|
next unless l.text_elements.size < columns.size
|
189
187
|
|
190
188
|
columns.each_with_index do |c, i|
|
191
189
|
if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
|
192
|
-
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
|
190
|
+
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
|
193
191
|
end
|
194
192
|
end
|
195
|
-
|
193
|
+
end
|
196
194
|
|
197
195
|
# # merge elements that are in the same column
|
198
196
|
columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
|
@@ -230,8 +228,9 @@ module Tabula
|
|
230
228
|
lines[i+1] = nil
|
231
229
|
end
|
232
230
|
end
|
233
|
-
|
231
|
+
|
232
|
+
lines.compact.map do |line|
|
234
233
|
line.text_elements.sort_by(&:left)
|
235
|
-
|
234
|
+
end
|
236
235
|
end
|
237
236
|
end
|
data/lib/tabula/version.rb
CHANGED
data/lib/tabula/writers.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -14,13 +14,14 @@ Gem::Specification.new do |s|
|
|
14
14
|
|
15
15
|
s.platform = 'java'
|
16
16
|
|
17
|
-
|
17
|
+
shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
|
18
|
+
s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
|
18
19
|
s.test_files = `git ls-files -- {test,features}/*`.split("\n")
|
19
20
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
21
|
s.require_paths = ["lib"]
|
21
22
|
|
22
23
|
s.add_development_dependency 'minitest'
|
23
|
-
s.add_development_dependency 'bundler', '>= 1.3.
|
24
|
+
s.add_development_dependency 'bundler', '>= 1.3.4'
|
24
25
|
|
25
26
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
26
27
|
end
|
Binary file
|
data/test/tests.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
+
require 'minitest'
|
2
3
|
require 'minitest/autorun'
|
3
4
|
|
4
5
|
require_relative '../lib/tabula'
|
@@ -9,7 +10,7 @@ def lines_to_array(lines)
|
|
9
10
|
}
|
10
11
|
end
|
11
12
|
|
12
|
-
class TestPagesInfoExtractor < MiniTest::Unit::TestCase
|
13
|
+
class TestPagesInfoExtractor < Minitest::Test
|
13
14
|
def test_pages_info_extractor
|
14
15
|
extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
15
16
|
|
@@ -23,7 +24,7 @@ class TestPagesInfoExtractor < MiniTest::Unit::TestCase
|
|
23
24
|
end
|
24
25
|
|
25
26
|
|
26
|
-
class TestDumper < MiniTest::Unit::TestCase
|
27
|
+
class TestDumper < Minitest::Test
|
27
28
|
|
28
29
|
def test_extractor
|
29
30
|
extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
@@ -40,7 +41,7 @@ class TestDumper < MiniTest::Unit::TestCase
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
43
|
-
class TestExtractor < MiniTest::Unit::TestCase
|
44
|
+
class TestExtractor < Minitest::Test
|
44
45
|
|
45
46
|
def test_table_extraction_1
|
46
47
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
@@ -54,7 +55,7 @@ class TestExtractor < MiniTest::Unit::TestCase
|
|
54
55
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
|
55
56
|
characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
|
56
57
|
|
57
|
-
expected = [["
|
58
|
+
expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
|
58
59
|
|
59
60
|
assert_equal expected, lines_to_array(Tabula.make_table(characters))
|
60
61
|
end
|
@@ -62,9 +63,9 @@ class TestExtractor < MiniTest::Unit::TestCase
|
|
62
63
|
# TODO Spaces inserted in words - fails
|
63
64
|
def test_bo_page24
|
64
65
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
|
65
|
-
characters = character_extractor.extract.next.get_text([435.625, 53.125,
|
66
|
+
characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
|
66
67
|
|
67
|
-
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B.MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
68
|
+
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
68
69
|
assert_equal expected, lines_to_array(Tabula.make_table(characters))
|
69
70
|
end
|
70
71
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.1
|
5
|
+
version: 0.5.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-06-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|
@@ -35,13 +35,13 @@ dependencies:
|
|
35
35
|
requirements:
|
36
36
|
- - ">="
|
37
37
|
- !ruby/object:Gem::Version
|
38
|
-
version: 1.3.
|
38
|
+
version: 1.3.4
|
39
39
|
none: false
|
40
40
|
requirement: !ruby/object:Gem::Requirement
|
41
41
|
requirements:
|
42
42
|
- - ">="
|
43
43
|
- !ruby/object:Gem::Version
|
44
|
-
version: 1.3.
|
44
|
+
version: 1.3.4
|
45
45
|
none: false
|
46
46
|
prerelease: false
|
47
47
|
type: :development
|
@@ -70,6 +70,7 @@ extensions: []
|
|
70
70
|
extra_rdoc_files: []
|
71
71
|
files:
|
72
72
|
- ".gitignore"
|
73
|
+
- ".travis.yml"
|
73
74
|
- AUTHORS.md
|
74
75
|
- Gemfile
|
75
76
|
- LICENSE.md
|
@@ -77,15 +78,31 @@ files:
|
|
77
78
|
- README.md
|
78
79
|
- Rakefile
|
79
80
|
- bin/tabula
|
81
|
+
- ext/COPYING
|
82
|
+
- ext/Makefile.OSX
|
83
|
+
- ext/Makefile.defaults
|
84
|
+
- ext/Makefile.linux32
|
85
|
+
- ext/Makefile.linux64
|
86
|
+
- ext/Makefile.mingw
|
87
|
+
- ext/liblsd-linux32.so
|
88
|
+
- ext/liblsd-linux64.so
|
89
|
+
- ext/liblsd.def
|
90
|
+
- ext/liblsd.dll
|
91
|
+
- ext/liblsd.dylib
|
92
|
+
- ext/lsd.c
|
93
|
+
- ext/lsd.h
|
80
94
|
- lib/tabula.rb
|
95
|
+
- lib/tabula/core_ext.rb
|
81
96
|
- lib/tabula/entities.rb
|
97
|
+
- lib/tabula/line_segment_detector.rb
|
82
98
|
- lib/tabula/pdf_dump.rb
|
99
|
+
- lib/tabula/pdf_render.rb
|
83
100
|
- lib/tabula/table_extractor.rb
|
84
101
|
- lib/tabula/version.rb
|
85
102
|
- lib/tabula/whitespace.rb
|
86
103
|
- lib/tabula/writers.rb
|
87
104
|
- tabula-extractor.gemspec
|
88
|
-
- target/pdfbox-app-1.8.0.jar
|
105
|
+
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
89
106
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
90
107
|
- test/data/argentina_diputados_voting_record.pdf
|
91
108
|
- test/data/bo_page24.pdf
|