tabula-extractor 0.0.1-java → 0.5.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/Gemfile +0 -3
- data/README.md +19 -2
- data/Rakefile +4 -5
- data/bin/tabula +27 -7
- data/ext/COPYING +661 -0
- data/ext/Makefile.OSX +15 -0
- data/ext/Makefile.defaults +9 -0
- data/ext/Makefile.linux32 +11 -0
- data/ext/Makefile.linux64 +12 -0
- data/ext/Makefile.mingw +10 -0
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +3 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/lsd.c +2270 -0
- data/ext/lsd.h +283 -0
- data/lib/tabula.rb +6 -0
- data/lib/tabula/core_ext.rb +21 -0
- data/lib/tabula/entities.rb +141 -20
- data/lib/tabula/line_segment_detector.rb +99 -0
- data/lib/tabula/pdf_dump.rb +10 -8
- data/lib/tabula/pdf_render.rb +64 -0
- data/lib/tabula/table_extractor.rb +19 -20
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +1 -1
- data/tabula-extractor.gemspec +3 -2
- data/target/{pdfbox-app-1.8.0.jar → pdfbox-app-2.0.0-SNAPSHOT.jar} +0 -0
- data/test/tests.rb +7 -6
- metadata +22 -5
data/lib/tabula/line_segment_detector.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'java'
|
2
|
+
require 'rbconfig'
|
3
|
+
|
4
|
+
require 'ffi'
|
5
|
+
|
6
|
+
require_relative './entities'
|
7
|
+
require_relative './pdf_render'
|
8
|
+
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
9
|
+
|
10
|
+
java_import javax.imageio.ImageIO
|
11
|
+
java_import java.awt.image.BufferedImage
|
12
|
+
java_import org.apache.pdfbox.pdmodel.PDDocument
|
13
|
+
|
14
|
+
module Tabula
|
15
|
+
module LSD
|
16
|
+
extend FFI::Library
|
17
|
+
ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
|
18
|
+
when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
|
19
|
+
'liblsd.dll'
|
20
|
+
when /darwin|mac os/
|
21
|
+
'liblsd.dylib'
|
22
|
+
when /linux/
|
23
|
+
if RbConfig::CONFIG['target_cpu'] == 'x86_64'
|
24
|
+
'liblsd-linux64.so'
|
25
|
+
else
|
26
|
+
'liblsd-linux32.so'
|
27
|
+
end
|
28
|
+
else
|
29
|
+
raise "unknown os: #{RbConfig::CONFIG['host_os']}"
|
30
|
+
end,
|
31
|
+
File.dirname(__FILE__))
|
32
|
+
|
33
|
+
attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
|
34
|
+
attach_function :free_values, [ :pointer ], :void
|
35
|
+
|
36
|
+
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, scale_factor=1)
|
37
|
+
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
38
|
+
bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[page_number - 1])
|
39
|
+
pdf_file.close
|
40
|
+
detect_lines(bi,scale_factor)
|
41
|
+
end
|
42
|
+
|
43
|
+
# image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
|
44
|
+
# image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
|
45
|
+
def LSD.detect_lines(image, scale_factor=1)
|
46
|
+
bimage = if image.class == Java::JavaAwtImage::BufferedImage
|
47
|
+
image
|
48
|
+
elsif image.class == String
|
49
|
+
ImageIO.read(java.io.File.new(image))
|
50
|
+
else
|
51
|
+
raise ArgumentError, 'image must be a string or a BufferedImage'
|
52
|
+
end
|
53
|
+
image = LSD.image_to_image_double(bimage)
|
54
|
+
|
55
|
+
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
56
|
+
|
57
|
+
out = lsd(lines_found_ptr, image, bimage.getWidth, bimage.getHeight)
|
58
|
+
|
59
|
+
lines_found = lines_found_ptr.get_int
|
60
|
+
|
61
|
+
rv = []
|
62
|
+
lines_found.times do |i|
|
63
|
+
a = out[7*8*i].read_array_of_type(:double, 7)
|
64
|
+
|
65
|
+
a_round = a[0..3].map(&:round)
|
66
|
+
p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
|
67
|
+
|
68
|
+
rv << Tabula::Ruling.new(p1[1] * scale_factor,
|
69
|
+
p1[0] * scale_factor,
|
70
|
+
(p2[0] - p1[0]) * scale_factor,
|
71
|
+
(p2[1] - p1[1]) * scale_factor)
|
72
|
+
end
|
73
|
+
|
74
|
+
free_values(out)
|
75
|
+
bimage.flush
|
76
|
+
bimage.getGraphics.dispose
|
77
|
+
image = nil
|
78
|
+
|
79
|
+
return rv
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
def LSD.image_to_image_double(buffered_image)
|
84
|
+
width = buffered_image.getWidth; height = buffered_image.getHeight
|
85
|
+
raster_size = width * height
|
86
|
+
|
87
|
+
image_double = FFI::MemoryPointer.new(:double, raster_size)
|
88
|
+
pixels = Java::int[width * height].new
|
89
|
+
buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
|
90
|
+
|
91
|
+
image_double.put_array_of_double 0, pixels.to_a
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
if __FILE__ == $0
|
98
|
+
puts Tabula::LSD.detect_lines_in_pdf_page ARGV[0], ARGV[1].to_i
|
99
|
+
end
|
data/lib/tabula/pdf_dump.rb
CHANGED
@@ -3,7 +3,7 @@ require 'observer'
|
|
3
3
|
require_relative './entities.rb'
|
4
4
|
|
5
5
|
require 'java'
|
6
|
-
require File.join(File.dirname(__FILE__), '../../target/pdfbox-app-1.8.0.jar')
|
6
|
+
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
7
7
|
java_import org.apache.pdfbox.pdfparser.PDFParser
|
8
8
|
java_import org.apache.pdfbox.pdmodel.PDDocument
|
9
9
|
java_import org.apache.pdfbox.util.PDFTextStripper
|
@@ -44,13 +44,14 @@ module Tabula
|
|
44
44
|
c = text.getCharacter
|
45
45
|
# probably not the fastest way of detecting printable chars
|
46
46
|
self.characters << text if c =~ PRINTABLE_RE
|
47
|
+
|
47
48
|
end
|
48
49
|
end
|
49
50
|
|
50
51
|
class PagesInfoExtractor
|
51
52
|
def initialize(pdf_filename)
|
52
53
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
53
|
-
@pdf_file = PDDocument.
|
54
|
+
@pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
|
54
55
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
55
56
|
end
|
56
57
|
|
@@ -99,13 +100,14 @@ module Tabula
|
|
99
100
|
page.getRotation.to_i,
|
100
101
|
i+1,
|
101
102
|
@extractor.characters.map { |char|
|
102
|
-
Tabula::TextElement.new(char.getYDirAdj,
|
103
|
-
char.getXDirAdj,
|
104
|
-
char.getWidthDirAdj,
|
105
|
-
char.getHeightDir,
|
103
|
+
Tabula::TextElement.new(char.getYDirAdj.round(2),
|
104
|
+
char.getXDirAdj.round(2),
|
105
|
+
char.getWidthDirAdj.round(2),
|
106
|
+
char.getHeightDir.round(2),
|
106
107
|
nil,
|
107
|
-
char.getFontSize,
|
108
|
-
char.getCharacter
|
108
|
+
char.getFontSize.round(2),
|
109
|
+
char.getCharacter,
|
110
|
+
char.getWidthOfSpace)
|
109
111
|
})
|
110
112
|
end
|
111
113
|
ensure
|
data/lib/tabula/pdf_render.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'java'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
4
|
+
|
5
|
+
java_import org.apache.pdfbox.pdmodel.PDDocument
|
6
|
+
java_import org.apache.pdfbox.pdfviewer.PageDrawer
|
7
|
+
java_import java.awt.image.BufferedImage
|
8
|
+
java_import javax.imageio.ImageIO
|
9
|
+
java_import java.awt.Dimension
|
10
|
+
java_import java.awt.Color
|
11
|
+
|
12
|
+
module Tabula
|
13
|
+
module Render
|
14
|
+
|
15
|
+
# render a PDF page to a graphics context, but skip rendering the text
|
16
|
+
# This is done to reduce 'noise' introduced by the text, we only
|
17
|
+
# care about lines.
|
18
|
+
class PageDrawerNoText < PageDrawer
|
19
|
+
def processTextPosition(text)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
|
24
|
+
|
25
|
+
def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
|
26
|
+
cropbox = page.findCropBox
|
27
|
+
widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
|
28
|
+
pageDimension = Dimension.new(widthPt, heightPt)
|
29
|
+
rotation = java.lang.Math.toRadians(page.findRotation)
|
30
|
+
|
31
|
+
scaling = width / (rotation == 0 ? widthPt : heightPt)
|
32
|
+
widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
33
|
+
|
34
|
+
retval = if rotation != 0
|
35
|
+
BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
|
36
|
+
else
|
37
|
+
BufferedImage.new(widthPx, heightPx, BufferedImage::TYPE_BYTE_GRAY)
|
38
|
+
end
|
39
|
+
graphics = retval.getGraphics()
|
40
|
+
graphics.setBackground(TRANSPARENT_WHITE)
|
41
|
+
graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
|
42
|
+
if rotation != 0
|
43
|
+
graphics.translate(retval.getWidth, 0.0)
|
44
|
+
graphics.rotate(rotation)
|
45
|
+
end
|
46
|
+
graphics.scale(scaling, scaling)
|
47
|
+
drawer = pageDrawerClass.new()
|
48
|
+
drawer.drawPage(graphics, page, pageDimension)
|
49
|
+
graphics.dispose
|
50
|
+
|
51
|
+
return retval
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
# testing
|
57
|
+
if __FILE__ == $0
|
58
|
+
pdf_file = PDDocument.loadNonSeq(java.io.File.new(ARGV[0]), nil)
|
59
|
+
bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[ARGV[1].to_i - 1])
|
60
|
+
puts bi.class
|
61
|
+
ImageIO.write(bi, 'png',
|
62
|
+
java.io.File.new('notext.png'))
|
63
|
+
end
|
64
|
+
|
data/lib/tabula/table_extractor.rb
CHANGED
@@ -14,7 +14,6 @@ module Tabula
|
|
14
14
|
def initialize(text_elements, options = {})
|
15
15
|
self.text_elements = text_elements
|
16
16
|
self.options = DEFAULT_OPTIONS.merge(options)
|
17
|
-
@merged = false
|
18
17
|
merge_words! if self.options[:merge_words]
|
19
18
|
end
|
20
19
|
|
@@ -42,9 +41,9 @@ module Tabula
|
|
42
41
|
end
|
43
42
|
|
44
43
|
def get_columns
|
45
|
-
Tabula.group_by_columns(text_elements).map
|
44
|
+
Tabula.group_by_columns(text_elements).map do |c|
|
46
45
|
{'left' => c.left, 'right' => c.right, 'width' => c.width}
|
47
|
-
|
46
|
+
end
|
48
47
|
end
|
49
48
|
|
50
49
|
def get_line_boundaries
|
@@ -108,6 +107,7 @@ module Tabula
|
|
108
107
|
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
109
108
|
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
110
109
|
self.text_elements[current_word_index].text += " "
|
110
|
+
self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
111
111
|
end
|
112
112
|
current_word_index = i+1
|
113
113
|
end
|
@@ -135,11 +135,11 @@ module Tabula
|
|
135
135
|
end
|
136
136
|
|
137
137
|
def Tabula.lines_to_csv(lines)
|
138
|
-
CSV.generate
|
139
|
-
lines.each
|
138
|
+
CSV.generate do |csv|
|
139
|
+
lines.each do |l|
|
140
140
|
csv << l.map { |c| c.text.strip }
|
141
|
-
|
142
|
-
|
141
|
+
end
|
142
|
+
end
|
143
143
|
end
|
144
144
|
|
145
145
|
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
@@ -154,45 +154,43 @@ module Tabula
|
|
154
154
|
|
155
155
|
# find all the text elements
|
156
156
|
# contained within each detected line (table row) boundary
|
157
|
-
line_boundaries.each
|
157
|
+
line_boundaries.each do |lb|
|
158
158
|
line = Line.new
|
159
159
|
|
160
|
-
line_members = text_elements.find_all
|
160
|
+
line_members = text_elements.find_all do |te|
|
161
161
|
te.vertically_overlaps?(lb)
|
162
|
-
|
162
|
+
end
|
163
163
|
|
164
164
|
text_elements -= line_members
|
165
165
|
|
166
|
-
line_members.sort_by(&:left).each
|
166
|
+
line_members.sort_by(&:left).each do |te|
|
167
167
|
# skip text_elements that only contain spaces
|
168
168
|
next if te.text =~ ONLY_SPACES_RE
|
169
169
|
line << te
|
170
|
-
|
170
|
+
end
|
171
171
|
|
172
172
|
lines << line if line.text_elements.size > 0
|
173
|
-
|
173
|
+
end
|
174
174
|
|
175
175
|
lines.sort_by!(&:top)
|
176
176
|
|
177
177
|
columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
|
178
178
|
|
179
179
|
# # insert empty cells if needed
|
180
|
-
lines.each_with_index
|
180
|
+
lines.each_with_index do |l, line_index|
|
181
181
|
next if l.text_elements.nil?
|
182
182
|
l.text_elements.compact! # TODO WHY do I have to do this?
|
183
183
|
l.text_elements.uniq! # TODO WHY do I have to do this?
|
184
184
|
l.text_elements.sort_by!(&:left)
|
185
185
|
|
186
|
-
# l.text_elements = Tabula.merge_words(l.text_elements)
|
187
|
-
|
188
186
|
next unless l.text_elements.size < columns.size
|
189
187
|
|
190
188
|
columns.each_with_index do |c, i|
|
191
189
|
if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
|
192
|
-
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
|
190
|
+
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
|
193
191
|
end
|
194
192
|
end
|
195
|
-
|
193
|
+
end
|
196
194
|
|
197
195
|
# # merge elements that are in the same column
|
198
196
|
columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
|
@@ -230,8 +228,9 @@ module Tabula
|
|
230
228
|
lines[i+1] = nil
|
231
229
|
end
|
232
230
|
end
|
233
|
-
|
231
|
+
|
232
|
+
lines.compact.map do |line|
|
234
233
|
line.text_elements.sort_by(&:left)
|
235
|
-
|
234
|
+
end
|
236
235
|
end
|
237
236
|
end
|
data/lib/tabula/version.rb
CHANGED
data/lib/tabula/writers.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -14,13 +14,14 @@ Gem::Specification.new do |s|
|
|
14
14
|
|
15
15
|
s.platform = 'java'
|
16
16
|
|
17
|
-
|
17
|
+
shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
|
18
|
+
s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
|
18
19
|
s.test_files = `git ls-files -- {test,features}/*`.split("\n")
|
19
20
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
21
|
s.require_paths = ["lib"]
|
21
22
|
|
22
23
|
s.add_development_dependency 'minitest'
|
23
|
-
s.add_development_dependency 'bundler', '>= 1.3.
|
24
|
+
s.add_development_dependency 'bundler', '>= 1.3.4'
|
24
25
|
|
25
26
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
26
27
|
end
|
Binary file
|
data/test/tests.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
+
require 'minitest'
|
2
3
|
require 'minitest/autorun'
|
3
4
|
|
4
5
|
require_relative '../lib/tabula'
|
@@ -9,7 +10,7 @@ def lines_to_array(lines)
|
|
9
10
|
}
|
10
11
|
end
|
11
12
|
|
12
|
-
class TestPagesInfoExtractor < MiniTest::Unit::TestCase
|
13
|
+
class TestPagesInfoExtractor < Minitest::Test
|
13
14
|
def test_pages_info_extractor
|
14
15
|
extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
15
16
|
|
@@ -23,7 +24,7 @@ class TestPagesInfoExtractor < MiniTest::Unit::TestCase
|
|
23
24
|
end
|
24
25
|
|
25
26
|
|
26
|
-
class TestDumper < MiniTest::Unit::TestCase
|
27
|
+
class TestDumper < Minitest::Test
|
27
28
|
|
28
29
|
def test_extractor
|
29
30
|
extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
@@ -40,7 +41,7 @@ class TestDumper < MiniTest::Unit::TestCase
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
43
|
-
class TestExtractor < MiniTest::Unit::TestCase
|
44
|
+
class TestExtractor < Minitest::Test
|
44
45
|
|
45
46
|
def test_table_extraction_1
|
46
47
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
@@ -54,7 +55,7 @@ class TestExtractor < MiniTest::Unit::TestCase
|
|
54
55
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
|
55
56
|
characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
|
56
57
|
|
57
|
-
expected = [["
|
58
|
+
expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
|
58
59
|
|
59
60
|
assert_equal expected, lines_to_array(Tabula.make_table(characters))
|
60
61
|
end
|
@@ -62,9 +63,9 @@ class TestExtractor < MiniTest::Unit::TestCase
|
|
62
63
|
# TODO Spaces inserted in words - fails
|
63
64
|
def test_bo_page24
|
64
65
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
|
65
|
-
characters = character_extractor.extract.next.get_text([435.625, 53.125,
|
66
|
+
characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
|
66
67
|
|
67
|
-
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B.MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
68
|
+
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
68
69
|
assert_equal expected, lines_to_array(Tabula.make_table(characters))
|
69
70
|
end
|
70
71
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.1
|
5
|
+
version: 0.5.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-06-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|
@@ -35,13 +35,13 @@ dependencies:
|
|
35
35
|
requirements:
|
36
36
|
- - ">="
|
37
37
|
- !ruby/object:Gem::Version
|
38
|
-
version: 1.3.
|
38
|
+
version: 1.3.4
|
39
39
|
none: false
|
40
40
|
requirement: !ruby/object:Gem::Requirement
|
41
41
|
requirements:
|
42
42
|
- - ">="
|
43
43
|
- !ruby/object:Gem::Version
|
44
|
-
version: 1.3.
|
44
|
+
version: 1.3.4
|
45
45
|
none: false
|
46
46
|
prerelease: false
|
47
47
|
type: :development
|
@@ -70,6 +70,7 @@ extensions: []
|
|
70
70
|
extra_rdoc_files: []
|
71
71
|
files:
|
72
72
|
- ".gitignore"
|
73
|
+
- ".travis.yml"
|
73
74
|
- AUTHORS.md
|
74
75
|
- Gemfile
|
75
76
|
- LICENSE.md
|
@@ -77,15 +78,31 @@ files:
|
|
77
78
|
- README.md
|
78
79
|
- Rakefile
|
79
80
|
- bin/tabula
|
81
|
+
- ext/COPYING
|
82
|
+
- ext/Makefile.OSX
|
83
|
+
- ext/Makefile.defaults
|
84
|
+
- ext/Makefile.linux32
|
85
|
+
- ext/Makefile.linux64
|
86
|
+
- ext/Makefile.mingw
|
87
|
+
- ext/liblsd-linux32.so
|
88
|
+
- ext/liblsd-linux64.so
|
89
|
+
- ext/liblsd.def
|
90
|
+
- ext/liblsd.dll
|
91
|
+
- ext/liblsd.dylib
|
92
|
+
- ext/lsd.c
|
93
|
+
- ext/lsd.h
|
80
94
|
- lib/tabula.rb
|
95
|
+
- lib/tabula/core_ext.rb
|
81
96
|
- lib/tabula/entities.rb
|
97
|
+
- lib/tabula/line_segment_detector.rb
|
82
98
|
- lib/tabula/pdf_dump.rb
|
99
|
+
- lib/tabula/pdf_render.rb
|
83
100
|
- lib/tabula/table_extractor.rb
|
84
101
|
- lib/tabula/version.rb
|
85
102
|
- lib/tabula/whitespace.rb
|
86
103
|
- lib/tabula/writers.rb
|
87
104
|
- tabula-extractor.gemspec
|
88
|
-
- target/pdfbox-app-1.8.0.jar
|
105
|
+
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
89
106
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
90
107
|
- test/data/argentina_diputados_voting_record.pdf
|
91
108
|
- test/data/bo_page24.pdf
|