tabula-extractor 0.0.1-java → 0.5.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,99 @@
1
+ require 'java'
2
+ require 'rbconfig'
3
+
4
+ require 'ffi'
5
+
6
+ require_relative './entities'
7
+ require_relative './pdf_render'
8
+ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
9
+
10
+ java_import javax.imageio.ImageIO
11
+ java_import java.awt.image.BufferedImage
12
+ java_import org.apache.pdfbox.pdmodel.PDDocument
13
+
14
module Tabula
  # FFI bindings to the bundled `liblsd` shared library, plus helpers that
  # hand it a rasterized PDF page (or any image) and turn the detected line
  # segments into Tabula::Ruling objects.
  module LSD
    extend FFI::Library

    # The native code returns a flat array of doubles, 7 per detected
    # segment (8 bytes each). Only the first four values of each record are
    # used here; the remaining three are ignored.
    LINE_RECORD_BYTES = 7 * 8

    # Pick the shared library matching the host platform. The libraries live
    # in the gem's `ext/` directory, two levels above this file.
    lsd_library = case RbConfig::CONFIG['host_os']
                  when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
                    'liblsd.dll'
                  when /darwin|mac os/
                    'liblsd.dylib'
                  when /linux/
                    if RbConfig::CONFIG['target_cpu'] == 'x86_64'
                      'liblsd-linux64.so'
                    else
                      'liblsd-linux32.so'
                    end
                  else
                    raise "unknown os: #{RbConfig::CONFIG['host_os']}"
                  end
    ffi_lib File.expand_path('../../ext/' + lsd_library, File.dirname(__FILE__))

    # lsd(out_line_count, pixels, width, height) -> pointer to the results
    attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
    # releases a result buffer previously returned by `lsd`
    attach_function :free_values, [ :pointer ], :void

    # Renders one page of a PDF and runs line detection on the result.
    #
    # pdf_path     - path of the PDF file to open.
    # page_number  - 1-based page number.
    # scale_factor - multiplier applied to every coordinate of the rulings.
    #
    # Returns an Array of Tabula::Ruling.
    def LSD.detect_lines_in_pdf_page(pdf_path, page_number, scale_factor=1)
      pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
      begin
        bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[page_number - 1])
      ensure
        # close the document even when rendering fails
        pdf_file.close
      end
      detect_lines(bi, scale_factor)
    end

    # Detects line segments in an image.
    #
    # image        - either a String (path to an image file) or a
    #                Java::JavaAwtImage::BufferedImage (subclasses accepted).
    # scale_factor - multiplier applied to every coordinate of the rulings.
    #
    # image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
    #
    # Returns an Array of Tabula::Ruling.
    # Raises ArgumentError when `image` has an unsupported type or the image
    # file cannot be decoded.
    def LSD.detect_lines(image, scale_factor=1)
      bimage = case image
               when Java::JavaAwtImage::BufferedImage
                 image
               when String
                 ImageIO.read(java.io.File.new(image))
               else
                 raise ArgumentError, 'image must be a string or a BufferedImage'
               end
      # ImageIO.read returns null when no registered reader can decode the file
      raise ArgumentError, "could not read image: #{image}" if bimage.nil?

      pixels = LSD.image_to_image_double(bimage)
      lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
      out = lsd(lines_found_ptr, pixels, bimage.getWidth, bimage.getHeight)

      begin
        lines_found = lines_found_ptr.get_int(0)

        (0...lines_found).map do |i|
          # first four doubles of record i, rounded to whole pixels
          x1, y1, x2, y2 = out.get_array_of_double(i * LINE_RECORD_BYTES, 4).map(&:round)

          # NOTE(review): arguments look like (top, left, width, height) — confirm against Tabula::Ruling
          Tabula::Ruling.new(y1 * scale_factor,
                             x1 * scale_factor,
                             (x2 - x1) * scale_factor,
                             (y2 - y1) * scale_factor)
        end
      ensure
        # the result buffer is owned by the native library; always hand it back
        free_values(out)
        bimage.flush
      end
    end

    # Copies the pixels of a BufferedImage into native memory as doubles.
    #
    # The values are the packed ints filled in by BufferedImage#getRGB,
    # converted to doubles as-is.
    # NOTE(review): these are packed color ints, not 0-255 gray levels —
    # confirm this is the input liblsd is expected to receive.
    #
    # This method is public: a bare `private` has no effect on methods
    # defined as `def LSD.name`, and it is called with an explicit receiver.
    #
    # Returns an FFI::MemoryPointer holding width * height doubles.
    def LSD.image_to_image_double(buffered_image)
      width = buffered_image.getWidth
      height = buffered_image.getHeight
      raster_size = width * height

      pixels = Java::int[raster_size].new
      buffered_image.getRGB(0, 0, width, height, pixels, 0, width)

      FFI::MemoryPointer.new(:double, raster_size).put_array_of_double(0, pixels.to_a)
    end

  end
end
96
+
97
# Command-line entry point: print the rulings detected on one page.
# Usage: <script> PDF_PATH PAGE_NUMBER
if $0 == __FILE__
  pdf_path, page_arg = ARGV
  puts Tabula::LSD.detect_lines_in_pdf_page(pdf_path, page_arg.to_i)
end
@@ -3,7 +3,7 @@ require 'observer'
3
3
  require_relative './entities.rb'
4
4
 
5
5
  require 'java'
6
- require File.join(File.dirname(__FILE__), '../../target/pdfbox-app-1.8.0.jar')
6
+ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
7
7
  java_import org.apache.pdfbox.pdfparser.PDFParser
8
8
  java_import org.apache.pdfbox.pdmodel.PDDocument
9
9
  java_import org.apache.pdfbox.util.PDFTextStripper
@@ -44,13 +44,14 @@ module Tabula
44
44
  c = text.getCharacter
45
45
  # probably not the fastest way of detecting printable chars
46
46
  self.characters << text if c =~ PRINTABLE_RE
47
+
47
48
  end
48
49
  end
49
50
 
50
51
  class PagesInfoExtractor
51
52
  def initialize(pdf_filename)
52
53
  raise Errno::ENOENT unless File.exists?(pdf_filename)
53
- @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
54
+ @pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
54
55
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
55
56
  end
56
57
 
@@ -99,13 +100,14 @@ module Tabula
99
100
  page.getRotation.to_i,
100
101
  i+1,
101
102
  @extractor.characters.map { |char|
102
- Tabula::TextElement.new(char.getYDirAdj,
103
- char.getXDirAdj,
104
- char.getWidthDirAdj,
105
- char.getHeightDir,
103
+ Tabula::TextElement.new(char.getYDirAdj.round(2),
104
+ char.getXDirAdj.round(2),
105
+ char.getWidthDirAdj.round(2),
106
+ char.getHeightDir.round(2),
106
107
  nil,
107
- char.getFontSize,
108
- char.getCharacter)
108
+ char.getFontSize.round(2),
109
+ char.getCharacter,
110
+ char.getWidthOfSpace)
109
111
  })
110
112
  end
111
113
  ensure
@@ -0,0 +1,64 @@
1
+ require 'java'
2
+
3
+ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
4
+
5
+ java_import org.apache.pdfbox.pdmodel.PDDocument
6
+ java_import org.apache.pdfbox.pdfviewer.PageDrawer
7
+ java_import java.awt.image.BufferedImage
8
+ java_import javax.imageio.ImageIO
9
+ java_import java.awt.Dimension
10
+ java_import java.awt.Color
11
+
12
module Tabula
  # Rasterizes PDF pages into java.awt.image.BufferedImage objects.
  module Render

    # render a PDF page to a graphics context, but skip rendering the text
    # This is done to reduce 'noise' introduced by the text, we only
    # care about lines.
    class PageDrawerNoText < PageDrawer
      # Intentionally empty: overriding this hook suppresses text drawing.
      def processTextPosition(text)
      end
    end

    TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)

    # Draws a page into a new grayscale image.
    #
    # page            - the PDFBox page to render.
    # width           - width in pixels of the resulting image.
    # pageDrawerClass - drawer class to instantiate; defaults to
    #                   PageDrawerNoText, which leaves text out.
    #
    # The page rotation is normalized to 0...360. Pages rotated by 90 or 270
    # degrees get an image with swapped dimensions, and each rotation gets
    # its own translation so the drawing stays inside the image (this mirrors
    # what PDFBox's own PDPage#convertToImage does).
    #
    # Returns a BufferedImage of type TYPE_BYTE_GRAY.
    def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
      cropbox = page.findCropBox
      widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
      pageDimension = Dimension.new(widthPt, heightPt)

      rotation_degrees = page.findRotation % 360
      sideways = (rotation_degrees == 90 || rotation_degrees == 270)

      # the requested pixel width applies to the image as displayed, so a
      # sideways page is scaled against its height
      scaling = width / (sideways ? heightPt : widthPt)
      widthPx = java.lang.Math.round(widthPt * scaling)
      heightPx = java.lang.Math.round(heightPt * scaling)

      retval = if sideways
                 BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
               else
                 BufferedImage.new(widthPx, heightPx, BufferedImage::TYPE_BYTE_GRAY)
               end

      graphics = retval.getGraphics
      begin
        graphics.setBackground(TRANSPARENT_WHITE)
        graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)

        if rotation_degrees != 0
          # move the origin so the rotated page lands inside the image
          translate_x, translate_y = case rotation_degrees
                                     when 90  then [retval.getWidth, 0]
                                     when 180 then [retval.getWidth, retval.getHeight]
                                     when 270 then [0, retval.getHeight]
                                     else [0, 0]
                                     end
          graphics.translate(translate_x.to_f, translate_y.to_f)
          graphics.rotate(java.lang.Math.toRadians(rotation_degrees))
        end

        graphics.scale(scaling, scaling)
        pageDrawerClass.new.drawPage(graphics, page, pageDimension)
      ensure
        # release the graphics context even when drawing fails
        graphics.dispose
      end

      retval
    end
  end
end
55
+
56
+ # testing
57
# Manual check: render one page without its text and save it as notext.png.
# Usage: <script> PDF_PATH PAGE_NUMBER
if __FILE__ == $0
  pdf_file = PDDocument.loadNonSeq(java.io.File.new(ARGV[0]), nil)
  begin
    bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[ARGV[1].to_i - 1])
    puts bi.class
    ImageIO.write(bi, 'png',
                  java.io.File.new('notext.png'))
  ensure
    # the document was never closed before; release it even on failure
    pdf_file.close
  end
end
64
+
@@ -14,7 +14,6 @@ module Tabula
14
14
  def initialize(text_elements, options = {})
15
15
  self.text_elements = text_elements
16
16
  self.options = DEFAULT_OPTIONS.merge(options)
17
- @merged = false
18
17
  merge_words! if self.options[:merge_words]
19
18
  end
20
19
 
@@ -42,9 +41,9 @@ module Tabula
42
41
  end
43
42
 
44
43
  def get_columns
45
- Tabula.group_by_columns(text_elements).map { |c|
44
+ Tabula.group_by_columns(text_elements).map do |c|
46
45
  {'left' => c.left, 'right' => c.right, 'width' => c.width}
47
- }
46
+ end
48
47
  end
49
48
 
50
49
  def get_line_boundaries
@@ -108,6 +107,7 @@ module Tabula
108
107
  # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
109
108
  if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
110
109
  self.text_elements[current_word_index].text += " "
110
+ self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
111
111
  end
112
112
  current_word_index = i+1
113
113
  end
@@ -135,11 +135,11 @@ module Tabula
135
135
  end
136
136
 
137
137
  def Tabula.lines_to_csv(lines)
138
- CSV.generate { |csv|
139
- lines.each { |l|
138
+ CSV.generate do |csv|
139
+ lines.each do |l|
140
140
  csv << l.map { |c| c.text.strip }
141
- }
142
- }
141
+ end
142
+ end
143
143
  end
144
144
 
145
145
  ONLY_SPACES_RE = Regexp.new('^\s+$')
@@ -154,45 +154,43 @@ module Tabula
154
154
 
155
155
  # find all the text elements
156
156
  # contained within each detected line (table row) boundary
157
- line_boundaries.each { |lb|
157
+ line_boundaries.each do |lb|
158
158
  line = Line.new
159
159
 
160
- line_members = text_elements.find_all { |te|
160
+ line_members = text_elements.find_all do |te|
161
161
  te.vertically_overlaps?(lb)
162
- }
162
+ end
163
163
 
164
164
  text_elements -= line_members
165
165
 
166
- line_members.sort_by(&:left).each { |te|
166
+ line_members.sort_by(&:left).each do |te|
167
167
  # skip text_elements that only contain spaces
168
168
  next if te.text =~ ONLY_SPACES_RE
169
169
  line << te
170
- }
170
+ end
171
171
 
172
172
  lines << line if line.text_elements.size > 0
173
- }
173
+ end
174
174
 
175
175
  lines.sort_by!(&:top)
176
176
 
177
177
  columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
178
178
 
179
179
  # # insert empty cells if needed
180
- lines.each_with_index { |l, line_index|
180
+ lines.each_with_index do |l, line_index|
181
181
  next if l.text_elements.nil?
182
182
  l.text_elements.compact! # TODO WHY do I have to do this?
183
183
  l.text_elements.uniq! # TODO WHY do I have to do this?
184
184
  l.text_elements.sort_by!(&:left)
185
185
 
186
- # l.text_elements = Tabula.merge_words(l.text_elements)
187
-
188
186
  next unless l.text_elements.size < columns.size
189
187
 
190
188
  columns.each_with_index do |c, i|
191
189
  if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
192
- l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
190
+ l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
193
191
  end
194
192
  end
195
- }
193
+ end
196
194
 
197
195
  # # merge elements that are in the same column
198
196
  columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
@@ -230,8 +228,9 @@ module Tabula
230
228
  lines[i+1] = nil
231
229
  end
232
230
  end
233
- lines.compact.map { |line|
231
+
232
+ lines.compact.map do |line|
234
233
  line.text_elements.sort_by(&:left)
235
- }
234
+ end
236
235
  end
237
236
  end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.0.1'
2
+ VERSION = '0.5.0'
3
3
  end
@@ -16,7 +16,7 @@ module Tabula
16
16
 
17
17
  def Writers.TSV(lines, output=$stdout)
18
18
  tsv_string = lines.each { |l|
19
- output.write(l.map(&:text).join("\t") + '\n')
19
+ output.write(l.map(&:text).join("\t") + "\n")
20
20
  }
21
21
  end
22
22
 
@@ -14,13 +14,14 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.platform = 'java'
16
16
 
17
- s.files = `git ls-files`.split("\n")
17
+ shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
18
+ s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
18
19
  s.test_files = `git ls-files -- {test,features}/*`.split("\n")
19
20
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
21
  s.require_paths = ["lib"]
21
22
 
22
23
  s.add_development_dependency 'minitest'
23
- s.add_development_dependency 'bundler', '>= 1.3.5'
24
+ s.add_development_dependency 'bundler', '>= 1.3.4'
24
25
 
25
26
  s.add_runtime_dependency "trollop", ["~> 2.0"]
26
27
  end
data/test/tests.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
+ require 'minitest'
2
3
  require 'minitest/autorun'
3
4
 
4
5
  require_relative '../lib/tabula'
@@ -9,7 +10,7 @@ def lines_to_array(lines)
9
10
  }
10
11
  end
11
12
 
12
- class TestPagesInfoExtractor < MiniTest::Unit::TestCase
13
+ class TestPagesInfoExtractor < Minitest::Test
13
14
  def test_pages_info_extractor
14
15
  extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
15
16
 
@@ -23,7 +24,7 @@ class TestPagesInfoExtractor < MiniTest::Unit::TestCase
23
24
  end
24
25
 
25
26
 
26
- class TestDumper < MiniTest::Unit::TestCase
27
+ class TestDumper < Minitest::Test
27
28
 
28
29
  def test_extractor
29
30
  extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -40,7 +41,7 @@ class TestDumper < MiniTest::Unit::TestCase
40
41
  end
41
42
  end
42
43
 
43
- class TestExtractor < MiniTest::Unit::TestCase
44
+ class TestExtractor < Minitest::Test
44
45
 
45
46
  def test_table_extraction_1
46
47
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -54,7 +55,7 @@ class TestExtractor < MiniTest::Unit::TestCase
54
55
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
55
56
  characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
56
57
 
57
- expected = [["Apellido y Nombre", "Bloque político", "Provincia", ""], ["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
58
+ expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
58
59
 
59
60
  assert_equal expected, lines_to_array(Tabula.make_table(characters))
60
61
  end
@@ -62,9 +63,9 @@ class TestExtractor < MiniTest::Unit::TestCase
62
63
  # TODO Spaces inserted in words - fails
63
64
  def test_bo_page24
64
65
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
65
- characters = character_extractor.extract.next.get_text([435.625, 53.125, 570.7142857142857, 810.5357142857142])
66
+ characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
66
67
 
67
- expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B.MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
68
+ expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
68
69
  assert_equal expected, lines_to_array(Tabula.make_table(characters))
69
70
  end
70
71
 
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.1
5
+ version: 0.5.0
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-09 00:00:00.000000000 Z
12
+ date: 2013-06-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest
@@ -35,13 +35,13 @@ dependencies:
35
35
  requirements:
36
36
  - - ">="
37
37
  - !ruby/object:Gem::Version
38
- version: 1.3.5
38
+ version: 1.3.4
39
39
  none: false
40
40
  requirement: !ruby/object:Gem::Requirement
41
41
  requirements:
42
42
  - - ">="
43
43
  - !ruby/object:Gem::Version
44
- version: 1.3.5
44
+ version: 1.3.4
45
45
  none: false
46
46
  prerelease: false
47
47
  type: :development
@@ -70,6 +70,7 @@ extensions: []
70
70
  extra_rdoc_files: []
71
71
  files:
72
72
  - ".gitignore"
73
+ - ".travis.yml"
73
74
  - AUTHORS.md
74
75
  - Gemfile
75
76
  - LICENSE.md
@@ -77,15 +78,31 @@ files:
77
78
  - README.md
78
79
  - Rakefile
79
80
  - bin/tabula
81
+ - ext/COPYING
82
+ - ext/Makefile.OSX
83
+ - ext/Makefile.defaults
84
+ - ext/Makefile.linux32
85
+ - ext/Makefile.linux64
86
+ - ext/Makefile.mingw
87
+ - ext/liblsd-linux32.so
88
+ - ext/liblsd-linux64.so
89
+ - ext/liblsd.def
90
+ - ext/liblsd.dll
91
+ - ext/liblsd.dylib
92
+ - ext/lsd.c
93
+ - ext/lsd.h
80
94
  - lib/tabula.rb
95
+ - lib/tabula/core_ext.rb
81
96
  - lib/tabula/entities.rb
97
+ - lib/tabula/line_segment_detector.rb
82
98
  - lib/tabula/pdf_dump.rb
99
+ - lib/tabula/pdf_render.rb
83
100
  - lib/tabula/table_extractor.rb
84
101
  - lib/tabula/version.rb
85
102
  - lib/tabula/whitespace.rb
86
103
  - lib/tabula/writers.rb
87
104
  - tabula-extractor.gemspec
88
- - target/pdfbox-app-1.8.0.jar
105
+ - target/pdfbox-app-2.0.0-SNAPSHOT.jar
89
106
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
90
107
  - test/data/argentina_diputados_voting_record.pdf
91
108
  - test/data/bo_page24.pdf