tabula-extractor 0.0.1-java → 0.5.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
+ require 'java'
2
+ require 'rbconfig'
3
+
4
+ require 'ffi'
5
+
6
+ require_relative './entities'
7
+ require_relative './pdf_render'
8
+ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
9
+
10
+ java_import javax.imageio.ImageIO
11
+ java_import java.awt.image.BufferedImage
12
+ java_import org.apache.pdfbox.pdmodel.PDDocument
13
+
14
+ module Tabula
15
+ module LSD
16
+ extend FFI::Library
17
+ ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
18
+ when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
19
+ 'liblsd.dll'
20
+ when /darwin|mac os/
21
+ 'liblsd.dylib'
22
+ when /linux/
23
+ if RbConfig::CONFIG['target_cpu'] == 'x86_64'
24
+ 'liblsd-linux64.so'
25
+ else
26
+ 'liblsd-linux32.so'
27
+ end
28
+ else
29
+ raise "unknown os: #{RbConfig::CONFIG['host_os']}"
30
+ end,
31
+ File.dirname(__FILE__))
32
+
33
+ attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
34
+ attach_function :free_values, [ :pointer ], :void
35
+
36
+ def LSD.detect_lines_in_pdf_page(pdf_path, page_number, scale_factor=1)
37
+ pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
38
+ bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[page_number - 1])
39
+ pdf_file.close
40
+ detect_lines(bi,scale_factor)
41
+ end
42
+
43
+ # image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
44
+ # image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
45
+ def LSD.detect_lines(image, scale_factor=1)
46
+ bimage = if image.class == Java::JavaAwtImage::BufferedImage
47
+ image
48
+ elsif image.class == String
49
+ ImageIO.read(java.io.File.new(image))
50
+ else
51
+ raise ArgumentError, 'image must be a string or a BufferedImage'
52
+ end
53
+ image = LSD.image_to_image_double(bimage)
54
+
55
+ lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
56
+
57
+ out = lsd(lines_found_ptr, image, bimage.getWidth, bimage.getHeight)
58
+
59
+ lines_found = lines_found_ptr.get_int
60
+
61
+ rv = []
62
+ lines_found.times do |i|
63
+ a = out[7*8*i].read_array_of_type(:double, 7)
64
+
65
+ a_round = a[0..3].map(&:round)
66
+ p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
67
+
68
+ rv << Tabula::Ruling.new(p1[1] * scale_factor,
69
+ p1[0] * scale_factor,
70
+ (p2[0] - p1[0]) * scale_factor,
71
+ (p2[1] - p1[1]) * scale_factor)
72
+ end
73
+
74
+ free_values(out)
75
+ bimage.flush
76
+ bimage.getGraphics.dispose
77
+ image = nil
78
+
79
+ return rv
80
+ end
81
+
82
+ private
83
+ def LSD.image_to_image_double(buffered_image)
84
+ width = buffered_image.getWidth; height = buffered_image.getHeight
85
+ raster_size = width * height
86
+
87
+ image_double = FFI::MemoryPointer.new(:double, raster_size)
88
+ pixels = Java::int[width * height].new
89
+ buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
90
+
91
+ image_double.put_array_of_double 0, pixels.to_a
92
+ end
93
+
94
+ end
95
+ end
96
+
97
+ if __FILE__ == $0
98
+ puts Tabula::LSD.detect_lines_in_pdf_page ARGV[0], ARGV[1].to_i
99
+ end
@@ -3,7 +3,7 @@ require 'observer'
3
3
  require_relative './entities.rb'
4
4
 
5
5
  require 'java'
6
- require File.join(File.dirname(__FILE__), '../../target/pdfbox-app-1.8.0.jar')
6
+ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
7
7
  java_import org.apache.pdfbox.pdfparser.PDFParser
8
8
  java_import org.apache.pdfbox.pdmodel.PDDocument
9
9
  java_import org.apache.pdfbox.util.PDFTextStripper
@@ -44,13 +44,14 @@ module Tabula
44
44
  c = text.getCharacter
45
45
  # probably not the fastest way of detecting printable chars
46
46
  self.characters << text if c =~ PRINTABLE_RE
47
+
47
48
  end
48
49
  end
49
50
 
50
51
  class PagesInfoExtractor
51
52
  def initialize(pdf_filename)
52
53
  raise Errno::ENOENT unless File.exists?(pdf_filename)
53
- @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
54
+ @pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
54
55
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
55
56
  end
56
57
 
@@ -99,13 +100,14 @@ module Tabula
99
100
  page.getRotation.to_i,
100
101
  i+1,
101
102
  @extractor.characters.map { |char|
102
- Tabula::TextElement.new(char.getYDirAdj,
103
- char.getXDirAdj,
104
- char.getWidthDirAdj,
105
- char.getHeightDir,
103
+ Tabula::TextElement.new(char.getYDirAdj.round(2),
104
+ char.getXDirAdj.round(2),
105
+ char.getWidthDirAdj.round(2),
106
+ char.getHeightDir.round(2),
106
107
  nil,
107
- char.getFontSize,
108
- char.getCharacter)
108
+ char.getFontSize.round(2),
109
+ char.getCharacter,
110
+ char.getWidthOfSpace)
109
111
  })
110
112
  end
111
113
  ensure
@@ -0,0 +1,64 @@
1
+ require 'java'
2
+
3
+ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
4
+
5
+ java_import org.apache.pdfbox.pdmodel.PDDocument
6
+ java_import org.apache.pdfbox.pdfviewer.PageDrawer
7
+ java_import java.awt.image.BufferedImage
8
+ java_import javax.imageio.ImageIO
9
+ java_import java.awt.Dimension
10
+ java_import java.awt.Color
11
+
12
+ module Tabula
13
+ module Render
14
+
15
+ # render a PDF page to a graphics context, but skip rendering the text
16
+ # This is done to reduce 'noise' introduced by the text, we only
17
+ # care about lines.
18
+ class PageDrawerNoText < PageDrawer
19
+ def processTextPosition(text)
20
+ end
21
+ end
22
+
23
+ TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
24
+
25
+ def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
26
+ cropbox = page.findCropBox
27
+ widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
28
+ pageDimension = Dimension.new(widthPt, heightPt)
29
+ rotation = java.lang.Math.toRadians(page.findRotation)
30
+
31
+ scaling = width / (rotation == 0 ? widthPt : heightPt)
32
+ widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
33
+
34
+ retval = if rotation != 0
35
+ BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
36
+ else
37
+ BufferedImage.new(widthPx, heightPx, BufferedImage::TYPE_BYTE_GRAY)
38
+ end
39
+ graphics = retval.getGraphics()
40
+ graphics.setBackground(TRANSPARENT_WHITE)
41
+ graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
42
+ if rotation != 0
43
+ graphics.translate(retval.getWidth, 0.0)
44
+ graphics.rotate(rotation)
45
+ end
46
+ graphics.scale(scaling, scaling)
47
+ drawer = pageDrawerClass.new()
48
+ drawer.drawPage(graphics, page, pageDimension)
49
+ graphics.dispose
50
+
51
+ return retval
52
+ end
53
+ end
54
+ end
55
+
56
+ # testing
57
+ if __FILE__ == $0
58
+ pdf_file = PDDocument.loadNonSeq(java.io.File.new(ARGV[0]), nil)
59
+ bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[ARGV[1].to_i - 1])
60
+ puts bi.class
61
+ ImageIO.write(bi, 'png',
62
+ java.io.File.new('notext.png'))
63
+ end
64
+
@@ -14,7 +14,6 @@ module Tabula
14
14
  def initialize(text_elements, options = {})
15
15
  self.text_elements = text_elements
16
16
  self.options = DEFAULT_OPTIONS.merge(options)
17
- @merged = false
18
17
  merge_words! if self.options[:merge_words]
19
18
  end
20
19
 
@@ -42,9 +41,9 @@ module Tabula
42
41
  end
43
42
 
44
43
  def get_columns
45
- Tabula.group_by_columns(text_elements).map { |c|
44
+ Tabula.group_by_columns(text_elements).map do |c|
46
45
  {'left' => c.left, 'right' => c.right, 'width' => c.width}
47
- }
46
+ end
48
47
  end
49
48
 
50
49
  def get_line_boundaries
@@ -108,6 +107,7 @@ module Tabula
108
107
  # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
109
108
  if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
110
109
  self.text_elements[current_word_index].text += " "
110
+ self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
111
111
  end
112
112
  current_word_index = i+1
113
113
  end
@@ -135,11 +135,11 @@ module Tabula
135
135
  end
136
136
 
137
137
  def Tabula.lines_to_csv(lines)
138
- CSV.generate { |csv|
139
- lines.each { |l|
138
+ CSV.generate do |csv|
139
+ lines.each do |l|
140
140
  csv << l.map { |c| c.text.strip }
141
- }
142
- }
141
+ end
142
+ end
143
143
  end
144
144
 
145
145
  ONLY_SPACES_RE = Regexp.new('^\s+$')
@@ -154,45 +154,43 @@ module Tabula
154
154
 
155
155
  # find all the text elements
156
156
  # contained within each detected line (table row) boundary
157
- line_boundaries.each { |lb|
157
+ line_boundaries.each do |lb|
158
158
  line = Line.new
159
159
 
160
- line_members = text_elements.find_all { |te|
160
+ line_members = text_elements.find_all do |te|
161
161
  te.vertically_overlaps?(lb)
162
- }
162
+ end
163
163
 
164
164
  text_elements -= line_members
165
165
 
166
- line_members.sort_by(&:left).each { |te|
166
+ line_members.sort_by(&:left).each do |te|
167
167
  # skip text_elements that only contain spaces
168
168
  next if te.text =~ ONLY_SPACES_RE
169
169
  line << te
170
- }
170
+ end
171
171
 
172
172
  lines << line if line.text_elements.size > 0
173
- }
173
+ end
174
174
 
175
175
  lines.sort_by!(&:top)
176
176
 
177
177
  columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
178
178
 
179
179
  # # insert empty cells if needed
180
- lines.each_with_index { |l, line_index|
180
+ lines.each_with_index do |l, line_index|
181
181
  next if l.text_elements.nil?
182
182
  l.text_elements.compact! # TODO WHY do I have to do this?
183
183
  l.text_elements.uniq! # TODO WHY do I have to do this?
184
184
  l.text_elements.sort_by!(&:left)
185
185
 
186
- # l.text_elements = Tabula.merge_words(l.text_elements)
187
-
188
186
  next unless l.text_elements.size < columns.size
189
187
 
190
188
  columns.each_with_index do |c, i|
191
189
  if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
192
- l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
190
+ l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
193
191
  end
194
192
  end
195
- }
193
+ end
196
194
 
197
195
  # # merge elements that are in the same column
198
196
  columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
@@ -230,8 +228,9 @@ module Tabula
230
228
  lines[i+1] = nil
231
229
  end
232
230
  end
233
- lines.compact.map { |line|
231
+
232
+ lines.compact.map do |line|
234
233
  line.text_elements.sort_by(&:left)
235
- }
234
+ end
236
235
  end
237
236
  end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.0.1'
2
+ VERSION = '0.5.0'
3
3
  end
@@ -16,7 +16,7 @@ module Tabula
16
16
 
17
17
  def Writers.TSV(lines, output=$stdout)
18
18
  tsv_string = lines.each { |l|
19
- output.write(l.map(&:text).join("\t") + '\n')
19
+ output.write(l.map(&:text).join("\t") + "\n")
20
20
  }
21
21
  end
22
22
 
@@ -14,13 +14,14 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.platform = 'java'
16
16
 
17
- s.files = `git ls-files`.split("\n")
17
+ shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
18
+ s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
18
19
  s.test_files = `git ls-files -- {test,features}/*`.split("\n")
19
20
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
21
  s.require_paths = ["lib"]
21
22
 
22
23
  s.add_development_dependency 'minitest'
23
- s.add_development_dependency 'bundler', '>= 1.3.5'
24
+ s.add_development_dependency 'bundler', '>= 1.3.4'
24
25
 
25
26
  s.add_runtime_dependency "trollop", ["~> 2.0"]
26
27
  end
data/test/tests.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
+ require 'minitest'
2
3
  require 'minitest/autorun'
3
4
 
4
5
  require_relative '../lib/tabula'
@@ -9,7 +10,7 @@ def lines_to_array(lines)
9
10
  }
10
11
  end
11
12
 
12
- class TestPagesInfoExtractor < MiniTest::Unit::TestCase
13
+ class TestPagesInfoExtractor < Minitest::Test
13
14
  def test_pages_info_extractor
14
15
  extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
15
16
 
@@ -23,7 +24,7 @@ class TestPagesInfoExtractor < MiniTest::Unit::TestCase
23
24
  end
24
25
 
25
26
 
26
- class TestDumper < MiniTest::Unit::TestCase
27
+ class TestDumper < Minitest::Test
27
28
 
28
29
  def test_extractor
29
30
  extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -40,7 +41,7 @@ class TestDumper < MiniTest::Unit::TestCase
40
41
  end
41
42
  end
42
43
 
43
- class TestExtractor < MiniTest::Unit::TestCase
44
+ class TestExtractor < Minitest::Test
44
45
 
45
46
  def test_table_extraction_1
46
47
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -54,7 +55,7 @@ class TestExtractor < MiniTest::Unit::TestCase
54
55
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
55
56
  characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
56
57
 
57
- expected = [["Apellido y Nombre", "Bloque político", "Provincia", ""], ["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
58
+ expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
58
59
 
59
60
  assert_equal expected, lines_to_array(Tabula.make_table(characters))
60
61
  end
@@ -62,9 +63,9 @@ class TestExtractor < MiniTest::Unit::TestCase
62
63
  # TODO Spaces inserted in words - fails
63
64
  def test_bo_page24
64
65
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
65
- characters = character_extractor.extract.next.get_text([435.625, 53.125, 570.7142857142857, 810.5357142857142])
66
+ characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
66
67
 
67
- expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B.MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
68
+ expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
68
69
  assert_equal expected, lines_to_array(Tabula.make_table(characters))
69
70
  end
70
71
 
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.1
5
+ version: 0.5.0
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-09 00:00:00.000000000 Z
12
+ date: 2013-06-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest
@@ -35,13 +35,13 @@ dependencies:
35
35
  requirements:
36
36
  - - ">="
37
37
  - !ruby/object:Gem::Version
38
- version: 1.3.5
38
+ version: 1.3.4
39
39
  none: false
40
40
  requirement: !ruby/object:Gem::Requirement
41
41
  requirements:
42
42
  - - ">="
43
43
  - !ruby/object:Gem::Version
44
- version: 1.3.5
44
+ version: 1.3.4
45
45
  none: false
46
46
  prerelease: false
47
47
  type: :development
@@ -70,6 +70,7 @@ extensions: []
70
70
  extra_rdoc_files: []
71
71
  files:
72
72
  - ".gitignore"
73
+ - ".travis.yml"
73
74
  - AUTHORS.md
74
75
  - Gemfile
75
76
  - LICENSE.md
@@ -77,15 +78,31 @@ files:
77
78
  - README.md
78
79
  - Rakefile
79
80
  - bin/tabula
81
+ - ext/COPYING
82
+ - ext/Makefile.OSX
83
+ - ext/Makefile.defaults
84
+ - ext/Makefile.linux32
85
+ - ext/Makefile.linux64
86
+ - ext/Makefile.mingw
87
+ - ext/liblsd-linux32.so
88
+ - ext/liblsd-linux64.so
89
+ - ext/liblsd.def
90
+ - ext/liblsd.dll
91
+ - ext/liblsd.dylib
92
+ - ext/lsd.c
93
+ - ext/lsd.h
80
94
  - lib/tabula.rb
95
+ - lib/tabula/core_ext.rb
81
96
  - lib/tabula/entities.rb
97
+ - lib/tabula/line_segment_detector.rb
82
98
  - lib/tabula/pdf_dump.rb
99
+ - lib/tabula/pdf_render.rb
83
100
  - lib/tabula/table_extractor.rb
84
101
  - lib/tabula/version.rb
85
102
  - lib/tabula/whitespace.rb
86
103
  - lib/tabula/writers.rb
87
104
  - tabula-extractor.gemspec
88
- - target/pdfbox-app-1.8.0.jar
105
+ - target/pdfbox-app-2.0.0-SNAPSHOT.jar
89
106
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
90
107
  - test/data/argentina_diputados_voting_record.pdf
91
108
  - test/data/bo_page24.pdf