RubyGems - tabula-extractor - Versions diffs - 0.0.1-java → 0.5.0-java - Mend

tabula-extractor 0.0.1-java → 0.5.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

data/.travis.yml +6 -0
data/Gemfile +0 -3
data/README.md +19 -2
data/Rakefile +4 -5
data/bin/tabula +27 -7
data/ext/COPYING +661 -0
data/ext/Makefile.OSX +15 -0
data/ext/Makefile.defaults +9 -0
data/ext/Makefile.linux32 +11 -0
data/ext/Makefile.linux64 +12 -0
data/ext/Makefile.mingw +10 -0
data/ext/liblsd-linux32.so +0 -0
data/ext/liblsd-linux64.so +0 -0
data/ext/liblsd.def +3 -0
data/ext/liblsd.dll +0 -0
data/ext/liblsd.dylib +0 -0
data/ext/lsd.c +2270 -0
data/ext/lsd.h +283 -0
data/lib/tabula.rb +6 -0
data/lib/tabula/core_ext.rb +21 -0
data/lib/tabula/entities.rb +141 -20
data/lib/tabula/line_segment_detector.rb +99 -0
data/lib/tabula/pdf_dump.rb +10 -8
data/lib/tabula/pdf_render.rb +64 -0
data/lib/tabula/table_extractor.rb +19 -20
data/lib/tabula/version.rb +1 -1
data/lib/tabula/writers.rb +1 -1
data/tabula-extractor.gemspec +3 -2
data/target/{pdfbox-app-1.8.0.jar → pdfbox-app-2.0.0-SNAPSHOT.jar} +0 -0
data/test/tests.rb +7 -6
metadata +22 -5

data/lib/tabula/line_segment_detector.rb ADDED

@@ -0,0 +1,99 @@
+require 'java'
+require 'rbconfig'
+require 'ffi'
+require_relative './entities'
+require_relative './pdf_render'
+require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
+java_import javax.imageio.ImageIO
+java_import java.awt.image.BufferedImage
+java_import org.apache.pdfbox.pdmodel.PDDocument
+module Tabula
+  module LSD
+    extend FFI::Library
+    ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
+                                            when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
+                                              'liblsd.dll'
+                                            when /darwin|mac os/
+                                              'liblsd.dylib'
+                                            when /linux/
+                                              if RbConfig::CONFIG['target_cpu'] == 'x86_64'
+                                                'liblsd-linux64.so'
+                                              else
+                                                'liblsd-linux32.so'
+                                              end
+                                            else
+                                              raise "unknown os: #{RbConfig::CONFIG['host_os']}"
+                                            end,
+                             File.dirname(__FILE__))
+    attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
+    attach_function :free_values, [ :pointer ], :void
+    def LSD.detect_lines_in_pdf_page(pdf_path, page_number, scale_factor=1)
+      pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
+      bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[page_number - 1])
+      pdf_file.close
+      detect_lines(bi,scale_factor)
+    end
+    # image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
+    # image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
+    def LSD.detect_lines(image, scale_factor=1)
+      bimage = if image.class == Java::JavaAwtImage::BufferedImage
+                 image
+               elsif image.class == String
+                 ImageIO.read(java.io.File.new(image))
+                 else
+                 raise ArgumentError, 'image must be a string or a BufferedImage'
+               end
+      image = LSD.image_to_image_double(bimage)
+      lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
+      out = lsd(lines_found_ptr, image, bimage.getWidth, bimage.getHeight)
+      lines_found = lines_found_ptr.get_int
+      rv = []
+      lines_found.times do |i|
+        a = out[7*8*i].read_array_of_type(:double, 7)
+        a_round = a[0..3].map(&:round)
+        p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
+        rv << Tabula::Ruling.new(p1[1] * scale_factor,
+                                 p1[0] * scale_factor,
+                                 (p2[0] - p1[0]) * scale_factor,
+                                 (p2[1] - p1[1]) * scale_factor)
+      end
+      free_values(out)
+      bimage.flush
+      bimage.getGraphics.dispose
+      image = nil
+      return rv
+    end
+    private
+    def LSD.image_to_image_double(buffered_image)
+      width = buffered_image.getWidth; height = buffered_image.getHeight
+      raster_size = width * height
+      image_double = FFI::MemoryPointer.new(:double, raster_size)
+      pixels = Java::int[width * height].new
+      buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
+      image_double.put_array_of_double 0, pixels.to_a
+    end
+  end
+end
+if __FILE__ == $0
+  puts Tabula::LSD.detect_lines_in_pdf_page ARGV[0], ARGV[1].to_i
+end

data/lib/tabula/pdf_dump.rb CHANGED

@@ -3,7 +3,7 @@ require 'observer'
 require_relative './entities.rb'
 require 'java'
-require File.join(File.dirname(__FILE__), '../../target/pdfbox-app-1.8.0.jar')
+require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
 java_import org.apache.pdfbox.pdfparser.PDFParser
 java_import org.apache.pdfbox.pdmodel.PDDocument
 java_import org.apache.pdfbox.util.PDFTextStripper
@@ -44,13 +44,14 @@ module Tabula
         c = text.getCharacter
         # probably not the fastest way of detecting printable chars
         self.characters << text  if c =~ PRINTABLE_RE
       end
     end
     class PagesInfoExtractor
       def initialize(pdf_filename)
         raise Errno::ENOENT unless File.exists?(pdf_filename)
-        @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
+        @pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
         @all_pages = @pdf_file.getDocumentCatalog.getAllPages
       end
@@ -99,13 +100,14 @@ module Tabula
                                        page.getRotation.to_i,
                                        i+1,
                                        @extractor.characters.map { |char|
-                                         Tabula::TextElement.new(char.getYDirAdj,
-                                                                 char.getXDirAdj,
-                                                                 char.getWidthDirAdj,
-                                                                 char.getHeightDir,
+                                         Tabula::TextElement.new(char.getYDirAdj.round(2),
+                                                                 char.getXDirAdj.round(2),
+                                                                 char.getWidthDirAdj.round(2),
+                                                                 char.getHeightDir.round(2),
                                                                  nil,
-                                                                 char.getFontSize,
-                                                                 char.getCharacter)
+                                                                 char.getFontSize.round(2),
+                                                                 char.getCharacter,
+                                                                 char.getWidthOfSpace)
                                        })
             end
           ensure

data/lib/tabula/pdf_render.rb ADDED

@@ -0,0 +1,64 @@
+require 'java'
+require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
+java_import org.apache.pdfbox.pdmodel.PDDocument
+java_import org.apache.pdfbox.pdfviewer.PageDrawer
+java_import java.awt.image.BufferedImage
+java_import javax.imageio.ImageIO
+java_import java.awt.Dimension
+java_import java.awt.Color
+module Tabula
+  module Render
+    # render a PDF page to a graphics context, but skip rendering the text
+    # This is done to reduce 'noise' introduced by the text, we only
+    # care about lines.
+    class PageDrawerNoText < PageDrawer
+      def processTextPosition(text)
+      end
+    end
+    TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
+    def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
+      cropbox = page.findCropBox
+      widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
+      pageDimension = Dimension.new(widthPt, heightPt)
+      rotation = java.lang.Math.toRadians(page.findRotation)
+      scaling = width / (rotation == 0 ? widthPt : heightPt)
+      widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
+      retval = if rotation != 0
+                 BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
+               else
+                 BufferedImage.new(widthPx, heightPx, BufferedImage::TYPE_BYTE_GRAY)
+               end
+      graphics = retval.getGraphics()
+      graphics.setBackground(TRANSPARENT_WHITE)
+      graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
+      if rotation != 0
+        graphics.translate(retval.getWidth, 0.0)
+        graphics.rotate(rotation)
+      end
+      graphics.scale(scaling, scaling)
+      drawer = pageDrawerClass.new()
+      drawer.drawPage(graphics,  page, pageDimension)
+      graphics.dispose
+      return retval
+    end
+  end
+end
+# testing
+if __FILE__ == $0
+  pdf_file = PDDocument.loadNonSeq(java.io.File.new(ARGV[0]), nil)
+  bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[ARGV[1].to_i - 1])
+  puts bi.class
+  ImageIO.write(bi, 'png',
+                java.io.File.new('notext.png'))
+end

data/lib/tabula/table_extractor.rb CHANGED

@@ -14,7 +14,6 @@ module Tabula
     def initialize(text_elements, options = {})
       self.text_elements = text_elements
       self.options = DEFAULT_OPTIONS.merge(options)
-      @merged = false
       merge_words! if self.options[:merge_words]
     end
@@ -42,9 +41,9 @@ module Tabula
     end
     def get_columns
-      Tabula.group_by_columns(text_elements).map { |c|
+      Tabula.group_by_columns(text_elements).map do |c|
         {'left' => c.left, 'right' => c.right, 'width' => c.width}
-      }
+      end
     end
     def get_line_boundaries
@@ -108,6 +107,7 @@ module Tabula
           # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
           if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
             self.text_elements[current_word_index].text += " "
+            self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
           end
           current_word_index = i+1
         end
@@ -135,11 +135,11 @@ module Tabula
   end
   def Tabula.lines_to_csv(lines)
-    CSV.generate { |csv|
-      lines.each { |l|
+    CSV.generate do |csv|
+      lines.each do |l|
         csv << l.map { |c| c.text.strip }
-      }
-    }
+      end
+    end
   end
   ONLY_SPACES_RE = Regexp.new('^\s+$')
@@ -154,45 +154,43 @@ module Tabula
     # find all the text elements
     # contained within each detected line (table row) boundary
-    line_boundaries.each { |lb|
+    line_boundaries.each do |lb|
       line = Line.new
-      line_members = text_elements.find_all { |te|
+      line_members = text_elements.find_all do |te|
         te.vertically_overlaps?(lb)
-      }
+      end
       text_elements -= line_members
-      line_members.sort_by(&:left).each { |te|
+      line_members.sort_by(&:left).each do |te|
         # skip text_elements that only contain spaces
         next if te.text =~ ONLY_SPACES_RE
         line << te
-      }
+      end
       lines << line if line.text_elements.size > 0
-    }
+    end
     lines.sort_by!(&:top)
     columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
     # # insert empty cells if needed
-    lines.each_with_index { |l, line_index|
+    lines.each_with_index do |l, line_index|
       next if l.text_elements.nil?
       l.text_elements.compact! # TODO WHY do I have to do this?
       l.text_elements.uniq!  # TODO WHY do I have to do this?
       l.text_elements.sort_by!(&:left)
-      # l.text_elements = Tabula.merge_words(l.text_elements)
       next unless l.text_elements.size < columns.size
       columns.each_with_index do |c, i|
         if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
-          l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
+          l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
         end
       end
-    }
+    end
     # # merge elements that are in the same column
     columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
@@ -230,8 +228,9 @@ module Tabula
         lines[i+1] = nil
       end
     end
-    lines.compact.map { |line|
+    lines.compact.map do |line|
       line.text_elements.sort_by(&:left)
-    }
+    end
   end
 end

data/lib/tabula/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.0.1'
+  VERSION = '0.5.0'
 end

data/lib/tabula/writers.rb CHANGED

@@ -16,7 +16,7 @@ module Tabula
     def Writers.TSV(lines, output=$stdout)
       tsv_string = lines.each { |l|
-        output.write(l.map(&:text).join("\t") + '\n')
+        output.write(l.map(&:text).join("\t") + "\n")
       }
     end

data/tabula-extractor.gemspec CHANGED

@@ -14,13 +14,14 @@ Gem::Specification.new do |s|
   s.platform = 'java'
-  s.files         = `git ls-files`.split("\n")
+  shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
+  s.files         = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
   s.test_files    = `git ls-files -- {test,features}/*`.split("\n")
   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
   s.require_paths = ["lib"]
   s.add_development_dependency 'minitest'
-  s.add_development_dependency 'bundler', '>= 1.3.5'
+  s.add_development_dependency 'bundler', '>= 1.3.4'
   s.add_runtime_dependency "trollop", ["~> 2.0"]
 end

data/target/{pdfbox-app-1.8.0.jar → pdfbox-app-2.0.0-SNAPSHOT.jar} RENAMED

Binary file

data/test/tests.rb CHANGED

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+require 'minitest'
 require 'minitest/autorun'
 require_relative '../lib/tabula'
@@ -9,7 +10,7 @@ def lines_to_array(lines)
   }
 end
-class TestPagesInfoExtractor < MiniTest::Unit::TestCase
+class TestPagesInfoExtractor < Minitest::Test
   def test_pages_info_extractor
     extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -23,7 +24,7 @@ class TestPagesInfoExtractor < MiniTest::Unit::TestCase
 end
-class TestDumper < MiniTest::Unit::TestCase
+class TestDumper < Minitest::Test
   def test_extractor
     extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -40,7 +41,7 @@ class TestDumper < MiniTest::Unit::TestCase
   end
 end
-class TestExtractor < MiniTest::Unit::TestCase
+class TestExtractor < Minitest::Test
   def test_table_extraction_1
     character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -54,7 +55,7 @@ class TestExtractor < MiniTest::Unit::TestCase
     character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
     characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
-    expected = [["Apellido y Nombre", "Bloque político", "Provincia", ""], ["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
+    expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
     assert_equal expected, lines_to_array(Tabula.make_table(characters))
   end
@@ -62,9 +63,9 @@ class TestExtractor < MiniTest::Unit::TestCase
   # TODO Spaces inserted in words - fails
   def test_bo_page24
     character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
-    characters = character_extractor.extract.next.get_text([435.625, 53.125, 570.7142857142857, 810.5357142857142])
+    characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
-    expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B.MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
+    expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
     assert_equal expected, lines_to_array(Tabula.make_table(characters))
   end

metadata CHANGED

@@ -2,14 +2,14 @@
 name: tabula-extractor
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.0.1
+  version: 0.5.0
 platform: java
 authors:
 - Manuel Aristarán
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-05-09 00:00:00.000000000 Z
+date: 2013-06-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -35,13 +35,13 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.3.5
+        version: 1.3.4
     none: false
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.3.5
+        version: 1.3.4
     none: false
   prerelease: false
   type: :development
@@ -70,6 +70,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".travis.yml"
 - AUTHORS.md
 - Gemfile
 - LICENSE.md
@@ -77,15 +78,31 @@ files:
 - README.md
 - Rakefile
 - bin/tabula
+- ext/COPYING
+- ext/Makefile.OSX
+- ext/Makefile.defaults
+- ext/Makefile.linux32
+- ext/Makefile.linux64
+- ext/Makefile.mingw
+- ext/liblsd-linux32.so
+- ext/liblsd-linux64.so
+- ext/liblsd.def
+- ext/liblsd.dll
+- ext/liblsd.dylib
+- ext/lsd.c
+- ext/lsd.h
 - lib/tabula.rb
+- lib/tabula/core_ext.rb
 - lib/tabula/entities.rb
+- lib/tabula/line_segment_detector.rb
 - lib/tabula/pdf_dump.rb
+- lib/tabula/pdf_render.rb
 - lib/tabula/table_extractor.rb
 - lib/tabula/version.rb
 - lib/tabula/whitespace.rb
 - lib/tabula/writers.rb
 - tabula-extractor.gemspec
-- target/pdfbox-app-1.8.0.jar
+- target/pdfbox-app-2.0.0-SNAPSHOT.jar
 - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
 - test/data/argentina_diputados_voting_record.pdf
 - test/data/bo_page24.pdf