RubyGems - tabula-extractor - Versions diffs - 0.6.4-java → 0.6.5-java - Mend

tabula-extractor 0.6.4-java → 0.6.5-java

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (6)

data/lib/tabula/line_segment_detector.rb +12 -4
data/lib/tabula/pdf_dump.rb +0 -2
data/lib/tabula/table_extractor.rb +6 -2
data/lib/tabula/version.rb +1 -1
data/tabula-extractor.gemspec +1 -0
metadata +4 -3

data/lib/tabula/line_segment_detector.rb CHANGED

@@ -43,6 +43,18 @@ module Tabula
       :image_size => 2048
     }
+    def LSD.detect_lines_in_pdf(pdf_path, options={})
+      options = DETECT_LINES_DEFAULTS.merge(options)
+      pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
+      lines = pdf_file.getDocumentCatalog.getAllPages.to_a.map do |page|
+        bi = Tabula::Render.pageToBufferedImage(page, options[:image_size])
+        detect_lines(bi, options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
+      end
+      pdf_file.close
+      lines
+    end
     def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
       options = DETECT_LINES_DEFAULTS.merge(options)
@@ -67,10 +79,6 @@ module Tabula
                  raise ArgumentError, 'image must be a string or a BufferedImage'
                end
-      ImageIO.write(bimage,
-                    'png',
-                    java.io.File.new("/tmp/white.png"))
       image = LSD.image_to_image_double(bimage)
       lines_found_ptr = FFI::MemoryPointer.new(:int, 1)

data/lib/tabula/pdf_dump.rb CHANGED

@@ -40,8 +40,6 @@ module Tabula
         self.characters = []; self.fonts = {}
       end
       def processTextPosition(text)
         # return if text.getCharacter == ' '

data/lib/tabula/table_extractor.rb CHANGED

@@ -182,6 +182,10 @@ module Tabula
     default_options = {:separators => []}
     options = default_options.merge(options)
+    if text_elements.empty?
+      return []
+    end
     extractor = TableExtractor.new(text_elements, options).text_elements
     lines = group_by_lines(text_elements)
     top = lines[0].text_elements.map(&:top).min
@@ -211,11 +215,11 @@ module Tabula
       end
     end
-    table.lines.map do |l|
+    table.lines.map { |l|
       l.text_elements.map! { |te|
         te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
       }
-    end
+    }.sort_by { |l| l.map { |te| te.top or 0 }.max }
   end

data/lib/tabula/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.6.4'
+  VERSION = '0.6.5'
 end

data/tabula-extractor.gemspec CHANGED

@@ -11,6 +11,7 @@ Gem::Specification.new do |s|
   s.homepage    = "https://github.com/jazzido/tabula-extractor"
   s.summary     = %q{extract tables from PDF files}
   s.description = %q{extract tables from PDF files}
+  s.license     = 'MIT'
   s.platform = 'java'

metadata CHANGED

@@ -2,7 +2,7 @@
 name: tabula-extractor
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.6.4
+  version: 0.6.5
 platform: java
 authors:
 - Manuel Aristarán
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-09 00:00:00.000000000 Z
+date: 2013-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -133,7 +133,8 @@ files:
 - test/data/tabla_subsidios.pdf
 - test/tests.rb
 homepage: https://github.com/jazzido/tabula-extractor
-licenses: []
+licenses:
+- MIT
 post_install_message:
 rdoc_options: []
 require_paths: