tabula-extractor 0.7.2-java → 0.7.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 935de0f0dc43fa388a86cc091dc540b74b6ce31f
         | 
| 4 | 
            +
              data.tar.gz: 67fa5fda6450c3b1659af3c61c8027843be5c082
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 191054f79148535bf359c81c72d35b717f71f97ee3c3bedd4c2af66e4332afb98f3071afe4c9ed9e894586e3a20722769742f17fc02b9a5d5d954a4fae50803d
         | 
| 7 | 
            +
              data.tar.gz: 711f993194c402d1bca016f0fe13ccaeb8e4eafc6b67c2de0fa8b3cef1e7e3ae5b4cdefc2b251b64467747e7af26f80bb54bf57d4424ea50bb2dd26db7e27570
         | 
    
        data/.gitignore
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -7,7 +7,7 @@ Extract tables from PDF files. `tabula-extractor` is the table extraction engine | |
| 7 7 |  | 
| 8 8 | 
             
            ## Installation
         | 
| 9 9 |  | 
| 10 | 
            -
             | 
| 10 | 
            +
            `tabula-extractor` only works with JRuby 1.7 or newer. [Install JRuby](http://jruby.org/getting-started) and run
         | 
| 11 11 |  | 
| 12 12 | 
             
            ``
         | 
| 13 13 | 
             
            jruby -S gem install tabula-extractor
         | 
| @@ -57,12 +57,12 @@ Here's a very basic example: | |
| 57 57 |  | 
| 58 58 | 
             
            ````ruby
         | 
| 59 59 | 
             
            require 'tabula'
         | 
| 60 | 
            -
             | 
| 60 | 
            +
             | 
| 61 61 | 
             
            pdf_file_path = "whatever.pdf"
         | 
| 62 62 | 
             
            outfilename = "whatever.csv"
         | 
| 63 | 
            -
             | 
| 63 | 
            +
             | 
| 64 64 | 
             
            out = open(outfilename, 'w')
         | 
| 65 | 
            -
             | 
| 65 | 
            +
             | 
| 66 66 | 
             
            extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
         | 
| 67 67 | 
             
            extractor.extract.each do |pdf_page|
         | 
| 68 68 | 
             
              pdf_page.spreadsheets.each do |spreadsheet|
         | 
| @@ -73,7 +73,3 @@ end | |
| 73 73 | 
             
            out.close
         | 
| 74 74 |  | 
| 75 75 | 
             
            ````
         | 
| 76 | 
            -
             | 
| 77 | 
            -
            ## Notes
         | 
| 78 | 
            -
             | 
| 79 | 
            -
            `tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
         | 
    
        data/bin/tabula
    CHANGED
    
    | @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            #!/usr/bin/env jruby
         | 
| 1 | 
            +
            #!/usr/bin/env jruby -J-Djava.awt.headless=true
         | 
| 2 2 | 
             
            # encoding: utf-8
         | 
| 3 3 | 
             
            require 'trollop'
         | 
| 4 4 | 
             
            require_relative '../lib/tabula'
         | 
| @@ -9,7 +9,7 @@ def parse_pages_arg(pages_arg) | |
| 9 9 | 
             
              if(pages_arg == 'all')
         | 
| 10 10 | 
             
                return :all
         | 
| 11 11 | 
             
              end
         | 
| 12 | 
            -
             | 
| 12 | 
            +
             | 
| 13 13 | 
             
              ranges = pages_arg.split(',').map(&:strip)
         | 
| 14 14 | 
             
              pages = []
         | 
| 15 15 | 
             
              ranges.each do |range|
         | 
| @@ -100,7 +100,7 @@ def main | |
| 100 100 | 
             
            				else
         | 
| 101 101 | 
             
            				  false
         | 
| 102 102 | 
             
            				end
         | 
| 103 | 
            -
             | 
| 103 | 
            +
             | 
| 104 104 | 
             
              extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
         | 
| 105 105 | 
             
              extractor.extract.each_with_index do |pdf_page, page_index|
         | 
| 106 106 |  | 
    
        data/lib/tabula.rb
    CHANGED
    
    | @@ -1,6 +1,7 @@ | |
| 1 1 | 
             
            module Tabula
         | 
| 2 2 | 
             
              PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
         | 
| 3 3 | 
             
              ONLY_SPACES_RE = Regexp.new('^\s+$')
         | 
| 4 | 
            +
              SAME_CHAR_RE = Regexp.new('^(.)\1+$')
         | 
| 4 5 | 
             
            end
         | 
| 5 6 |  | 
| 6 7 | 
             
            require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
         | 
| @@ -8,7 +9,6 @@ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar') | |
| 8 9 | 
             
            require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
         | 
| 9 10 | 
             
            require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
         | 
| 10 11 |  | 
| 11 | 
            -
             | 
| 12 12 | 
             
            import 'java.util.logging.LogManager'
         | 
| 13 13 | 
             
            import 'java.util.logging.Level'
         | 
| 14 14 |  | 
| @@ -22,13 +22,17 @@ lm.logger_names.each do |name| | |
| 22 22 | 
             
                end
         | 
| 23 23 | 
             
              end
         | 
| 24 24 | 
             
            end
         | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 25 | 
             
            require_relative './tabula/version'
         | 
| 28 26 | 
             
            require_relative './tabula/core_ext'
         | 
| 27 | 
            +
             | 
| 29 28 | 
             
            require_relative './tabula/entities'
         | 
| 30 29 | 
             
            require_relative './tabula/extraction'
         | 
| 31 30 | 
             
            require_relative './tabula/table_extractor'
         | 
| 32 31 | 
             
            require_relative './tabula/writers'
         | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            module Tabula
         | 
| 34 | 
            +
              autoload :LSD               , File.expand_path('tabula/line_segment_detector.rb', File.dirname(__FILE__))
         | 
| 35 | 
            +
              autoload :Render            , File.expand_path('tabula/pdf_render.rb', File.dirname(__FILE__))
         | 
| 36 | 
            +
            end
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            require_relative './tabula/table_extractor'
         | 
    
        data/lib/tabula/entities.rb
    CHANGED
    
    
    
        data/lib/tabula/entities/cell.rb
    CHANGED
    
    | @@ -15,7 +15,7 @@ module Tabula | |
| 15 15 | 
             
                  @placeholder = false
         | 
| 16 16 | 
             
                  @spanning = false
         | 
| 17 17 | 
             
                  @text_elements = []
         | 
| 18 | 
            -
                  @options = ({:use_line_returns =>  | 
| 18 | 
            +
                  @options = ({:use_line_returns => true, :cell_debug => NORMAL}).merge options
         | 
| 19 19 | 
             
                end
         | 
| 20 20 |  | 
| 21 21 | 
             
                def self.new_from_points(topleft, bottomright, options={})
         | 
| @@ -29,11 +29,13 @@ module Tabula | |
| 29 29 | 
             
                  output = ""
         | 
| 30 30 | 
             
                  text_elements.sort #use the default sort for ZoneEntity
         | 
| 31 31 | 
             
                  text_elements.group_by(&:top).values.each do |row|
         | 
| 32 | 
            -
                    output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\ | 
| 33 | 
            -
             | 
| 32 | 
            +
                    output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\r" : '')
         | 
| 33 | 
            +
                    # per @bchartoff, https://github.com/jazzido/tabula-extractor/pull/65#issuecomment-32899336
         | 
| 34 | 
            +
                    # line returns as \r behave better in Excel.
         | 
| 35 | 
            +
                  end
         | 
| 34 36 | 
             
                  if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
         | 
| 35 37 | 
             
                    text_output = output.dup
         | 
| 36 | 
            -
                    output = "top: #{top} left: #{left} \n w: #{width} h: #{height}" | 
| 38 | 
            +
                    output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
         | 
| 37 39 | 
             
                    output += " \n #{text_output}"
         | 
| 38 40 | 
             
                  end
         | 
| 39 41 | 
             
                  output.strip
         | 
| @@ -6,27 +6,30 @@ module Tabula | |
| 6 6 | 
             
              # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
         | 
| 7 7 | 
             
              module HasCells
         | 
| 8 8 |  | 
| 9 | 
            -
                 | 
| 9 | 
            +
                ARBITRARY_MAGIC_HEURISTIC_NUMBER = 0.65
         | 
| 10 10 |  | 
| 11 11 | 
             
                def is_tabular?
         | 
| 12 | 
            +
                  ratio = heuristic_ratio
         | 
| 13 | 
            +
                  return ratio > ARBITRARY_MAGIC_HEURISTIC_NUMBER && ratio < (1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER)
         | 
| 14 | 
            +
                end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                def heuristic_ratio
         | 
| 12 17 | 
             
                  #spreadsheet extraction
         | 
| 13 18 | 
             
                  spreadsheet = spreadsheets.first
         | 
| 14 | 
            -
                  return  | 
| 19 | 
            +
                  return Float::NAN if spreadsheet.nil?
         | 
| 15 20 | 
             
                  rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
         | 
| 16 21 | 
             
                  columns_defined_by_lines = spreadsheet.cols.size
         | 
| 17 22 |  | 
| 18 23 | 
             
                  table = self.get_table
         | 
| 19 24 | 
             
                  columns_defined_without_lines = table.cols.size
         | 
| 20 25 | 
             
                  rows_defined_without_lines = table.rows.size
         | 
| 21 | 
            -
                   | 
| 22 | 
            -
             | 
| 23 | 
            -
                  return ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
         | 
| 26 | 
            +
                  ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
         | 
| 24 27 | 
             
                end
         | 
| 25 28 |  | 
| 26 29 | 
             
                # finds cells from the ruling lines on the page.
         | 
| 27 30 | 
             
                # implements Nurminen thesis algorithm cf. https://github.com/jazzido/tabula-extractor/issues/16
         | 
| 28 31 | 
             
                # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
         | 
| 29 | 
            -
                def find_cells!(options={})
         | 
| 32 | 
            +
                def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options={})
         | 
| 30 33 | 
             
                  # All lines need to been sorted from up to down,
         | 
| 31 34 | 
             
                  # and left to right in ascending order
         | 
| 32 35 |  | 
| @@ -39,9 +42,10 @@ module Tabula | |
| 39 42 | 
             
                  # depending on the Point2D default sort here.
         | 
| 40 43 | 
             
                  intersection_points_array = intersection_points.keys.sort
         | 
| 41 44 |  | 
| 42 | 
            -
                   | 
| 45 | 
            +
                  intersection_points_array.each_with_index do |topLeft, i|
         | 
| 43 46 | 
             
                    # Fetch all points on the same vertical and horizontal
         | 
| 44 47 | 
             
                    # line with current crossing point
         | 
| 48 | 
            +
                    horizontal, vertical = intersection_points[topLeft]
         | 
| 45 49 |  | 
| 46 50 | 
             
                    # this lets us go to the next intersection_point in intersection_points_array
         | 
| 47 51 | 
             
                    # it is bad and I feel bad.
         | 
| @@ -64,19 +68,19 @@ module Tabula | |
| 64 68 | 
             
                          #                                                    point;
         | 
| 65 69 | 
             
                          next unless horizontal.colinear?(y_point)
         | 
| 66 70 | 
             
                          #Hypothetical bottom right point of rectangle
         | 
| 67 | 
            -
                          btmRight = Point2D::Float.new( | 
| 71 | 
            +
                          btmRight = Point2D::Float.new(y_point.x, x_point.y)
         | 
| 68 72 | 
             
                          if intersection_points.include?(btmRight)
         | 
| 69 | 
            -
                            intersection_points[btmRight] | 
| 70 | 
            -
             | 
| 73 | 
            +
                            btmRightHorizontal, btmRightVertical = intersection_points[btmRight]
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                            if btmRightHorizontal.colinear?( x_point ) &&
         | 
| 71 76 | 
             
                                btmRightVertical.colinear?( y_point )
         | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
                              end
         | 
| 77 | 
            +
                              # Rectangle is confirmed to have 4 sides
         | 
| 78 | 
            +
                              cellsFound << Cell.new_from_points( topLeft, btmRight, options)
         | 
| 79 | 
            +
                              # Each crossing point can be the top left corner
         | 
| 80 | 
            +
                              # of only a single rectangle
         | 
| 81 | 
            +
                              #next crossing-point; we need to "next" out of the outer loop here
         | 
| 82 | 
            +
                              # to avoid creating non-minimal cells, I htink.
         | 
| 83 | 
            +
                              throw :cellCreated
         | 
| 80 84 | 
             
                            end
         | 
| 81 85 | 
             
                          end
         | 
| 82 86 | 
             
                        end
         | 
| @@ -87,66 +91,6 @@ module Tabula | |
| 87 91 | 
             
                  cellsFound
         | 
| 88 92 | 
             
                end
         | 
| 89 93 |  | 
| 90 | 
            -
                #############################
         | 
| 91 | 
            -
                # Chapter 2, Spanning Cells #
         | 
| 92 | 
            -
                #############################
         | 
| 93 | 
            -
                #if c is a "spanning cell", that is
         | 
| 94 | 
            -
                #              if there are N>0 vertical lines strictly between this cell's left and right
         | 
| 95 | 
            -
                #insert N placeholder cells after it with zero size (but same top)
         | 
| 96 | 
            -
             | 
| 97 | 
            -
                # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
         | 
| 98 | 
            -
                def add_spanning_cells!
         | 
| 99 | 
            -
                  #rounding: because Cell.new_from_points, using in #find_cells above, has
         | 
| 100 | 
            -
                  # a float precision error where, for instance, a cell whose x2 coord is
         | 
| 101 | 
            -
                  # supposed to be 160.137451171875 comes out as 160.13745498657227 because
         | 
| 102 | 
            -
                  # of minus. :(
         | 
| 103 | 
            -
                  vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq    #already sorted
         | 
| 104 | 
            -
                  horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
         | 
| 105 | 
            -
             | 
| 106 | 
            -
                  cells.each do |c|
         | 
| 107 | 
            -
                    vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
         | 
| 108 | 
            -
                    horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
         | 
| 109 | 
            -
             | 
| 110 | 
            -
                    unless vertical_rulings_spanned_over.empty?
         | 
| 111 | 
            -
                      c.spanning = true
         | 
| 112 | 
            -
                      vertical_rulings_spanned_over.each do |spanned_over_line_loc|
         | 
| 113 | 
            -
                        placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
         | 
| 114 | 
            -
                        placeholder.placeholder = true
         | 
| 115 | 
            -
                        cells << placeholder
         | 
| 116 | 
            -
                      end
         | 
| 117 | 
            -
                    end
         | 
| 118 | 
            -
                    unless horizontal_rulings_spanned_over.empty?
         | 
| 119 | 
            -
                      c.spanning = true
         | 
| 120 | 
            -
                      horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
         | 
| 121 | 
            -
                        placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
         | 
| 122 | 
            -
                        placeholder.placeholder = true
         | 
| 123 | 
            -
                        cells << placeholder
         | 
| 124 | 
            -
                      end
         | 
| 125 | 
            -
                    end
         | 
| 126 | 
            -
             | 
| 127 | 
            -
                    #if there's a spanning cell that's spans over both rows and columns, then it has "double placeholder" cells
         | 
| 128 | 
            -
                    # e.g. -------------------
         | 
| 129 | 
            -
                    #      | C |  C |  C | C |         (this is some pretty sweet ASCII art, eh?)
         | 
| 130 | 
            -
                    #      |-----------------|
         | 
| 131 | 
            -
                    #      | C |  C |  C | C |
         | 
| 132 | 
            -
                    #      |-----------------|
         | 
| 133 | 
            -
                    #      | C | SC    P | C |   where MC is the "spanning cell" that holds all the text within its bounds
         | 
| 134 | 
            -
                    #      |----    +    ----|         P is a "placeholder" cell with either zero width or zero height
         | 
| 135 | 
            -
                    #      | C | P    DP | C |         DP is a "double placeholder" cell with zero width and zero height
         | 
| 136 | 
            -
                    #      |----    +    ----|         C is an ordinary cell.
         | 
| 137 | 
            -
                    #      | C | P    DP | C |
         | 
| 138 | 
            -
                    #      |-----------------|
         | 
| 139 | 
            -
             | 
| 140 | 
            -
                    unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
         | 
| 141 | 
            -
                      double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
         | 
| 142 | 
            -
                        placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
         | 
| 143 | 
            -
                        placeholder.placeholder = true
         | 
| 144 | 
            -
                        cells << placeholder
         | 
| 145 | 
            -
                      end
         | 
| 146 | 
            -
                    end
         | 
| 147 | 
            -
                  end
         | 
| 148 | 
            -
                end
         | 
| 149 | 
            -
             | 
| 150 94 | 
             
                #TODO:
         | 
| 151 95 | 
             
                #returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
         | 
| 152 96 | 
             
                #maybe placeholders should be added after cells is split into spreadsheets
         | 
    
        data/lib/tabula/entities/line.rb
    CHANGED
    
    | @@ -3,6 +3,8 @@ module Tabula | |
| 3 3 | 
             
                attr_accessor :text_elements
         | 
| 4 4 | 
             
                attr_reader :index
         | 
| 5 5 |  | 
| 6 | 
            +
                SPACE_RUN_MAX_LENGTH = 3
         | 
| 7 | 
            +
             | 
| 6 8 | 
             
                def initialize(index=nil)
         | 
| 7 9 | 
             
                  @text_elements = []
         | 
| 8 10 | 
             
                  @index = index
         | 
| @@ -16,15 +18,59 @@ module Tabula | |
| 16 18 | 
             
                    self.width = t.width
         | 
| 17 19 | 
             
                    self.height = t.height
         | 
| 18 20 | 
             
                  else
         | 
| 19 | 
            -
                     | 
| 20 | 
            -
             | 
| 21 | 
            -
                    else
         | 
| 22 | 
            -
                      self.text_elements << t
         | 
| 23 | 
            -
                      self.merge!(t)
         | 
| 24 | 
            -
                    end
         | 
| 21 | 
            +
                    self.text_elements << t
         | 
| 22 | 
            +
                    self.merge!(t)
         | 
| 25 23 | 
             
                  end
         | 
| 26 24 | 
             
                end
         | 
| 27 25 |  | 
| 26 | 
            +
                ##
         | 
| 27 | 
            +
                # remove runs of the space char longer than SPACE_RUN_MAX_LENGTH
         | 
| 28 | 
            +
                # should not change dimensions of the container +Line+
         | 
| 29 | 
            +
                def remove_sequential_spaces!(seq_spaces_count=SPACE_RUN_MAX_LENGTH)
         | 
| 30 | 
            +
                  self.text_elements = self.text_elements.reduce([]) do |memo, text_chunk|
         | 
| 31 | 
            +
                    long_space_runs = text_chunk
         | 
| 32 | 
            +
                      .text_elements
         | 
| 33 | 
            +
                      .chunk { |te| te.text == ' '}  # detect runs of spaces...
         | 
| 34 | 
            +
                      .select { |is_space, text_elements| # ...longer than SPACE_RUN_MAX_LENGTH
         | 
| 35 | 
            +
                      is_space && !text_elements.nil? && text_elements.size >= SPACE_RUN_MAX_LENGTH
         | 
| 36 | 
            +
                    }
         | 
| 37 | 
            +
                      .map { |_, text_elements| text_elements }
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                    # no long runs of spaces
         | 
| 40 | 
            +
                    # keep as it was and end iteration
         | 
| 41 | 
            +
                    if long_space_runs.empty?
         | 
| 42 | 
            +
                      memo << text_chunk
         | 
| 43 | 
            +
                      next memo
         | 
| 44 | 
            +
                    end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                    ranges = long_space_runs.map { |lsr|
         | 
| 47 | 
            +
                      idx = text_chunk
         | 
| 48 | 
            +
                        .text_elements
         | 
| 49 | 
            +
                        .index { |te| te.equal?(lsr.first) } # we need pointer comparison here
         | 
| 50 | 
            +
                      (idx)..(idx+lsr.size-1)
         | 
| 51 | 
            +
                    }
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                    in_run = false
         | 
| 54 | 
            +
                    new_chunk = true
         | 
| 55 | 
            +
                    text_chunk
         | 
| 56 | 
            +
                      .text_elements
         | 
| 57 | 
            +
                      .each_with_index do |te, i|
         | 
| 58 | 
            +
                      if ranges.any? { |r| r.include?(i) } # te belongs to a run of spaces, skip
         | 
| 59 | 
            +
                        in_run = true
         | 
| 60 | 
            +
                      else
         | 
| 61 | 
            +
                        if in_run || new_chunk
         | 
| 62 | 
            +
                          memo << TextChunk.create_from_text_element(te)
         | 
| 63 | 
            +
                        else
         | 
| 64 | 
            +
                          memo.last << te
         | 
| 65 | 
            +
                        end
         | 
| 66 | 
            +
                        in_run = new_chunk = false
         | 
| 67 | 
            +
                      end
         | 
| 68 | 
            +
                    end
         | 
| 69 | 
            +
                    memo
         | 
| 70 | 
            +
                  end # reduce
         | 
| 71 | 
            +
                  self
         | 
| 72 | 
            +
                end
         | 
| 73 | 
            +
             | 
| 28 74 | 
             
                #used for testing, ignores text element stuff besides stripped text.
         | 
| 29 75 | 
             
                def ==(other)
         | 
| 30 76 | 
             
                  return false if other.nil?
         | 
    
        data/lib/tabula/entities/page.rb
    CHANGED
    
    | @@ -6,7 +6,7 @@ module Tabula | |
| 6 6 | 
             
                attr_writer :min_char_width, :min_char_height
         | 
| 7 7 | 
             
                attr_accessor :cells
         | 
| 8 8 |  | 
| 9 | 
            -
                def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
         | 
| 9 | 
            +
                def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
         | 
| 10 10 | 
             
                  super(0, 0, width, height)
         | 
| 11 11 | 
             
                  @rotation = rotation
         | 
| 12 12 | 
             
                  if number < 1
         | 
| @@ -19,10 +19,16 @@ module Tabula | |
| 19 19 | 
             
                  @spreadsheets = nil
         | 
| 20 20 | 
             
                  @min_char_width = min_char_width
         | 
| 21 21 | 
             
                  @min_char_height = min_char_height
         | 
| 22 | 
            -
                  @spatial_index = TextElementIndex.new
         | 
| 23 22 |  | 
| 24 23 | 
             
                  self.texts = texts
         | 
| 25 | 
            -
             | 
| 24 | 
            +
             | 
| 25 | 
            +
                  if spatial_index.nil?
         | 
| 26 | 
            +
                    @spatial_index = TextElementIndex.new
         | 
| 27 | 
            +
                    self.texts.each { |te| @spatial_index << te }
         | 
| 28 | 
            +
                  else
         | 
| 29 | 
            +
                    @spatial_index = spatial_index
         | 
| 30 | 
            +
                  end
         | 
| 31 | 
            +
             | 
| 26 32 | 
             
                end
         | 
| 27 33 |  | 
| 28 34 | 
             
                def min_char_width
         | 
| @@ -49,7 +55,8 @@ module Tabula | |
| 49 55 | 
             
                                           texts,
         | 
| 50 56 | 
             
                                           Ruling.crop_rulings_to_area(@ruling_lines, area),
         | 
| 51 57 | 
             
                                           texts.map(&:width).min,
         | 
| 52 | 
            -
                                           texts.map(&:height).min | 
| 58 | 
            +
                                           texts.map(&:height).min,
         | 
| 59 | 
            +
                                           @spatial_index)
         | 
| 53 60 | 
             
                  return page_area
         | 
| 54 61 | 
             
                end
         | 
| 55 62 |  | 
| @@ -60,28 +67,33 @@ module Tabula | |
| 60 67 | 
             
                    return Tabula::Table.new(0, [])
         | 
| 61 68 | 
             
                  end
         | 
| 62 69 |  | 
| 63 | 
            -
                   | 
| 70 | 
            +
                  texts = self.texts.sort
         | 
| 71 | 
            +
                  text_chunks = TextElement.merge_words(texts, options)
         | 
| 64 72 |  | 
| 65 | 
            -
                  lines = TextChunk.group_by_lines(text_chunks)
         | 
| 73 | 
            +
                  lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)
         | 
| 66 74 |  | 
| 67 | 
            -
                  unless options[:vertical_rulings].empty?
         | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
                                                         text_chunks)
         | 
| 73 | 
            -
                    separators = columns[1..-1].sort.reverse
         | 
| 74 | 
            -
                  end
         | 
| 75 | 
            +
                  columns = unless options[:vertical_rulings].empty?
         | 
| 76 | 
            +
                              options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
         | 
| 77 | 
            +
                            else
         | 
| 78 | 
            +
                              TextChunk.column_positions(lines).sort
         | 
| 79 | 
            +
                            end
         | 
| 75 80 |  | 
| 76 | 
            -
                  table = Table.new(lines.count,  | 
| 81 | 
            +
                  table = Table.new(lines.count, columns)
         | 
| 77 82 | 
             
                  lines.each_with_index do |line, i|
         | 
| 78 | 
            -
                    line.text_elements.each do |te|
         | 
| 79 | 
            -
                      j =  | 
| 80 | 
            -
                      table.add_text_element(te, i,  | 
| 83 | 
            +
                    line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
         | 
| 84 | 
            +
                      j = columns.find_index { |s| te.left <= s } || columns.count
         | 
| 85 | 
            +
                      table.add_text_element(te, i, j)
         | 
| 81 86 | 
             
                    end
         | 
| 82 87 | 
             
                  end
         | 
| 83 88 |  | 
| 84 | 
            -
                  table | 
| 89 | 
            +
                  # fixes up the table a little bit, replacing nils with empty TextElements
         | 
| 90 | 
            +
                  # and sorting the lines.
         | 
| 91 | 
            +
                  # table.rows.each do |l|
         | 
| 92 | 
            +
                  #   l.text_elements = l.text_elements.map do |te|
         | 
| 93 | 
            +
                  #     te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
         | 
| 94 | 
            +
                  #   end
         | 
| 95 | 
            +
                  # end
         | 
| 96 | 
            +
                  # table.rows.sort_by!(&:top)
         | 
| 85 97 | 
             
                  table
         | 
| 86 98 | 
             
                end
         | 
| 87 99 |  | 
| @@ -96,7 +108,7 @@ module Tabula | |
| 96 108 | 
             
                    return @spreadsheets
         | 
| 97 109 | 
             
                  end
         | 
| 98 110 | 
             
                  get_ruling_lines!(options)
         | 
| 99 | 
            -
                  self.find_cells!(options)
         | 
| 111 | 
            +
                  self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)
         | 
| 100 112 |  | 
| 101 113 | 
             
                  spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.
         | 
| 102 114 |  | 
| @@ -157,14 +169,18 @@ module Tabula | |
| 157 169 |  | 
| 158 170 | 
             
                #returns ruling lines, memoizes them in
         | 
| 159 171 | 
             
                def get_ruling_lines!(options={})
         | 
| 160 | 
            -
                  if  | 
| 161 | 
            -
                     | 
| 162 | 
            -
                    @vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
         | 
| 163 | 
            -
                    @horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
         | 
| 164 | 
            -
                    @vertical_ruling_lines + @horizontal_ruling_lines
         | 
| 165 | 
            -
                  else
         | 
| 166 | 
            -
                    []
         | 
| 172 | 
            +
                  if @ruling_lines.nil? || @ruling_lines.empty?
         | 
| 173 | 
            +
                    return []
         | 
| 167 174 | 
             
                  end
         | 
| 175 | 
            +
                  self.snap_points!
         | 
| 176 | 
            +
             | 
| 177 | 
            +
                  @ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }
         | 
| 178 | 
            +
             | 
| 179 | 
            +
                  @vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
         | 
| 180 | 
            +
                  @horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                  @vertical_ruling_lines + @horizontal_ruling_lines
         | 
| 183 | 
            +
             | 
| 168 184 | 
             
                end
         | 
| 169 185 |  | 
| 170 186 | 
             
                ##
         | 
| @@ -252,29 +268,6 @@ module Tabula | |
| 252 268 | 
             
                    l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
         | 
| 253 269 | 
             
                  end
         | 
| 254 270 | 
             
                end
         | 
| 255 | 
            -
             | 
| 256 | 
            -
                def collapse_oriented_rulings(lines)
         | 
| 257 | 
            -
                  # lines must all be of one orientation (i.e. horizontal, vertical)
         | 
| 258 | 
            -
             | 
| 259 | 
            -
                  if lines.empty?
         | 
| 260 | 
            -
                    return []
         | 
| 261 | 
            -
                  end
         | 
| 262 | 
            -
             | 
| 263 | 
            -
                  lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
         | 
| 264 | 
            -
             | 
| 265 | 
            -
                  lines = lines.inject([lines.shift]) do |memo, next_line|
         | 
| 266 | 
            -
                    last = memo.last
         | 
| 267 | 
            -
                    if next_line.position == last.position && last.nearlyIntersects?(next_line)
         | 
| 268 | 
            -
                      memo.last.start = next_line.start < last.start ? next_line.start : last.start
         | 
| 269 | 
            -
                      memo.last.end = next_line.end < last.end ? last.end : next_line.end
         | 
| 270 | 
            -
                      memo
         | 
| 271 | 
            -
                    elsif next_line.length == 0
         | 
| 272 | 
            -
                      memo
         | 
| 273 | 
            -
                    else
         | 
| 274 | 
            -
                      memo << next_line
         | 
| 275 | 
            -
                    end
         | 
| 276 | 
            -
                  end
         | 
| 277 | 
            -
                end
         | 
| 278 271 | 
             
              end
         | 
| 279 272 |  | 
| 280 273 | 
             
            end
         |