tabula-extractor 0.7.1-java → 0.7.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +22 -0
- data/Rakefile +1 -1
- data/bin/tabula +8 -1
- data/lib/tabula.rb +4 -0
- data/lib/tabula/core_ext.rb +1 -1
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/page.rb +20 -8
- data/lib/tabula/entities/spreadsheet.rb +1 -1
- data/lib/tabula/entities/table.rb +12 -0
- data/lib/tabula/entities/text_chunk.rb +1 -3
- data/lib/tabula/entities/text_element.rb +52 -34
- data/lib/tabula/entities/text_element_index.rb +55 -0
- data/lib/tabula/extraction.rb +6 -5
- data/lib/tabula/version.rb +1 -1
- data/target/jsi-1.1.0-SNAPSHOT.jar +0 -0
- data/target/slf4j-api-1.6.3.jar +0 -0
- data/target/trove4j-3.0.3.jar +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/tests.rb +45 -1
- metadata +13 -3
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
         | 
| 4 | 
            +
              data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
         | 
| 7 | 
            +
              data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
         | 
    
        data/.gitignore
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -44,6 +44,7 @@ Tabula helps you extract tables from PDFs | |
| 44 44 | 
             
                                      extraction (if there are ruling lines separating each
         | 
| 45 45 | 
             
                                      cell, as in a PDF of an Excel spreadsheet)
         | 
| 46 46 | 
             
                      --silent, -i:   Suppress all stderr output.
         | 
| 47 | 
            +
            --use-line-returns, -u:   Use embedded line returns in cells.
         | 
| 47 48 | 
             
                     --version, -v:   Print version and exit
         | 
| 48 49 | 
             
                        --help, -h:   Show this message
         | 
| 49 50 | 
             
            ```
         | 
| @@ -52,6 +53,27 @@ Tabula helps you extract tables from PDFs | |
| 52 53 |  | 
| 53 54 | 
             
            `tabula-extractor` is a RubyGem that you can use to programmatically extract tabular data, using the Tabula engine, in your scripts or applications. We don't have docs yet, but [the tests](test/tests.rb) are a good source of information.
         | 
| 54 55 |  | 
| 56 | 
            +
            Here's a very basic example:
         | 
| 57 | 
            +
             | 
| 58 | 
            +
            ````ruby
         | 
| 59 | 
            +
            require 'tabula'
         | 
| 60 | 
            +
             
         | 
| 61 | 
            +
            pdf_file_path = "whatever.pdf"
         | 
| 62 | 
            +
            outfilename = "whatever.csv"
         | 
| 63 | 
            +
             
         | 
| 64 | 
            +
            out = open(outfilename, 'w')
         | 
| 65 | 
            +
             
         | 
| 66 | 
            +
            extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
         | 
| 67 | 
            +
            extractor.extract.each do |pdf_page|
         | 
| 68 | 
            +
              pdf_page.spreadsheets.each do |spreadsheet|
         | 
| 69 | 
            +
                out << spreadsheet.to_csv
         | 
| 70 | 
            +
                out << "\n\n"
         | 
| 71 | 
            +
              end
         | 
| 72 | 
            +
            end
         | 
| 73 | 
            +
            out.close
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            ````
         | 
| 76 | 
            +
             | 
| 55 77 | 
             
            ## Notes
         | 
| 56 78 |  | 
| 57 79 | 
             
            `tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
         | 
    
        data/Rakefile
    CHANGED
    
    
    
        data/bin/tabula
    CHANGED
    
    | @@ -47,6 +47,7 @@ EOS | |
| 47 47 | 
             
                opt :spreadsheet, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
         | 
| 48 48 | 
             
                opt :no_spreadsheet, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
         | 
| 49 49 | 
             
                opt :silent, 'Suppress all stderr output.'
         | 
| 50 | 
            +
                opt :use_line_returns, 'Use embedded line returns in cells. (Only in spreadsheet mode.)'
         | 
| 50 51 | 
             
              end
         | 
| 51 52 |  | 
| 52 53 | 
             
              if !opts[:columns].nil?
         | 
| @@ -94,6 +95,12 @@ def main | |
| 94 95 | 
             
                                            else
         | 
| 95 96 | 
             
                                              nil
         | 
| 96 97 | 
             
                                            end
         | 
| 98 | 
            +
              use_line_returns = if opts[:use_line_returns]
         | 
| 99 | 
            +
            				  true
         | 
| 100 | 
            +
            				else
         | 
| 101 | 
            +
            				  false
         | 
| 102 | 
            +
            				end
         | 
| 103 | 
            +
              
         | 
| 97 104 | 
             
              extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
         | 
| 98 105 | 
             
              extractor.extract.each_with_index do |pdf_page, page_index|
         | 
| 99 106 |  | 
| @@ -111,7 +118,7 @@ def main | |
| 111 118 | 
             
                      STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{spreadsheet.dims(:top, :left, :bottom, :right)}"
         | 
| 112 119 | 
             
                    end
         | 
| 113 120 | 
             
                  end
         | 
| 114 | 
            -
                  tables = pdf_page.spreadsheets.map(&:rows)
         | 
| 121 | 
            +
                  tables = pdf_page.spreadsheets(:use_line_returns=> use_line_returns).map(&:rows)
         | 
| 115 122 | 
             
                else
         | 
| 116 123 | 
             
                  STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{page_area.to_s}" if opts[:debug]
         | 
| 117 124 | 
             
                  if opts[:guess]
         | 
    
        data/lib/tabula.rb
    CHANGED
    
    | @@ -4,6 +4,10 @@ module Tabula | |
| 4 4 | 
             
            end
         | 
| 5 5 |  | 
| 6 6 | 
             
            require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
         | 
| 7 | 
            +
            require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
         | 
| 8 | 
            +
            require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
         | 
| 9 | 
            +
            require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
         | 
| 10 | 
            +
             | 
| 7 11 |  | 
| 8 12 | 
             
            import 'java.util.logging.LogManager'
         | 
| 9 13 | 
             
            import 'java.util.logging.Level'
         | 
    
        data/lib/tabula/core_ext.rb
    CHANGED
    
    
    
        data/lib/tabula/entities.rb
    CHANGED
    
    | @@ -2,6 +2,7 @@ require_relative './entities/zone_entity' | |
| 2 2 | 
             
            require_relative './entities/cell'
         | 
| 3 3 | 
             
            require_relative './entities/has_cells'
         | 
| 4 4 | 
             
            require_relative './entities/line'
         | 
| 5 | 
            +
            require_relative './entities/text_element_index'
         | 
| 5 6 | 
             
            require_relative './entities/page'
         | 
| 6 7 | 
             
            require_relative './entities/page_area'
         | 
| 7 8 | 
             
            require_relative './entities/ruling'
         | 
    
        data/lib/tabula/entities/page.rb
    CHANGED
    
    | @@ -15,11 +15,14 @@ module Tabula | |
| 15 15 | 
             
                  @ruling_lines = ruling_lines
         | 
| 16 16 | 
             
                  @file_path = file_path
         | 
| 17 17 | 
             
                  @number_one_indexed = number
         | 
| 18 | 
            -
                  self.texts = texts
         | 
| 19 18 | 
             
                  @cells = []
         | 
| 20 19 | 
             
                  @spreadsheets = nil
         | 
| 21 20 | 
             
                  @min_char_width = min_char_width
         | 
| 22 21 | 
             
                  @min_char_height = min_char_height
         | 
| 22 | 
            +
                  @spatial_index = TextElementIndex.new
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                  self.texts = texts
         | 
| 25 | 
            +
                  self.texts.each { |te| @spatial_index << te }
         | 
| 23 26 | 
             
                end
         | 
| 24 27 |  | 
| 25 28 | 
             
                def min_char_width
         | 
| @@ -54,10 +57,10 @@ module Tabula | |
| 54 57 | 
             
                def get_table(options={})
         | 
| 55 58 | 
             
                  options = {:vertical_rulings => []}.merge(options)
         | 
| 56 59 | 
             
                  if texts.empty?
         | 
| 57 | 
            -
                    return []
         | 
| 60 | 
            +
                    return Tabula::Table.new(0, [])
         | 
| 58 61 | 
             
                  end
         | 
| 59 62 |  | 
| 60 | 
            -
                  text_chunks = TextElement.merge_words(self.texts, options).sort
         | 
| 63 | 
            +
                  text_chunks = TextElement.merge_words(self.texts.sort, options).sort
         | 
| 61 64 |  | 
| 62 65 | 
             
                  lines = TextChunk.group_by_lines(text_chunks)
         | 
| 63 66 |  | 
| @@ -65,7 +68,8 @@ module Tabula | |
| 65 68 | 
             
                    columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
         | 
| 66 69 | 
             
                    separators = columns.sort.reverse
         | 
| 67 70 | 
             
                  else
         | 
| 68 | 
            -
                    columns = TextChunk.column_positions( | 
| 71 | 
            +
                    columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
         | 
| 72 | 
            +
                                                         text_chunks)
         | 
| 69 73 | 
             
                    separators = columns[1..-1].sort.reverse
         | 
| 70 74 | 
             
                  end
         | 
| 71 75 |  | 
| @@ -123,8 +127,8 @@ module Tabula | |
| 123 127 | 
             
                  spreadsheets(options).each do |spreadsheet|
         | 
| 124 128 | 
             
                    spreadsheet.cells.each do |cell|
         | 
| 125 129 | 
             
                      cell.text_elements = page.get_cell_text(cell)
         | 
| 126 | 
            -
                      spreadsheet.cells_resolved = true
         | 
| 127 130 | 
             
                    end
         | 
| 131 | 
            +
                    spreadsheet.cells_resolved = true
         | 
| 128 132 | 
             
                  end
         | 
| 129 133 | 
             
                end
         | 
| 130 134 |  | 
| @@ -176,9 +180,17 @@ module Tabula | |
| 176 180 | 
             
                  if area.nil?
         | 
| 177 181 | 
             
                    texts
         | 
| 178 182 | 
             
                  else
         | 
| 179 | 
            -
                     | 
| 180 | 
            -
             | 
| 181 | 
            -
             | 
| 183 | 
            +
                    @spatial_index.contains(area)
         | 
| 184 | 
            +
                  end
         | 
| 185 | 
            +
                end
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                def fill_in_cell_texts!(areas)
         | 
| 188 | 
            +
                  texts.each do |t|
         | 
| 189 | 
            +
                    area = areas.find{|a| a.contains(t) }
         | 
| 190 | 
            +
                    area.text_elements << t unless area.nil?
         | 
| 191 | 
            +
                  end
         | 
| 192 | 
            +
                  areas.each do |area|
         | 
| 193 | 
            +
                    area.text_elements = TextElement.merge_words(area.text_elements)
         | 
| 182 194 | 
             
                  end
         | 
| 183 195 | 
             
                end
         | 
| 184 196 |  | 
| @@ -53,7 +53,7 @@ module Tabula | |
| 53 53 | 
             
                  if array_of_rows.size > 2
         | 
| 54 54 | 
             
                    if array_of_rows[0].map(&:left).uniq.size < array_of_rows[1].map(&:left).uniq.size
         | 
| 55 55 | 
             
                      missing_spots = array_of_rows[1].map(&:left) - array_of_rows[0].map(&:left)
         | 
| 56 | 
            -
             | 
| 56 | 
            +
             | 
| 57 57 | 
             
                      missing_spots.each do |missing_spot|
         | 
| 58 58 | 
             
                        missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
         | 
| 59 59 | 
             
                        missing_spot_placeholder.placeholder = true
         | 
| @@ -92,5 +92,17 @@ module Tabula | |
| 92 92 | 
             
                    'data' => rows,
         | 
| 93 93 | 
             
                  }.to_json(*a)
         | 
| 94 94 | 
             
                end
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                def to_csv
         | 
| 97 | 
            +
                  out = StringIO.new
         | 
| 98 | 
            +
                  Tabula::Writers.CSV(rows, out)
         | 
| 99 | 
            +
                  out.string
         | 
| 100 | 
            +
                end
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                def to_tsv
         | 
| 103 | 
            +
                  out = StringIO.new
         | 
| 104 | 
            +
                  Tabula::Writers.TSV(rows, out)
         | 
| 105 | 
            +
                  out.string
         | 
| 106 | 
            +
                end
         | 
| 95 107 | 
             
              end
         | 
| 96 108 | 
             
            end
         | 
| @@ -31,11 +31,9 @@ module Tabula | |
| 31 31 |  | 
| 32 32 | 
             
                ##
         | 
| 33 33 | 
             
                # calculate estimated columns from an iterable of TextChunk
         | 
| 34 | 
            -
                def self.column_positions(text_chunks)
         | 
| 34 | 
            +
                def self.column_positions(top, text_chunks)
         | 
| 35 35 | 
             
                  right = 0
         | 
| 36 36 | 
             
                  columns = []
         | 
| 37 | 
            -
                  lines = TextChunk.group_by_lines(text_chunks)
         | 
| 38 | 
            -
                  top = lines.first.text_elements.map(&:top).min
         | 
| 39 37 |  | 
| 40 38 | 
             
                  text_chunks.each do |te|
         | 
| 41 39 | 
             
                    next if te.text =~ ONLY_SPACES_RE
         | 
| @@ -2,16 +2,17 @@ module Tabula | |
| 2 2 | 
             
              ##
         | 
| 3 3 | 
             
              # a Glyph
         | 
| 4 4 | 
             
              class TextElement < ZoneEntity
         | 
| 5 | 
            -
                attr_accessor :font, :font_size, :text, :width_of_space
         | 
| 5 | 
            +
                attr_accessor :font, :font_size, :text, :width_of_space, :direction
         | 
| 6 6 |  | 
| 7 7 | 
             
                TOLERANCE_FACTOR = 0.25
         | 
| 8 8 |  | 
| 9 | 
            -
                def initialize(top, left, width, height, font, font_size, text, width_of_space)
         | 
| 9 | 
            +
                def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0)
         | 
| 10 10 | 
             
                  super(top, left, width, height)
         | 
| 11 11 | 
             
                  self.font = font
         | 
| 12 12 | 
             
                  self.font_size = font_size
         | 
| 13 13 | 
             
                  self.text = text
         | 
| 14 14 | 
             
                  self.width_of_space = width_of_space
         | 
| 15 | 
            +
                  self.direction = direction
         | 
| 15 16 | 
             
                end
         | 
| 16 17 |  | 
| 17 18 | 
             
                EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
         | 
| @@ -31,40 +32,45 @@ module Tabula | |
| 31 32 | 
             
                    current_chunk = chunks.last
         | 
| 32 33 | 
             
                    prev_char = current_chunk.text_elements.last
         | 
| 33 34 |  | 
| 34 | 
            -
                    #  | 
| 35 | 
            -
                     | 
| 36 | 
            -
                       | 
| 37 | 
            -
                    }
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                    # should we add a space?
         | 
| 40 | 
            -
                    if (prev_char.text != " ") && (char.text != " ") \
         | 
| 41 | 
            -
                      && !across_vertical_ruling \
         | 
| 42 | 
            -
                      && prev_char.should_add_space?(char)
         | 
| 43 | 
            -
             | 
| 44 | 
            -
                      sp = self.new(prev_char.top,
         | 
| 45 | 
            -
                                    prev_char.right,
         | 
| 46 | 
            -
                                    prev_char.width_of_space,
         | 
| 47 | 
            -
                                    prev_char.width_of_space, # width == height for spaces
         | 
| 48 | 
            -
                                    prev_char.font,
         | 
| 49 | 
            -
                                    prev_char.font_size,
         | 
| 50 | 
            -
                                    ' ',
         | 
| 51 | 
            -
                                    prev_char.width_of_space)
         | 
| 52 | 
            -
                      chunks.last << sp
         | 
| 53 | 
            -
                      prev_char = sp
         | 
| 54 | 
            -
                    end
         | 
| 55 | 
            -
             | 
| 56 | 
            -
                    # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
         | 
| 57 | 
            -
                    # that they ought to be merged by that account.
         | 
| 58 | 
            -
                    # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
         | 
| 59 | 
            -
                    # Why are both of those `.left`?, you might ask. The intuition is that a letter
         | 
| 60 | 
            -
                    # that starts on the left of a vertical ruling ought to remain on the left of it.
         | 
| 61 | 
            -
                    if !across_vertical_ruling && prev_char.should_merge?(char)
         | 
| 62 | 
            -
                      chunks.last << char
         | 
| 35 | 
            +
                    # if same char AND overlapped, skip
         | 
| 36 | 
            +
                    if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
         | 
| 37 | 
            +
                      chunks
         | 
| 63 38 | 
             
                    else
         | 
| 64 | 
            -
                      #  | 
| 65 | 
            -
                       | 
| 39 | 
            +
                      # any vertical ruling goes across prev_char and char?
         | 
| 40 | 
            +
                      across_vertical_ruling = vertical_ruling_locations.any? { |loc|
         | 
| 41 | 
            +
                        prev_char.left < loc && char.left > loc
         | 
| 42 | 
            +
                      }
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                      # should we add a space?
         | 
| 45 | 
            +
                      if (prev_char.text != " ") && (char.text != " ") \
         | 
| 46 | 
            +
                        && !across_vertical_ruling \
         | 
| 47 | 
            +
                        && prev_char.should_add_space?(char)
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                        sp = self.new(prev_char.top,
         | 
| 50 | 
            +
                                      prev_char.right,
         | 
| 51 | 
            +
                                      prev_char.width_of_space,
         | 
| 52 | 
            +
                                      prev_char.width_of_space, # width == height for spaces
         | 
| 53 | 
            +
                                      prev_char.font,
         | 
| 54 | 
            +
                                      prev_char.font_size,
         | 
| 55 | 
            +
                                      ' ',
         | 
| 56 | 
            +
                                      prev_char.width_of_space)
         | 
| 57 | 
            +
                        chunks.last << sp
         | 
| 58 | 
            +
                        prev_char = sp
         | 
| 59 | 
            +
                      end
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                      # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
         | 
| 62 | 
            +
                      # that they ought to be merged by that account.
         | 
| 63 | 
            +
                      # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
         | 
| 64 | 
            +
                      # Why are both of those `.left`?, you might ask. The intuition is that a letter
         | 
| 65 | 
            +
                      # that starts on the left of a vertical ruling ought to remain on the left of it.
         | 
| 66 | 
            +
                      if !across_vertical_ruling && prev_char.should_merge?(char)
         | 
| 67 | 
            +
                        chunks.last << char
         | 
| 68 | 
            +
                      else
         | 
| 69 | 
            +
                        # create a new chunk
         | 
| 70 | 
            +
                        chunks << TextChunk.create_from_text_element(char)
         | 
| 71 | 
            +
                      end
         | 
| 72 | 
            +
                      chunks
         | 
| 66 73 | 
             
                    end
         | 
| 67 | 
            -
                    chunks
         | 
| 68 74 | 
             
                  end
         | 
| 69 75 | 
             
                end
         | 
| 70 76 |  | 
| @@ -108,5 +114,17 @@ module Tabula | |
| 108 114 | 
             
                def ==(other)
         | 
| 109 115 | 
             
                  self.text.strip == other.text.strip
         | 
| 110 116 | 
             
                end
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                # sort in lexicographic (reading) order
         | 
| 119 | 
            +
                def <=>(other)
         | 
| 120 | 
            +
                  if self.vertically_overlaps?(other)
         | 
| 121 | 
            +
                    self.left <=> other.left
         | 
| 122 | 
            +
                  elsif self.top < other.top
         | 
| 123 | 
            +
                    -1
         | 
| 124 | 
            +
                  else
         | 
| 125 | 
            +
                    1
         | 
| 126 | 
            +
                  end
         | 
| 127 | 
            +
                end
         | 
| 128 | 
            +
             | 
| 111 129 | 
             
              end
         | 
| 112 130 | 
             
            end
         | 
| @@ -0,0 +1,55 @@ | |
| 1 | 
            +
            module Tabula
         | 
| 2 | 
            +
              class TextElementIndex < Java::ComInfomatiqJsiRtree::RTree
         | 
| 3 | 
            +
             | 
| 4 | 
            +
                attr_reader :te_dict
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                class SaveToListProcedure
         | 
| 7 | 
            +
                  include Java::GnuTroveProcedure::TIntProcedure
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                  attr_reader :list
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                  def initialize(parent)
         | 
| 12 | 
            +
                    @parent = parent
         | 
| 13 | 
            +
                    @list = []
         | 
| 14 | 
            +
                  end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                  def execute(id)
         | 
| 17 | 
            +
                    @list << @parent.te_dict[id]
         | 
| 18 | 
            +
                    return true
         | 
| 19 | 
            +
                  end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                  def reset!
         | 
| 22 | 
            +
                    @list = []
         | 
| 23 | 
            +
                  end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                def initialize
         | 
| 28 | 
            +
                  super
         | 
| 29 | 
            +
                  self.init(nil)
         | 
| 30 | 
            +
                  @te_dict = {}
         | 
| 31 | 
            +
                  @save_to_list = SaveToListProcedure.new(self)
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                def <<(text_element)
         | 
| 35 | 
            +
                  r = Java::ComInfomatiqJsi::Rectangle.new(text_element.left,
         | 
| 36 | 
            +
                                                           text_element.top,
         | 
| 37 | 
            +
                                                           text_element.right,
         | 
| 38 | 
            +
                                                           text_element.bottom)
         | 
| 39 | 
            +
                  @te_dict[text_element.object_id] = text_element
         | 
| 40 | 
            +
                  self.add(r, text_element.object_id)
         | 
| 41 | 
            +
                end
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                def contains(zone_entity)
         | 
| 44 | 
            +
                  r = Java::ComInfomatiqJsi::Rectangle.new(zone_entity.left,
         | 
| 45 | 
            +
                                                           zone_entity.top,
         | 
| 46 | 
            +
                                                           zone_entity.right,
         | 
| 47 | 
            +
                                                           zone_entity.bottom)
         | 
| 48 | 
            +
                  @save_to_list.reset!
         | 
| 49 | 
            +
                  super(r, @save_to_list)
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                  # sort in lexicographic (reading) order
         | 
| 52 | 
            +
                  @save_to_list.list.sort
         | 
| 53 | 
            +
                end
         | 
| 54 | 
            +
              end
         | 
| 55 | 
            +
            end
         | 
    
        data/lib/tabula/extraction.rb
    CHANGED
    
    | @@ -203,12 +203,12 @@ module Tabula | |
| 203 203 |  | 
| 204 204 | 
             
                    if c == ' ' || c == ' ' # replace non-breaking space for space
         | 
| 205 205 | 
             
                      c = ' '
         | 
| 206 | 
            -
                      h = text. | 
| 206 | 
            +
                      h = text.getWidth.round(2)
         | 
| 207 207 | 
             
                    end
         | 
| 208 208 |  | 
| 209 | 
            -
                    te = Tabula::TextElement.new(text. | 
| 210 | 
            -
                                                 text. | 
| 211 | 
            -
                                                 text. | 
| 209 | 
            +
                    te = Tabula::TextElement.new(text.getY.round(2) - h,
         | 
| 210 | 
            +
                                                 text.getX.round(2),
         | 
| 211 | 
            +
                                                 text.getWidth.round(2),
         | 
| 212 212 | 
             
                                                 # ugly hack follows: we need spaces to have a height, so we can
         | 
| 213 213 | 
             
                                                 # test for vertical overlap. height == width seems a safe bet.
         | 
| 214 214 | 
             
                                                 h,
         | 
| @@ -216,7 +216,8 @@ module Tabula | |
| 216 216 | 
             
                                                 text.getFontSize.round(2),
         | 
| 217 217 | 
             
                                                 c,
         | 
| 218 218 | 
             
                                                 # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
         | 
| 219 | 
            -
                                                 text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace | 
| 219 | 
            +
                                                 text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
         | 
| 220 | 
            +
                                                 text.getDir)
         | 
| 220 221 |  | 
| 221 222 | 
             
                    ccp_bounds = self.currentClippingPath
         | 
| 222 223 |  | 
    
        data/lib/tabula/version.rb
    CHANGED
    
    
| Binary file | 
| Binary file | 
| Binary file | 
| Binary file | 
| Binary file | 
| Binary file | 
    
        data/test/tests.rb
    CHANGED
    
    | @@ -206,7 +206,8 @@ class TestExtractor < Minitest::Test | |
| 206 206 | 
             
                table = Tabula.extract_table(pdf_file_path,
         | 
| 207 207 | 
             
                                                            1,
         | 
| 208 208 | 
             
                                                            [106.01, 48.09, 227.31, 551.89],
         | 
| 209 | 
            -
                                                            :detect_ruling_lines => true | 
| 209 | 
            +
                                                            :detect_ruling_lines => true,
         | 
| 210 | 
            +
                                                            :extraction_method => "original")
         | 
| 210 211 |  | 
| 211 212 | 
             
                expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
         | 
| 212 213 |  | 
| @@ -527,6 +528,49 @@ class TestExtractor < Minitest::Test | |
| 527 528 | 
             
                  assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
         | 
| 528 529 | 
             
                end
         | 
| 529 530 | 
             
              end
         | 
| 531 | 
            +
             | 
| 532 | 
            +
              def test_remove_repeated_text
         | 
| 533 | 
            +
                top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
         | 
| 534 | 
            +
             | 
| 535 | 
            +
                table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
         | 
| 536 | 
            +
                                             1,
         | 
| 537 | 
            +
                                             [top,left,bottom,right],
         | 
| 538 | 
            +
                                             :detect_ruling_lines => false,
         | 
| 539 | 
            +
                                             :extraction_method => 'original')
         | 
| 540 | 
            +
             | 
| 541 | 
            +
                ary = table_to_array(table)
         | 
| 542 | 
            +
                assert_equal ary[1][1], "$ 18,969,610"
         | 
| 543 | 
            +
                assert_equal ary[1][2], "$ 18,157,722"
         | 
| 544 | 
            +
              end
         | 
| 545 | 
            +
             | 
| 546 | 
            +
              def test_remove_overlapping_text
         | 
| 547 | 
            +
                # one of those PDFs that put characters on top of another to make text "bold"
         | 
| 548 | 
            +
                top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
         | 
| 549 | 
            +
                table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
         | 
| 550 | 
            +
                                             1,
         | 
| 551 | 
            +
                                             [top,left,bottom,right],
         | 
| 552 | 
            +
                                             :detect_ruling_lines => false,
         | 
| 553 | 
            +
                                             :extraction_method => 'original')
         | 
| 554 | 
            +
             | 
| 555 | 
            +
                ary = table_to_array(table)
         | 
| 556 | 
            +
                assert_equal ary.first.first, "Community development"
         | 
| 557 | 
            +
              end
         | 
| 558 | 
            +
             | 
| 559 | 
            +
              def test_cells_including_line_returns
         | 
| 560 | 
            +
                data = []
         | 
| 561 | 
            +
                pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
         | 
| 562 | 
            +
                Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
         | 
| 563 | 
            +
                  pdf_page.spreadsheets.each do |spreadsheet|
         | 
| 564 | 
            +
                    spreadsheet.cells.each do |cell|
         | 
| 565 | 
            +
                      cell.text_elements = pdf_page.get_cell_text(cell)
         | 
| 566 | 
            +
                      cell.options = ({:use_line_returns => true, :cell_debug => 0})
         | 
| 567 | 
            +
                      data << cell.text
         | 
| 568 | 
            +
                    end
         | 
| 569 | 
            +
                  end
         | 
| 570 | 
            +
                end
         | 
| 571 | 
            +
                assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender  \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
         | 
| 572 | 
            +
              end
         | 
| 573 | 
            +
             | 
| 530 574 | 
             
            end
         | 
| 531 575 |  | 
| 532 576 | 
             
            class TestIsTabularHeuristic < Minitest::Test
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: tabula-extractor
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.7. | 
| 4 | 
            +
              version: 0.7.2
         | 
| 5 5 | 
             
            platform: java
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Manuel Aristarán
         | 
| @@ -10,7 +10,7 @@ authors: | |
| 10 10 | 
             
            autorequire:
         | 
| 11 11 | 
             
            bindir: bin
         | 
| 12 12 | 
             
            cert_chain: []
         | 
| 13 | 
            -
            date: 2014-01- | 
| 13 | 
            +
            date: 2014-01-20 00:00:00.000000000 Z
         | 
| 14 14 | 
             
            dependencies:
         | 
| 15 15 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 16 16 | 
             
              name: minitest
         | 
| @@ -127,6 +127,7 @@ files: | |
| 127 127 | 
             
            - lib/tabula/entities/table.rb
         | 
| 128 128 | 
             
            - lib/tabula/entities/text_chunk.rb
         | 
| 129 129 | 
             
            - lib/tabula/entities/text_element.rb
         | 
| 130 | 
            +
            - lib/tabula/entities/text_element_index.rb
         | 
| 130 131 | 
             
            - lib/tabula/entities/zone_entity.rb
         | 
| 131 132 | 
             
            - lib/tabula/extraction.rb
         | 
| 132 133 | 
             
            - lib/tabula/line_segment_detector.rb
         | 
| @@ -138,7 +139,10 @@ files: | |
| 138 139 | 
             
            - lib/tabula/version.rb
         | 
| 139 140 | 
             
            - lib/tabula/writers.rb
         | 
| 140 141 | 
             
            - tabula-extractor.gemspec
         | 
| 142 | 
            +
            - target/jsi-1.1.0-SNAPSHOT.jar
         | 
| 141 143 | 
             
            - target/pdfbox-app-2.0.0-SNAPSHOT.jar
         | 
| 144 | 
            +
            - target/slf4j-api-1.6.3.jar
         | 
| 145 | 
            +
            - target/trove4j-3.0.3.jar
         | 
| 142 146 | 
             
            - test/data/47008204D_USA.page4.pdf
         | 
| 143 147 | 
             
            - test/data/560015757GV_China.page1.pdf
         | 
| 144 148 | 
             
            - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
         | 
| @@ -151,13 +155,16 @@ files: | |
| 151 155 | 
             
            - test/data/frx_2012_disclosure.tsv
         | 
| 152 156 | 
             
            - test/data/gre.pdf
         | 
| 153 157 | 
             
            - test/data/no_tables.pdf
         | 
| 158 | 
            +
            - test/data/nyc_2013fiscalreporttables.pdf
         | 
| 154 159 | 
             
            - test/data/puertos1.pdf
         | 
| 155 160 | 
             
            - test/data/spanning_cells.csv
         | 
| 156 161 | 
             
            - test/data/spanning_cells.pdf
         | 
| 157 162 | 
             
            - test/data/strongschools.pdf
         | 
| 163 | 
            +
            - test/data/sydney_disclosure_contract.pdf
         | 
| 158 164 | 
             
            - test/data/tabla_subsidios.pdf
         | 
| 159 165 | 
             
            - test/data/vertical_rulings_bug.pdf
         | 
| 160 166 | 
             
            - test/data/vietnam3.pdf
         | 
| 167 | 
            +
            - test/data/wc2012.pdf
         | 
| 161 168 | 
             
            - test/heuristic-test-set/original/560015757GV_China.page1.pdf
         | 
| 162 169 | 
             
            - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
         | 
| 163 170 | 
             
            - test/heuristic-test-set/original/bo_page24.pdf
         | 
| @@ -190,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 190 197 | 
             
                  version: '0'
         | 
| 191 198 | 
             
            requirements: []
         | 
| 192 199 | 
             
            rubyforge_project:
         | 
| 193 | 
            -
            rubygems_version: 2. | 
| 200 | 
            +
            rubygems_version: 2.1.9
         | 
| 194 201 | 
             
            signing_key:
         | 
| 195 202 | 
             
            specification_version: 4
         | 
| 196 203 | 
             
            summary: extract tables from PDF files
         | 
| @@ -207,13 +214,16 @@ test_files: | |
| 207 214 | 
             
            - test/data/frx_2012_disclosure.tsv
         | 
| 208 215 | 
             
            - test/data/gre.pdf
         | 
| 209 216 | 
             
            - test/data/no_tables.pdf
         | 
| 217 | 
            +
            - test/data/nyc_2013fiscalreporttables.pdf
         | 
| 210 218 | 
             
            - test/data/puertos1.pdf
         | 
| 211 219 | 
             
            - test/data/spanning_cells.csv
         | 
| 212 220 | 
             
            - test/data/spanning_cells.pdf
         | 
| 213 221 | 
             
            - test/data/strongschools.pdf
         | 
| 222 | 
            +
            - test/data/sydney_disclosure_contract.pdf
         | 
| 214 223 | 
             
            - test/data/tabla_subsidios.pdf
         | 
| 215 224 | 
             
            - test/data/vertical_rulings_bug.pdf
         | 
| 216 225 | 
             
            - test/data/vietnam3.pdf
         | 
| 226 | 
            +
            - test/data/wc2012.pdf
         | 
| 217 227 | 
             
            - test/heuristic-test-set/original/560015757GV_China.page1.pdf
         | 
| 218 228 | 
             
            - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
         | 
| 219 229 | 
             
            - test/heuristic-test-set/original/bo_page24.pdf
         |