tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
| @@ -0,0 +1,42 @@ | |
| 1 | 
            +
            module Tabula

              # Cells are the components of spreadsheets: rectangular zones that
              # collect the text elements assigned to them.
              class Cell < ZoneEntity

                # Levels for the :cell_debug option, in increasing verbosity.
                NORMAL = 0
                DEBUG = 1
                SUPERDEBUG = 2

                attr_accessor :text_elements, :placeholder, :spanning, :options

                # top, left, width and height are forwarded to ZoneEntity#initialize.
                # options is merged over the defaults
                # {:use_line_returns => false, :cell_debug => NORMAL}.
                def initialize(top, left, width, height, options={})
                  super(top, left, width, height)
                  @placeholder = false
                  @spanning = false
                  @text_elements = []
                  @options = ({:use_line_returns => false, :cell_debug => NORMAL}).merge options
                end

                # Builds a Cell from two corner points (objects responding to x and y).
                def self.new_from_points(topleft, bottomright, options={})
                  width = bottomright.x - topleft.x
                  height = bottomright.y - topleft.y
                  Cell.new(topleft.y, topleft.x, width, height, options)
                end

                # Returns the stripped text of the cell.
                #
                # Text elements are ordered with the default sort, grouped by their
                # `top` value and each group's text is concatenated (followed by "\n"
                # when :use_line_returns is set).
                #
                # With :cell_debug >= DEBUG a placeholder cell yields "placeholder" and
                # an empty cell yields its geometry; with :cell_debug >= SUPERDEBUG the
                # geometry is always prepended to the text.
                def text
                  return "placeholder" if @placeholder && @options[:cell_debug] >= DEBUG

                  line_ending = @options[:use_line_returns] ? "\n" : ''
                  output = ""
                  # Array#sort is non-destructive, so its result has to be used for the
                  # ordering to take effect.
                  # NOTE(review): relies on the text elements defining <=> (presumably
                  # inherited from ZoneEntity) -- confirm.
                  text_elements.sort.group_by(&:top).each_value do |row|
                    output << row.map { |el| el.text }.join('') << line_ending
                  end

                  if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
                    text_output = output.dup
                    output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
                    output += " \n #{text_output}"
                  end
                  output.strip
                end
              end
            end
         | 
| @@ -0,0 +1,244 @@ | |
| 1 | 
            +
            require 'set'
         | 
| 2 | 
            +
            java_import java.awt.Polygon
         | 
| 3 | 
            +
            java_import java.awt.geom.Area
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Tabula
         | 
| 6 | 
            +
              # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
         | 
| 7 | 
            +
              module HasCells
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                IS_TABULAR_HEURISTIC_RATIO = 0.8
         | 
| 10 | 
            +
                ANOTHER_MAGIC_NUMBER = 0.75
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                # Heuristic: compares the row/column counts of the first spreadsheet
                # (defined by ruling lines) with those of the table returned by
                # get_table, and answers true when the averaged ratio of the two lies
                # strictly between ANOTHER_MAGIC_NUMBER and its reciprocal.
                # Returns false when there is no spreadsheet at all.
                def is_tabular?
                  lined = spreadsheets.first
                  return false if lined.nil?

                  # counts from the ruling-line based extraction
                  lined_row_count = lined.rows.size
                  lined_col_count = lined.cols.size

                  # counts from the table returned by get_table
                  unlined = get_table
                  unlined_col_count = unlined.cols.size
                  unlined_row_count = unlined.rows.size

                  col_ratio = lined_col_count.to_f / unlined_col_count
                  row_ratio = lined_row_count.to_f / unlined_row_count
                  ratio = (col_ratio + row_ratio) / 2

                  ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
                end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                # finds cells from the ruling lines on the page.
         | 
| 28 | 
            +
                # implements Nurminen thesis algorithm cf. https://github.com/jazzido/tabula-extractor/issues/16
         | 
| 29 | 
            +
                # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
         | 
| 30 | 
            +
                # Finds the cells delimited by the ruling lines and stores them via
                # `self.cells=`.
                #
                # For every intersection point, taken in sorted order, looks for the
                # first rectangle that has the point as its top-left corner and whose
                # four sides are confirmed by `colinear?` checks on the rulings crossing
                # at the corners. Each intersection is the top-left corner of at most
                # one cell.
                #
                # options is passed through to Cell.new_from_points.
                # Returns the array of cells found.
                def find_cells!(options={})
                  cells_found = []

                  intersection_points = Ruling.find_intersections(horizontal_ruling_lines, vertical_ruling_lines)

                  # Sorted with the points' default ordering.
                  sorted_points = intersection_points.keys.sort

                  # Walk the *sorted* array, so that index `i` really is the position
                  # of top_left in `sorted_points`; iterating the hash itself would
                  # only line up if its insertion order happened to be sorted.
                  sorted_points.each_with_index do |top_left, i|
                    # first (horizontal, vertical) ruling pair recorded for this point
                    (horizontal, vertical), _others = intersection_points[top_left]

                    # only points at or after top_left can be the other corners
                    candidates = sorted_points[i..-1]
                    # crossing points directly below top_left
                    points_below = candidates.select { |pt| pt.x == top_left.x && pt.y > top_left.y }
                    # crossing points directly to the right of top_left
                    points_right = candidates.select { |pt| pt.y == top_left.y && pt.x > top_left.x }

                    # throw/catch lets us leave both nested loops as soon as a cell
                    # has been created for this top-left corner.
                    catch :cellCreated do
                      points_below.each do |below|
                        # the left edge must exist between top_left and `below`
                        next unless vertical.colinear?(below)

                        points_right.each do |right|
                          # the top edge must exist between top_left and `right`
                          next unless horizontal.colinear?(right)

                          # hypothetical bottom-right corner of the rectangle
                          btm_right = Point2D::Float.new(right.x, below.y)
                          next unless intersection_points.include?(btm_right)

                          intersection_points[btm_right].each do |btm_right_horizontal, btm_right_vertical|
                            next unless btm_right_horizontal.colinear?(below) &&
                                        btm_right_vertical.colinear?(right)

                            # Rectangle is confirmed to have 4 sides. Stop here so
                            # that no larger, non-minimal cell is created from the
                            # same corner.
                            cells_found << Cell.new_from_points(top_left, btm_right, options)
                            throw :cellCreated
                          end
                        end
                      end
                    end
                  end

                  self.cells = cells_found
                  cells_found
                end
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                #############################
         | 
| 92 | 
            +
                # Chapter 2, Spanning Cells #
         | 
| 93 | 
            +
                #############################
         | 
| 94 | 
            +
                #if c is a "spanning cell", that is
         | 
| 95 | 
            +
                #              if there are N>0 vertical lines strictly between this cell's left and right
         | 
| 96 | 
            +
                #insert N placeholder cells after it with zero size (but same top)
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
         | 
| 99 | 
            +
                # Marks cells that span over ruling lines and appends placeholder
                # cells for the positions they cover:
                #   * one zero-width placeholder per vertical ruling strictly between
                #     the cell's left and right,
                #   * one zero-height placeholder per horizontal ruling strictly
                #     between the cell's top and bottom,
                #   * one zero-width, zero-height "double placeholder" per combination
                #     of the two.
                # Coordinates are rounded to 5 decimals before comparing, to absorb
                # float noise in the cell coordinates.
                #
                # NOTE(review): placeholders are appended to `cells` while it is being
                # iterated; if `cells` is a plain Array the appended placeholders are
                # visited as well -- confirm this is intended.
                def add_spanning_cells!
                  ruling_xs = vertical_ruling_lines.map { |ruling| ruling.left.round(5) }.uniq
                  ruling_ys = horizontal_ruling_lines.map { |ruling| ruling.top.round(5) }.uniq

                  # builds a placeholder cell and appends it to `cells`
                  append_placeholder = lambda do |top, left, width, height|
                    filler = Cell.new(top, left, width, height)
                    filler.placeholder = true
                    cells << filler
                  end

                  cells.each do |c|
                    left_edge, right_edge = c.left.round(5), c.right.round(5)
                    top_edge, bottom_edge = c.top.round(5), c.bottom.round(5)

                    crossed_xs = ruling_xs.select { |x| x > left_edge && x < right_edge }
                    crossed_ys = ruling_ys.select { |y| y > top_edge && y < bottom_edge }

                    if !crossed_xs.empty?
                      c.spanning = true
                      crossed_xs.each { |x| append_placeholder.call(c.top, x, 0, c.height) }
                    end

                    if !crossed_ys.empty?
                      c.spanning = true
                      crossed_ys.each { |y| append_placeholder.call(y, c.left, c.width, 0) }
                    end

                    # A cell spanning both rows and columns also gets "double
                    # placeholder" cells (zero width and zero height), one for each
                    # crossed vertical/horizontal ruling combination.
                    crossed_xs.product(crossed_ys).each do |x, y|
                      append_placeholder.call(y, x, 0, 0)
                    end
                  end
                end
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                #TODO:
         | 
| 152 | 
            +
                #returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
         | 
| 153 | 
            +
                #maybe placeholders should be added after cells is split into spreadsheets
         | 
| 154 | 
            +
                # Merges adjacent cells into outline polygons and returns them as an
                # array of java.awt.geom.Area objects, one per polygon.
                #
                # Approach (via http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon):
                # collect the cell corner points, dropping a point each time it is seen
                # again; pair the remaining points up into horizontal and vertical
                # edges; then walk the edges, alternating direction, until each polygon
                # closes.
                #
                # Sorts `cells` in place as a side effect.
                def find_spreadsheets_from_cells
                  cells.sort!

                  # Toggle every corner point: a point already present is removed,
                  # otherwise it is added.
                  corner_set = Set.new
                  cells.each do |cell|
                    cell.points.each do |pt|
                      corner_set << pt unless corner_set.delete?(pt)
                    end
                  end
                  corners = corner_set.to_a

                  corners_x_first = corners.sort { |a, b| a.x_first_cmp(b) }
                  corners_default = corners.sort

                  # Consecutive points (0-1, 2-3, ...) of each ordering are joined
                  # into an edge, stored in both directions.
                  # NOTE(review): with an odd number of points the last one is paired
                  # with nil -- presumably never the case for well-formed input.
                  edges_h = {}
                  corners_default.each_slice(2) do |from, to|
                    edges_h[from] = to
                    edges_h[to] = from
                  end

                  edges_v = {}
                  corners_x_first.each_slice(2) do |from, to|
                    edges_v[from] = to
                    edges_v[to] = from
                  end

                  # Trace all the polygons.
                  polygons = []
                  until edges_h.empty?
                    # Start from the first remaining horizontal edge entry; each trail
                    # entry is [vertex, kind of edge used to reach it].
                    trail = [[edges_h.shift[0], :horiz]]
                    loop do
                      vertex, reached_by = trail.last
                      step = if reached_by == :horiz
                               [edges_v.delete(vertex), :vert]
                             else
                               [edges_h.delete(vertex), :horiz]
                             end
                      # back at the starting entry: the polygon is closed
                      break if step == trail.first
                      trail << step
                    end

                    # Keep only the vertices and forget any edges still stored for them.
                    outline = trail.map { |point, _| point }
                    outline.each do |point|
                      edges_h.delete(point)
                      edges_v.delete(point)
                    end
                    polygons << outline
                  end

                  # Coordinates are converted to Java ints for java.awt.Polygon.
                  polygons.map do |outline|
                    xs = outline.map { |pt| pt.x }
                    ys = outline.map { |pt| pt.y }
                    Area.new(Polygon.new(xs.to_java(Java::int), ys.to_java(Java::int), xs.size))
                  end
                end
         | 
| 243 | 
            +
              end
         | 
| 244 | 
            +
            end
         | 
| @@ -0,0 +1,39 @@ | |
| 1 | 
            +
            module Tabula
              # A line of text: a zone that accumulates text elements and grows to
              # cover them.
              class Line < ZoneEntity
                attr_accessor :text_elements
                attr_reader :index

                # index: optional position of this line, stored as given.
                def initialize(index=nil)
                  @text_elements = []
                  @index = index
                end

                # Adds text element `t` to the line.
                #
                # The first element defines the line's bounds. A later element that
                # horizontally overlaps an existing element is merged into that
                # element; otherwise it is appended and the line's bounds are merged
                # with it.
                def <<(t)
                  if @text_elements.empty?
                    @text_elements << t
                    self.top = t.top
                    self.left = t.left
                    self.width = t.width
                    self.height = t.height
                  elsif (in_same_column = @text_elements.find { |te| te.horizontally_overlaps?(t) })
                    in_same_column.merge!(t)
                  else
                    self.text_elements << t
                    self.merge!(t)
                  end
                end

                # Used for testing: compares the text elements pairwise, treating the
                # shorter list as if it were padded with TextElement::EMPTY.
                #
                # The padding is applied to local copies so that comparing two lines
                # does not reassign either operand's text_elements.
                # NOTE(review): assumes Array#rpad (a project core extension) returns
                # a padded array rather than modifying its receiver -- confirm.
                def ==(other)
                  return false if other.nil?
                  mine = text_elements.rpad(TextElement::EMPTY, other.text_elements.size)
                  theirs = other.text_elements.rpad(TextElement::EMPTY, mine.size)
                  mine.zip(theirs).all? { |my, yours| my == yours }
                end
              end
            end
         | 
| @@ -0,0 +1,269 @@ | |
| 1 | 
            +
            module Tabula
         | 
| 2 | 
            +
              class Page < ZoneEntity
         | 
| 3 | 
            +
                include Tabula::HasCells
         | 
| 4 | 
            +
             | 
| 5 | 
            +
                attr_reader :rotation, :number_one_indexed, :file_path
         | 
| 6 | 
            +
                attr_writer :min_char_width, :min_char_height
         | 
| 7 | 
            +
                attr_accessor :cells
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                # Builds a page whose zone starts at (0, 0) and has the given size.
                #
                # number is the one-indexed page number; ArgumentError is raised when
                # it is smaller than 1. texts is assigned through `self.texts=`.
                # min_char_width / min_char_height may be left nil, in which case the
                # corresponding readers compute them on first use.
                def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
                  super(0, 0, width, height)
                  @rotation = rotation
                  raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid." if number < 1

                  @ruling_lines, @file_path, @number_one_indexed = ruling_lines, file_path, number
                  self.texts = texts
                  @cells = []
                  @spreadsheets = nil
                  @min_char_width, @min_char_height = min_char_width, min_char_height
                end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                # Width of the narrowest text element on the page (memoized).
                # Returns nil when the page has no text elements.
                def min_char_width
                  return @min_char_width if @min_char_width

                  @min_char_width = texts.map { |t| t.width }.min
                end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                # Height of the shortest text element on the page (memoized).
                # Returns nil when the page has no text elements.
                def min_char_height
                  return @min_char_height if @min_char_height

                  @min_char_height = texts.map { |t| t.height }.min
                end
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                # Returns a PageArea restricted to `area`.
                #
                # area - either a zone-like object (responding to width/height) or an
                #        Array of [top, left, bottom, right], which is converted to a
                #        Tabula::ZoneEntity first.
                #
                # The new PageArea carries only the texts inside the area, the rulings
                # cropped to it, and minimum character dimensions computed from those
                # texts alone.
                def get_area(area)
                  if area.is_a?(Array)
                    top, left, bottom, right = area
                    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
                  end

                  contained = get_text(area)
                  PageArea.new(file_path,
                               area.width,
                               area.height,
                               rotation,
                               number,
                               contained,
                               Ruling.crop_rulings_to_area(@ruling_lines, area),
                               contained.map { |t| t.width }.min,
                               contained.map { |t| t.height }.min)
                end
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                #returns a Table object
         | 
| 54 | 
            +
                # Builds a Table from the page's text.
                #
                # options - :vertical_rulings => list of rulings whose `left` positions
                #           are used as column separators. When empty (the default),
                #           separators are inferred from the text chunks. The options
                #           hash is also forwarded to TextElement.merge_words.
                #
                # Always returns a Table. A page without text yields an empty Table
                # (zero lines, no separators) rather than a bare Array, so callers such
                # as #make_table can safely call #lines on the result.
                def get_table(options={})
                  options = {:vertical_rulings => []}.merge(options)
                  return Table.new(0, []) if texts.empty?

                  text_chunks = TextElement.merge_words(texts, options).sort
                  lines = TextChunk.group_by_lines(text_chunks)

                  # Separators are kept in DESCENDING order so the first separator an
                  # element lies to the right of identifies its column.
                  separators =
                    if options[:vertical_rulings].empty?
                      # the first inferred column position is dropped; the remaining
                      # ones act as separators (guard against an empty result)
                      inferred = TextChunk.column_positions(text_chunks)
                      (inferred[1..-1] || []).sort.reverse
                    else
                      # pixel locations, not entities
                      options[:vertical_rulings].map(&:left).sort.reverse
                    end

                  table = Table.new(lines.count, separators)
                  lines.each_with_index do |line, row|
                    line.text_elements.each do |te|
                      hit = separators.find_index { |s| te.left > s } || separators.count
                      table.add_text_element(te, row, separators.count - hit)
                    end
                  end

                  table.lstrip_lines!
                  table
                end
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                #for API backwards-compatibility reasons, this returns an array of arrays.
         | 
| 85 | 
            +
                # Array-of-arrays view of #get_table (kept for API compatibility).
                #
                # Each row is the line's text_elements array with empty slots replaced
                # (in place) by blank TextElements; rows are ordered by the largest
                # `top` among their elements, a missing `top` counting as 0.
                def make_table(options={})
                  rows = get_table(options).lines.map do |line|
                    line.text_elements.map! { |te| te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) }
                  end

                  rows.sort_by { |row| row.map { |te| te.top || 0 }.max }
                end
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                # returns the Spreadsheets; creating them if they're not memoized
         | 
| 94 | 
            +
                # Returns the page's Spreadsheets, building and memoizing them on the
                # first call. Later calls return the memoized list regardless of
                # `options`.
                #
                # options - forwarded to get_ruling_lines! and find_cells!;
                #           :fill_in_cells => true additionally resolves cell text.
                def spreadsheets(options={})
                  return @spreadsheets unless @spreadsheets.nil?

                  get_ruling_lines!(options)
                  find_cells!(options)

                  # find_spreadsheets_from_cells yields java.awt.geom.Area polygons;
                  # reduce each to its bounding rectangle. (getBounds2D is theoretically
                  # better, but returns a Rectangle2D.Double, which lacks our Ruby sugar.)
                  bounding_rects = find_spreadsheets_from_cells.map { |region| region.getBounds }

                  @spreadsheets = bounding_rects.map do |rect|
                    verticals = vertical_ruling_lines.select { |vl| rect.intersectsLine(vl) }
                    horizontals = horizontal_ruling_lines.select { |hl| rect.intersectsLine(hl) }

                    #TODO: keep track of the cells, instead of getting them again inefficiently.
                    sheet = Spreadsheet.new(rect.y, rect.x, rect.width, rect.height,
                                            self, [], verticals, horizontals)
                    sheet.cells = @cells.select { |c| sheet.overlaps?(c) }
                    sheet.add_spanning_cells!
                    sheet
                  end

                  fill_in_cells! if options[:fill_in_cells]
                  @spreadsheets
                end
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                # Resolves the text of every cell of every spreadsheet on this page and
                # marks each spreadsheet as resolved.
                #
                # options - forwarded to #spreadsheets.
                #
                # Returns the list of spreadsheets.
                def fill_in_cells!(options={})
                  spreadsheets(options).each do |spreadsheet|
                    spreadsheet.cells.each do |cell|
                      # The text lives on this page, so look it up on self; no `page`
                      # accessor is declared in this class (the original call used
                      # `page.get_cell_text`).
                      cell.text_elements = get_cell_text(cell)
                    end
                    # flagged once per spreadsheet, including ones without cells
                    spreadsheet.cells_resolved = true
                  end
                end
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                # Page number. Pass :zero_indexed for a 0-based number; any other
                # value (default :one_indexed) yields the 1-based number.
                def number(indexing_base=:one_indexed)
                  indexing_base == :zero_indexed ? @number_one_indexed - 1 : @number_one_indexed
                end
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                # TODO no need for this, let's choose one name
         | 
| 144 | 
            +
                # Convenience reader: same result as get_ruling_lines! with default options.
                def ruling_lines
                  get_ruling_lines!({})
                end
         | 
| 147 | 
            +
             | 
| 148 | 
            +
                # Collapsed horizontal rulings; an empty Array when none were computed.
                def horizontal_ruling_lines
                  get_ruling_lines! # makes sure the memoized lists are populated
                  return [] if @horizontal_ruling_lines.nil?

                  @horizontal_ruling_lines
                end
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                # Collapsed vertical rulings; an empty Array when none were computed.
                def vertical_ruling_lines
                  get_ruling_lines! # makes sure the memoized lists are populated
                  return [] if @vertical_ruling_lines.nil?

                  @vertical_ruling_lines
                end
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                # returns the collapsed ruling lines (vertical ones first), memoizing them in @vertical_ruling_lines and @horizontal_ruling_lines
         | 
| 159 | 
            +
                # Returns vertical + horizontal collapsed rulings (in that order), or an
                # empty Array when the page has no raw ruling lines.
                #
                # options - accepted for interface compatibility; not read here.
                #
                # NOTE(review): snap_points! runs on every call, even once the
                # collapsed lists are memoized - confirm that repeated snapping is
                # intended.
                def get_ruling_lines!(options={})
                  return [] if @ruling_lines.nil? || @ruling_lines.empty?

                  snap_points!
                  @vertical_ruling_lines ||= collapse_oriented_rulings(@ruling_lines.select { |r| r.vertical? })
                  @horizontal_ruling_lines ||= collapse_oriented_rulings(@ruling_lines.select { |r| r.horizontal? })
                  @vertical_ruling_lines + @horizontal_ruling_lines
                end
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                ##
         | 
| 171 | 
            +
                # get the text inside an area
         | 
| 172 | 
            +
                # area can be an Array ([top, left, bottom, right])
         | 
| 173 | 
            +
                # or a Rectangle2D
         | 
| 174 | 
            +
                # Returns the page's text elements, optionally restricted to `area`.
                #
                # area - nil (all texts), an Array of [top, left, bottom, right], or
                #        any object responding to `contains(text_element)`.
                #
                # Arrays - including Array subclasses, matching #get_area - are
                # converted to a Tabula::ZoneEntity before filtering.
                def get_text(area=nil)
                  return texts if area.nil?

                  if area.is_a?(Array)
                    top, left, bottom, right = area
                    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
                  end

                  texts.select { |t| area.contains(t) }
                end
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                # Text inside `area` (see #get_text), passed through
                # TextElement.merge_words.
                def get_cell_text(area=nil)
                  contained = get_text(area)
                  TextElement.merge_words(contained)
                end
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                # JSON representation of the page: width, height, (one-indexed) number,
                # rotation and texts. `options` is handed on to Hash#to_json.
                def to_json(options={})
                  summary = {
                    :width => width,
                    :height => height,
                    :number => number,
                    :rotation => rotation,
                    :texts => texts
                  }
                  summary.to_json(options)
                end
         | 
| 201 | 
            +
             | 
| 202 | 
            +
                # Snaps the endpoints of all raw ruling lines onto shared coordinates.
                #
                # Along each axis, endpoints are sorted and clustered: a point joins
                # the current cluster while it lies closer than the minimum character
                # size (width for x, height for y) to the cluster's FIRST point. Every
                # point of a cluster is then moved to the mean of the cluster's distinct
                # coordinates, and the lines are rewritten from their moved endpoints.
                #
                # Does nothing when there are no ruling lines. An axis whose minimum
                # character size is unknown (nil, e.g. a page without text) is left
                # untouched instead of raising.
                def snap_points!
                  return if @ruling_lines.nil? || @ruling_lines.empty?

                  # For a given line, each call to #p1 and #p2 creates a new point
                  # object, so fetch each endpoint once and keep the SAME object both
                  # in the per-line map and in the flat list: moving a point through
                  # the list is then visible through the map.
                  endpoints_by_line = {}
                  points = []
                  @ruling_lines.each do |line|
                    pair = [line.p1, line.p2]
                    endpoints_by_line[line] = pair
                    points.concat(pair)
                  end

                  [[:x, :x=, min_char_width], [:y, :y=, min_char_height]].each do |getter, setter, tolerance|
                    next if tolerance.nil? # no threshold available: nothing to snap on this axis

                    clusters = []
                    points.sort_by(&getter).each do |pt|
                      anchor = clusters.empty? ? nil : clusters.last.first
                      if anchor && (pt.send(getter) - anchor.send(getter)).abs < tolerance
                        clusters.last << pt
                      else
                        clusters << [pt]
                      end
                    end

                    clusters.each do |cluster|
                      coords = cluster.map(&getter).uniq
                      mean = coords.sum / coords.size
                      cluster.each { |pt| pt.send(setter, mean) }
                    end
                  end

                  # Lines are stored separately from their points, so push the moved
                  # endpoints back into each line.
                  endpoints_by_line.each do |line, (first_pt, second_pt)|
                    line.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], first_pt, second_pt
                  end
                end
         | 
| 248 | 
            +
             | 
| 249 | 
            +
                # Merges rulings that lie on the same position and (nearly) touch.
                #
                # lines - rulings that must all share one orientation (all horizontal
                #         or all vertical). The array itself is not modified, but
                #         rulings that absorb a neighbour have their start/end widened
                #         in place.
                #
                # Returns a new Array of rulings ordered by position, then start.
                # Zero-length rulings that cannot be merged into a neighbour are
                # dropped (the first ruling in sort order is always kept). An empty
                # input yields an empty Array.
                def collapse_oriented_rulings(lines)
                  return [] if lines.empty?

                  ordered = lines.sort do |a, b|
                    a.position != b.position ? a.position <=> b.position : a.start <=> b.start
                  end

                  ordered[1..-1].inject([ordered.first]) do |merged, candidate|
                    previous = merged.last
                    if candidate.position == previous.position && previous.nearlyIntersects?(candidate)
                      # widen the previous ruling so it covers both segments
                      previous.start = [previous.start, candidate.start].min
                      previous.end = [previous.end, candidate.end].max
                    elsif candidate.length != 0
                      merged << candidate
                    end
                    merged
                  end
                end
         | 
| 267 | 
            +
              end
         | 
| 268 | 
            +
             | 
| 269 | 
            +
            end
         |