RubyGems - tabula-rb - Versions diffs - 1.0.0 - Mend

tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +39 -0
data/CHANGELOG.md +59 -0
data/LICENSE +21 -0
data/README.md +176 -0
data/Rakefile +28 -0
data/exe/tabula +7 -0
data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
data/lib/tabula/algorithms/projection_profile.rb +109 -0
data/lib/tabula/cli.rb +271 -0
data/lib/tabula/configuration.rb +119 -0
data/lib/tabula/core/point.rb +60 -0
data/lib/tabula/core/rectangle.rb +218 -0
data/lib/tabula/core/ruling.rb +303 -0
data/lib/tabula/core/spatial_index.rb +120 -0
data/lib/tabula/detectors/detection_algorithm.rb +34 -0
data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
data/lib/tabula/pdf/object_extractor.rb +400 -0
data/lib/tabula/pdf/page.rb +230 -0
data/lib/tabula/pdf/text_stripper.rb +150 -0
data/lib/tabula/table/cell.rb +110 -0
data/lib/tabula/table/table.rb +184 -0
data/lib/tabula/text/line.rb +133 -0
data/lib/tabula/text/text_chunk.rb +185 -0
data/lib/tabula/text/text_element.rb +120 -0
data/lib/tabula/version.rb +5 -0
data/lib/tabula/writers/csv_writer.rb +49 -0
data/lib/tabula/writers/json_writer.rb +41 -0
data/lib/tabula/writers/markdown_writer.rb +71 -0
data/lib/tabula/writers/tsv_writer.rb +35 -0
data/lib/tabula/writers/writer.rb +39 -0
data/lib/tabula.rb +160 -0
data/mise.toml +2 -0
data/tabula-rb.gemspec +44 -0
metadata +115 -0

data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb ADDED Viewed

@@ -0,0 +1,142 @@
+# frozen_string_literal: true
+module Tabula
+  module Detectors
+    # Detects table areas using ruling line analysis.
+    # Suitable for PDFs with clear table borders.
+    class SpreadsheetDetection < DetectionAlgorithm
+      # Minimum cells for a valid table
+      MIN_CELLS = 4
+      # Minimum table dimension (in points)
+      MIN_DIMENSION = 10
+      # Detect table areas on a page
+      # @param page [Page] page to detect tables on
+      # @return [Array<Rectangle>] detected table areas
+      def detect(page)
+        horizontal = page.horizontal_rulings
+        vertical = page.vertical_rulings
+        return [] if horizontal.empty? || vertical.empty?
+        # Find cells from ruling intersections
+        cells = find_cells(horizontal, vertical)
+        return [] if cells.size < MIN_CELLS
+        # Group cells into table regions
+        regions = find_table_regions(cells)
+        # Filter valid regions
+        regions.select { |r| valid_table_region?(r) }
+      end
+      private
+      def find_cells(horizontal_rulings, vertical_rulings)
+        # Use the same logic as SpreadsheetExtractionAlgorithm
+        cells = []
+        # Find intersection points
+        intersections = build_intersection_map(horizontal_rulings, vertical_rulings)
+        return cells if intersections.empty?
+        # Get unique x and y positions
+        x_positions = intersections.keys.map { |x, _| x }.uniq.sort
+        y_positions = intersections.keys.map { |_, y| y }.uniq.sort
+        # Find cells by checking for rectangular intersections
+        y_positions.each_cons(2) do |top, bottom|
+          x_positions.each_cons(2) do |left, right|
+            if valid_cell?(left, right, top, bottom, intersections)
+              cells << Rectangle.new(top, left, right - left, bottom - top)
+            end
+          end
+        end
+        cells
+      end
+      def build_intersection_map(horizontal_rulings, vertical_rulings)
+        intersections = {}
+        horizontal_rulings.each do |h|
+          vertical_rulings.each do |v|
+            next unless h.intersects?(v)
+            point = h.intersection_point(v)
+            next unless point
+            key = [point.x.round(1), point.y.round(1)]
+            intersections[key] = true
+          end
+        end
+        intersections
+      end
+      def valid_cell?(left, right, top, bottom, intersections)
+        tolerance = 2.0
+        corners = [
+          [left, top],
+          [right, top],
+          [left, bottom],
+          [right, bottom]
+        ]
+        corners.all? do |x, y|
+          intersections.keys.any? do |ix, iy|
+            (x - ix).abs <= tolerance && (y - iy).abs <= tolerance
+          end
+        end
+      end
+      def find_table_regions(cells)
+        return [] if cells.empty?
+        regions = []
+        remaining = cells.dup
+        until remaining.empty?
+          seed = remaining.shift
+          region = [seed]
+          loop do
+            adjacent = remaining.select { |c| adjacent_to_region?(c, region) }
+            break if adjacent.empty?
+            region.concat(adjacent)
+            remaining -= adjacent
+          end
+          regions << Rectangle.bounding_box_of(region)
+        end
+        regions
+      end
+      def adjacent_to_region?(cell, region)
+        region.any? { |r| cells_adjacent?(r, cell) }
+      end
+      def cells_adjacent?(c1, c2)
+        tolerance = 2.0
+        # Horizontal adjacency
+        h_adjacent = (c1.right - c2.left).abs <= tolerance || (c2.right - c1.left).abs <= tolerance
+        v_overlap = c1.vertically_overlaps?(c2, 0.5)
+        # Vertical adjacency
+        v_adjacent = (c1.bottom - c2.top).abs <= tolerance || (c2.bottom - c1.top).abs <= tolerance
+        h_overlap = c1.horizontally_overlaps?(c2, 0.5)
+        (h_adjacent && v_overlap) || (v_adjacent && h_overlap)
+      end
+      def valid_table_region?(region)
+        region.width >= MIN_DIMENSION && region.height >= MIN_DIMENSION
+      end
+    end
+  end
+end

data/lib/tabula/extractors/basic_extraction_algorithm.rb ADDED Viewed

@@ -0,0 +1,168 @@
+# frozen_string_literal: true
+module Tabula
+  module Extractors
+    # Stream-mode extraction algorithm.
+    # Extracts tables by analyzing text positions and gaps without relying on ruling lines.
+    class Basic < ExtractionAlgorithm
+      # @param columns [Array<Float>, nil] explicit column positions
+      # @param guess [Boolean] whether to guess column positions
+      def initialize(columns: nil, guess: true, **options)
+        super(**options)
+        @columns = columns
+        @guess = guess
+      end
+      # Extract tables from a page
+      # @param page [Page] page to extract from
+      # @return [Array<Table>]
+      def extract(page)
+        return [] if page.text_elements.empty?
+        # Get text chunks and lines
+        chunks = page.text_chunks
+        return [] if chunks.empty?
+        lines = TextChunk.group_by_lines(chunks)
+        return [] if lines.empty?
+        # Determine column positions
+        column_positions = determine_columns(lines, page)
+        # Build table
+        table = build_table(lines, column_positions, page.page_number)
+        table.empty? ? [] : [table]
+      end
+      private
+      def determine_columns(lines, page)
+        if @columns
+          # Use explicit columns
+          @columns.sort
+        elsif page.vertical_rulings.any?
+          # Use vertical ruling positions
+          page.vertical_rulings.map(&:x1).sort.uniq
+        elsif @guess
+          # Guess columns from text gaps
+          guess_column_positions(lines)
+        else
+          # No column separators - single column
+          []
+        end
+      end
+      def guess_column_positions(lines)
+        return [] if lines.empty?
+        # Collect all gap positions from all lines
+        all_gaps = []
+        lines.each do |line|
+          gaps = line.gap_positions
+          all_gaps.concat(gaps)
+        end
+        return [] if all_gaps.empty?
+        # Cluster gaps that appear in multiple lines
+        clustered = cluster_positions(all_gaps, tolerance: 5.0)
+        # Only keep gaps that appear in at least 30% of lines
+        min_occurrences = (lines.size * 0.3).ceil
+        frequent = clustered.select { |_, count| count >= min_occurrences }
+        frequent.keys.sort
+      end
+      def cluster_positions(positions, tolerance:)
+        return {} if positions.empty?
+        sorted = positions.sort
+        clusters = {}
+        current_cluster = [sorted.first]
+        sorted[1..].each do |pos|
+          if (pos - current_cluster.last) <= tolerance
+            current_cluster << pos
+          else
+            avg = current_cluster.sum / current_cluster.size
+            clusters[avg] = current_cluster.size
+            current_cluster = [pos]
+          end
+        end
+        unless current_cluster.empty?
+          avg = current_cluster.sum / current_cluster.size
+          clusters[avg] = current_cluster.size
+        end
+        clusters
+      end
+      def build_table(lines, column_positions, page_number)
+        table = Table.new(extraction_method: name, page_number: page_number)
+        lines.each_with_index do |line, row_idx|
+          assign_chunks_to_columns(line, column_positions, table, row_idx)
+        end
+        table
+      end
+      def assign_chunks_to_columns(line, column_positions, table, row_idx)
+        if column_positions.empty?
+          # Single column
+          cell = create_cell_from_line(line)
+          table.add(row_idx, 0, cell)
+        else
+          # Multiple columns - assign chunks to appropriate columns
+          columns = split_line_by_columns(line, column_positions)
+          columns.each_with_index do |chunks, col_idx|
+            cell = create_cell_from_chunks(chunks)
+            table.add(row_idx, col_idx, cell)
+          end
+        end
+      end
+      def split_line_by_columns(line, column_positions)
+        # Create column boundaries
+        boundaries = [line.left, *column_positions, Float::INFINITY]
+        # Initialize columns
+        num_columns = boundaries.size - 1
+        columns = Array.new(num_columns) { [] }
+        # Assign each chunk to a column
+        line.sorted_chunks.each do |chunk|
+          chunk_center = chunk.left + (chunk.width / 2.0)
+          col_idx = find_column_index(chunk_center, boundaries)
+          columns[col_idx] << chunk
+        end
+        columns
+      end
+      def find_column_index(x, boundaries)
+        boundaries.each_cons(2).with_index do |(left, right), idx|
+          return idx if x >= left && x < right
+        end
+        boundaries.size - 2 # Last column
+      end
+      def create_cell_from_line(line)
+        cell = Cell.new(line.top, line.left, line.width, line.height)
+        line.chunks.each { |chunk| cell.add(chunk) }
+        cell
+      end
+      def create_cell_from_chunks(chunks)
+        return Cell.empty if chunks.empty?
+        bounds = Rectangle.bounding_box_of(chunks)
+        cell = Cell.new(bounds.top, bounds.left, bounds.width, bounds.height)
+        chunks.each { |chunk| cell.add(chunk) }
+        cell
+      end
+    end
+  end
+end

data/lib/tabula/extractors/extraction_algorithm.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+module Tabula
+  module Extractors
+    # Base class for table extraction algorithms
+    class ExtractionAlgorithm
+      # Extract tables from a page
+      # @param page [Page] page to extract from
+      # @param options [Hash] algorithm-specific options
+      # @return [Array<Table>] extracted tables
+      def self.extract(page, **options)
+        new(**options).extract(page)
+      end
+      # @param options [Hash] algorithm options
+      def initialize(**options)
+        @options = options
+      end
+      # Extract tables from a page
+      # @param page [Page] page to extract from
+      # @return [Array<Table>]
+      def extract(page)
+        raise NotImplementedError, 'Subclasses must implement #extract'
+      end
+      # Get algorithm name for table metadata
+      # @return [String]
+      def name
+        self.class.name.split('::').last
+      end
+    end
+  end
+end

data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb ADDED Viewed

@@ -0,0 +1,299 @@
+# frozen_string_literal: true
+module Tabula
+  module Extractors
+    # Lattice-mode extraction algorithm.
+    # Extracts tables by analyzing ruling lines (cell borders) in the PDF.
+    class Spreadsheet < ExtractionAlgorithm
+      # Minimum cells required for a valid table
+      MIN_CELLS = 4
+      # Magic heuristic for determining tabular content
+      TABULAR_RATIO_THRESHOLD = 0.65
+      # Extract tables from a page
+      # @param page [Page] page to extract from
+      # @return [Array<Table>]
+      def extract(page)
+        horizontal = page.horizontal_rulings
+        vertical = page.vertical_rulings
+        return [] if horizontal.empty? || vertical.empty?
+        # Find cells from ruling intersections
+        cells = find_cells(horizontal, vertical)
+        return [] if cells.size < MIN_CELLS
+        # Find spreadsheet regions from cells and get cells per region
+        cell_groups = find_spreadsheet_areas_with_cells(cells)
+        return [] if cell_groups.empty?
+        # Extract tables from each region using the found cells
+        tables = cell_groups.map do |region_cells|
+          extract_table_from_cells(page, region_cells, horizontal, vertical)
+        end
+        tables.reject(&:empty?)
+      end
+      # Check if a page contains tabular content
+      # @param page [Page] page to check
+      # @return [Boolean]
+      def self.tabular?(page)
+        extractor = new
+        tables = extractor.extract(page)
+        return false if tables.empty?
+        # Check if tables have reasonable structure
+        tables.any? do |table|
+          ratio = table.row_count.to_f / table.col_count
+          ratio.between?(TABULAR_RATIO_THRESHOLD, 1.0 / TABULAR_RATIO_THRESHOLD)
+        end
+      end
+      private
+      def find_cells(horizontal_rulings, vertical_rulings)
+        cells = []
+        tolerance = Tabula.configuration.cell_tolerance
+        # Find intersection points
+        intersections = build_intersection_map(horizontal_rulings, vertical_rulings)
+        return cells if intersections.empty?
+        # Get unique y positions from horizontal rulings (row boundaries)
+        y_positions = horizontal_rulings.map { |r| r.y1.round(1) }.uniq.sort
+        return cells if y_positions.size < 2
+        # Process each row individually to handle spanning cells
+        y_positions.each_cons(2) do |top, bottom|
+          # Find vertical rulings that span this row (intersect with row's Y range)
+          row_verticals = vertical_rulings.select do |v|
+            v.y1 <= top + tolerance && v.y2 >= bottom - tolerance
+          end
+          # Get unique X positions from vertical rulings only
+          x_positions = row_verticals.map { |v| v.x1.round(1) }.uniq.sort
+          next if x_positions.size < 2
+          # Create cells for this row
+          x_positions.each_cons(2) do |left, right|
+            # Verify this cell has valid edges
+            if valid_cell_by_edges?(left, right, top, bottom, horizontal_rulings, vertical_rulings, tolerance)
+              cells << Cell.new(top, left, right - left, bottom - top)
+            # Also accept cells with corner validation
+            elsif valid_cell_by_corners?(left, right, top, bottom, intersections, tolerance)
+              cells << Cell.new(top, left, right - left, bottom - top)
+            end
+          end
+        end
+        cells
+      end
+      def build_intersection_map(horizontal_rulings, vertical_rulings)
+        intersections = {}
+        horizontal_rulings.each do |h|
+          vertical_rulings.each do |v|
+            next unless h.intersects?(v)
+            point = h.intersection_point(v)
+            next unless point
+            # Round to avoid floating point issues
+            key = [point.x.round(1), point.y.round(1)]
+            intersections[key] = true
+          end
+        end
+        intersections
+      end
+      def valid_cell_by_corners?(left, right, top, bottom, intersections, tolerance)
+        corners = [
+          [left, top],
+          [right, top],
+          [left, bottom],
+          [right, bottom]
+        ]
+        corners.all? do |x, y|
+          intersections.keys.any? do |ix, iy|
+            (x - ix).abs <= tolerance && (y - iy).abs <= tolerance
+          end
+        end
+      end
+      # Check if there are rulings that form the edges of a potential cell
+      def valid_cell_by_edges?(left, right, top, bottom, horizontal_rulings, vertical_rulings, tolerance)
+        # Check for top edge (horizontal ruling at top that covers left to right)
+        has_top = horizontal_rulings.any? do |h|
+          (h.y1 - top).abs <= tolerance &&
+            h.x1 <= left + tolerance &&
+            h.x2 >= right - tolerance
+        end
+        # Check for bottom edge
+        has_bottom = horizontal_rulings.any? do |h|
+          (h.y1 - bottom).abs <= tolerance &&
+            h.x1 <= left + tolerance &&
+            h.x2 >= right - tolerance
+        end
+        # Check for left edge (vertical ruling at left that covers top to bottom)
+        has_left = vertical_rulings.any? do |v|
+          (v.x1 - left).abs <= tolerance &&
+            v.y1 <= top + tolerance &&
+            v.y2 >= bottom - tolerance
+        end
+        # Check for right edge
+        has_right = vertical_rulings.any? do |v|
+          (v.x1 - right).abs <= tolerance &&
+            v.y1 <= top + tolerance &&
+            v.y2 >= bottom - tolerance
+        end
+        has_top && has_bottom && has_left && has_right
+      end
+      def find_spreadsheet_areas(cells)
+        find_spreadsheet_areas_with_cells(cells).map do |region_cells|
+          Rectangle.bounding_box_of(region_cells)
+        end
+      end
+      def find_spreadsheet_areas_with_cells(cells)
+        return [] if cells.empty?
+        # Group adjacent cells into regions
+        cell_groups = []
+        remaining = cells.dup
+        until remaining.empty?
+          seed = remaining.shift
+          region = [seed]
+          loop do
+            adjacent = remaining.select { |c| adjacent?(region, c) }
+            break if adjacent.empty?
+            region.concat(adjacent)
+            remaining -= adjacent
+          end
+          # Filter out small regions
+          bbox = Rectangle.bounding_box_of(region)
+          cell_groups << region if bbox.area.positive?
+        end
+        cell_groups
+      end
+      def adjacent?(region, cell)
+        region.any? { |r| cells_adjacent?(r, cell) }
+      end
+      def cells_adjacent?(c1, c2)
+        # Cells are adjacent if they share an edge
+        tolerance = 2.0
+        # Horizontal adjacency (share vertical edge)
+        horizontal = (c1.right - c2.left).abs <= tolerance || (c2.right - c1.left).abs <= tolerance
+        vertical_overlap = c1.vertically_overlaps?(c2, 0.5)
+        # Vertical adjacency (share horizontal edge)
+        vertical = (c1.bottom - c2.top).abs <= tolerance || (c2.bottom - c1.top).abs <= tolerance
+        horizontal_overlap = c1.horizontally_overlaps?(c2, 0.5)
+        (horizontal && vertical_overlap) || (vertical && horizontal_overlap)
+      end
+      def extract_table_from_cells(page, cells, horizontal_rulings, vertical_rulings)
+        return Table.new if cells.empty?
+        # Get area bounds from cells
+        area = Rectangle.bounding_box_of(cells)
+        # Get rulings within the area
+        h_rulings = horizontal_rulings.select { |r| ruling_in_area?(r, area) }
+        v_rulings = vertical_rulings.select { |r| ruling_in_area?(r, area) }
+        # Build table
+        table = Table::WithRulingLines.new(
+          horizontal_rulings: h_rulings,
+          vertical_rulings: v_rulings,
+          extraction_method: name,
+          page_number: page.page_number
+        )
+        # Organize cells into grid positions
+        # Get unique y positions (rows) and sort cells by position
+        y_positions = cells.map { |c| c.top.round(1) }.uniq.sort
+        y_to_row = y_positions.each_with_index.to_h
+        cells.each do |cell|
+          row_idx = y_to_row[cell.top.round(1)]
+          next unless row_idx
+          # Find column index based on x position within this row
+          row_cells = cells.select { |c| (c.top - cell.top).abs < 2 }.sort_by(&:left)
+          col_idx = row_cells.index(cell) || 0
+          # Populate cell with text elements
+          cell_area = Rectangle.from_bounds(cell.top, cell.left, cell.bottom, cell.right)
+          text_elements = page.get_text(cell_area)
+          cell.add_all(text_elements)
+          table.add(row_idx, col_idx, cell)
+        end
+        table
+      end
+      def extract_table_from_area(page, area, horizontal_rulings, vertical_rulings)
+        # Get rulings within the area
+        h_rulings = horizontal_rulings.select { |r| ruling_in_area?(r, area) }
+        v_rulings = vertical_rulings.select { |r| ruling_in_area?(r, area) }
+        # Get unique positions for grid
+        y_positions = h_rulings.map(&:y1).uniq.sort
+        x_positions = v_rulings.map(&:x1).uniq.sort
+        return Table.new if y_positions.size < 2 || x_positions.size < 2
+        # Build table
+        table = Table::WithRulingLines.new(
+          horizontal_rulings: h_rulings,
+          vertical_rulings: v_rulings,
+          extraction_method: name,
+          page_number: page.page_number
+        )
+        # Create cells and populate with text
+        y_positions.each_cons(2).with_index do |(top, bottom), row_idx|
+          x_positions.each_cons(2).with_index do |(left, right), col_idx|
+            cell = Cell.new(top, left, right - left, bottom - top)
+            # Find text elements in this cell
+            cell_area = Rectangle.from_bounds(top, left, bottom, right)
+            text_elements = page.get_text(cell_area)
+            cell.add_all(text_elements)
+            table.add(row_idx, col_idx, cell)
+          end
+        end
+        table
+      end
+      def ruling_in_area?(ruling, area)
+        ruling_rect = Rectangle.from_bounds(ruling.top, ruling.left, ruling.bottom, ruling.right)
+        area.intersects?(ruling_rect)
+      end
+    end
+  end
+end