RubyGems - tabula-rb - Versions diffs - 1.0.0 - Mend

tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +39 -0
data/CHANGELOG.md +59 -0
data/LICENSE +21 -0
data/README.md +176 -0
data/Rakefile +28 -0
data/exe/tabula +7 -0
data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
data/lib/tabula/algorithms/projection_profile.rb +109 -0
data/lib/tabula/cli.rb +271 -0
data/lib/tabula/configuration.rb +119 -0
data/lib/tabula/core/point.rb +60 -0
data/lib/tabula/core/rectangle.rb +218 -0
data/lib/tabula/core/ruling.rb +303 -0
data/lib/tabula/core/spatial_index.rb +120 -0
data/lib/tabula/detectors/detection_algorithm.rb +34 -0
data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
data/lib/tabula/pdf/object_extractor.rb +400 -0
data/lib/tabula/pdf/page.rb +230 -0
data/lib/tabula/pdf/text_stripper.rb +150 -0
data/lib/tabula/table/cell.rb +110 -0
data/lib/tabula/table/table.rb +184 -0
data/lib/tabula/text/line.rb +133 -0
data/lib/tabula/text/text_chunk.rb +185 -0
data/lib/tabula/text/text_element.rb +120 -0
data/lib/tabula/version.rb +5 -0
data/lib/tabula/writers/csv_writer.rb +49 -0
data/lib/tabula/writers/json_writer.rb +41 -0
data/lib/tabula/writers/markdown_writer.rb +71 -0
data/lib/tabula/writers/tsv_writer.rb +35 -0
data/lib/tabula/writers/writer.rb +39 -0
data/lib/tabula.rb +160 -0
data/mise.toml +2 -0
data/tabula-rb.gemspec +44 -0
metadata +115 -0

data/lib/tabula/pdf/text_stripper.rb ADDED Viewed

@@ -0,0 +1,150 @@
+# frozen_string_literal: true
+require 'pdf-reader'
+module Tabula
+  # Extracts text elements from PDF pages using pdf-reader.
+  # Uses pdf-reader's PageTextReceiver for proper font encoding and CMap handling.
+  class TextStripper
+    # @param page [PDF::Reader::Page] pdf-reader page object
+    def initialize(page)
+      @page = page
+      @text_elements = []
+      @min_char_width = Float::INFINITY
+      @min_char_height = Float::INFINITY
+    end
+    # Extract text elements from the page
+    # @return [Array<TextElement>] extracted text elements
+    def extract
+      # Use pdf-reader's PageTextReceiver for proper font encoding
+      receiver = PDF::Reader::PageTextReceiver.new
+      receiver.page = @page
+      @page.walk(receiver)
+      # Get merged text runs for readable output
+      # merge: true combines adjacent characters into words/phrases
+      runs = receiver.runs(
+        merge: true,
+        skip_zero_width: true,
+        skip_overlapping: true
+      )
+      # Get page dimensions and rotation
+      rotation = @page.attributes[:Rotate] || 0
+      runs.each do |run|
+        next if run.text.nil? || run.text.empty?
+        next unless printable?(run.text)
+        # pdf-reader already applies rotation transformation
+        # For rotated pages, y coordinates are negative
+        # For non-rotated pages, we need to flip from bottom-origin to top-origin
+        if [90, 270].include?(rotation)
+          # Rotated pages: y is negative, convert to positive
+          top = run.y.abs
+        else
+          # Non-rotated pages: convert from bottom-origin to top-origin
+          page_height = calculate_page_height
+          top = page_height - run.y
+        end
+        left = run.x
+        width = run.width
+        height = run.font_size
+        # Detect text direction from Unicode character properties
+        direction = rtl_text?(run.text) ? TextElement::DIRECTION_RTL : TextElement::DIRECTION_LTR
+        element = TextElement.new(
+          top: top,
+          left: left,
+          width: width,
+          height: height,
+          text: run.text,
+          font_name: nil, # pdf-reader doesn't expose font name in runs
+          font_size: run.font_size,
+          width_of_space: estimate_space_width(run),
+          direction: direction
+        )
+        @text_elements << element
+        @min_char_width = [@min_char_width, width].min if width.positive?
+        @min_char_height = [@min_char_height, height].min if height.positive?
+      end
+      @text_elements
+    end
+    attr_reader :min_char_width, :min_char_height
+    private
+    def calculate_page_height
+      box = @page.attributes[:CropBox] || @page.attributes[:MediaBox]
+      (box[3].to_f - box[1].to_f).abs
+    end
+    # Check if character is printable (port of Java's isPrintable)
+    def printable?(text)
+      return false if text.nil? || text.empty?
+      text.each_char do |char|
+        code = char.ord
+        # Filter control characters except space, tab, newline
+        return false if code < 0x20 && code != 0x09 && code != 0x0A && code != 0x0D
+        # Filter delete character
+        return false if code == 0x7F
+        # Filter Unicode replacement character
+        return false if code == 0xFFFD
+        # Filter null character
+        return false if code.zero?
+      end
+      true
+    end
+    # Estimate width of space character based on font size
+    def estimate_space_width(run)
+      # Approximate space width as 0.25 of font size (common for proportional fonts)
+      run.font_size * 0.25
+    end
+    # Detect if text contains RTL (right-to-left) characters
+    # Uses Unicode ranges for Arabic, Hebrew, and other RTL scripts
+    def rtl_text?(text)
+      return false if text.nil? || text.empty?
+      text.each_char do |char|
+        code = char.ord
+        # Arabic (0600-06FF, 0750-077F, 08A0-08FF, FB50-FDFF, FE70-FEFF)
+        return true if code.between?(0x0600, 0x06FF)
+        return true if code.between?(0x0750, 0x077F)
+        return true if code.between?(0x08A0, 0x08FF)
+        return true if code.between?(0xFB50, 0xFDFF)
+        return true if code.between?(0xFE70, 0xFEFF)
+        # Hebrew (0590-05FF, FB1D-FB4F)
+        return true if code.between?(0x0590, 0x05FF)
+        return true if code.between?(0xFB1D, 0xFB4F)
+        # Syriac (0700-074F)
+        return true if code.between?(0x0700, 0x074F)
+        # Thaana (0780-07BF)
+        return true if code.between?(0x0780, 0x07BF)
+        # N'Ko (07C0-07FF)
+        return true if code.between?(0x07C0, 0x07FF)
+      end
+      false
+    end
+  end
+end

data/lib/tabula/table/cell.rb ADDED Viewed

@@ -0,0 +1,110 @@
+# frozen_string_literal: true
+module Tabula
+  # Represents a cell in a table.
+  # Contains text content and positional information.
+  class Cell < Rectangle
+    attr_reader :text_elements
+    attr_accessor :placeholder
+    # @param top [Float] top coordinate
+    # @param left [Float] left coordinate
+    # @param width [Float] cell width
+    # @param height [Float] cell height
+    # @param placeholder [Boolean] whether this is a placeholder cell
+    def initialize(top, left, width, height, placeholder: false)
+      super(top, left, width, height)
+      @text_elements = []
+      @placeholder = placeholder
+    end
+    # Create a cell from a rectangle
+    # @param rect [Rectangle] rectangle to convert
+    # @return [Cell]
+    def self.from_rectangle(rect)
+      new(rect.top, rect.left, rect.width, rect.height)
+    end
+    # Create an empty placeholder cell
+    # @return [Cell]
+    def self.empty
+      new(0, 0, 0, 0, placeholder: true)
+    end
+    # Add a text element to this cell
+    # @param element [TextElement, TextChunk] text to add
+    def add(element)
+      @text_elements << element
+      self
+    end
+    # Add multiple text elements
+    # @param elements [Array<TextElement, TextChunk>] elements to add
+    def add_all(elements)
+      elements.each { |e| add(e) }
+      self
+    end
+    # Get cell text content
+    # @param separator [String] separator between text elements
+    # @return [String]
+    def text(separator: ' ')
+      sorted = @text_elements.sort_by { |e| [e.top, e.left] }
+      sorted.map do |e|
+        e.respond_to?(:text) ? e.text : e.to_s
+      end.join(separator).strip
+    end
+    # Check if cell has any text
+    # @return [Boolean]
+    def has_text?
+      @text_elements.any?
+    end
+    # Check if cell is empty (no text elements)
+    # @return [Boolean]
+    def empty?
+      @text_elements.empty?
+    end
+    # Check if cell is blank (empty or contains only whitespace)
+    # @return [Boolean]
+    def blank?
+      return true if @text_elements.empty?
+      # Check if all text content is just whitespace
+      text.strip.empty?
+    end
+    # Check if this is a placeholder cell
+    # @return [Boolean]
+    def placeholder?
+      @placeholder
+    end
+    # Check if this cell spans multiple rows/columns (stub for future use)
+    # @return [Boolean]
+    def spanning?
+      false
+    end
+    def to_s
+      "Cell[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
+    end
+    def inspect
+      to_s
+    end
+    def ==(other)
+      return false unless other.is_a?(Cell)
+      super && text == other.text
+    end
+    alias eql? ==
+    def hash
+      [super, text].hash
+    end
+  end
+end

data/lib/tabula/table/table.rb ADDED Viewed

@@ -0,0 +1,184 @@
+# frozen_string_literal: true
+module Tabula
+  # Represents an extracted table with rows and cells.
+  # Provides methods for accessing table data and converting to various formats.
+  class Table < Rectangle
+    attr_reader :extraction_method, :page_number
+    # @param extraction_method [String] method used for extraction
+    # @param page_number [Integer] page number where table was found
+    def initialize(extraction_method: 'unknown', page_number: nil)
+      super(0, 0, 0, 0)
+      @extraction_method = extraction_method
+      @page_number = page_number
+      @cells = {} # { [row, col] => Cell }
+      @row_count = 0
+      @col_count = 0
+      @memoized_rows = nil
+    end
+    # Add a cell at a specific position
+    # @param row [Integer] row index (0-based)
+    # @param col [Integer] column index (0-based)
+    # @param cell [Cell] cell to add
+    def add(row, col, cell)
+      @cells[[row, col]] = cell
+      @row_count = [row + 1, @row_count].max
+      @col_count = [col + 1, @col_count].max
+      @memoized_rows = nil # Invalidate cache
+      # Update bounds
+      if @cells.size == 1
+        @top = cell.top
+        @left = cell.left
+        @width = cell.width
+        @height = cell.height
+      else
+        merge!(cell)
+      end
+      self
+    end
+    # Get a cell at a specific position
+    # @param row [Integer] row index
+    # @param col [Integer] column index
+    # @return [Cell] cell at position, or empty cell if none
+    def get_cell(row, col)
+      @cells[[row, col]] || Cell.empty
+    end
+    alias [] get_cell
+    # Get number of rows
+    # @return [Integer]
+    attr_reader :row_count
+    # Get number of columns
+    # @return [Integer]
+    attr_reader :col_count
+    # Get all rows as 2D array
+    # @return [Array<Array<Cell>>]
+    def rows
+      @rows ||= compute_rows
+    end
+    # Get all cells as flat array
+    # @return [Array<Cell>]
+    def cells
+      @cells.values
+    end
+    # Get a specific row
+    # @param index [Integer] row index
+    # @return [Array<Cell>]
+    def row(index)
+      rows[index] || []
+    end
+    # Get a specific column
+    # @param index [Integer] column index
+    # @return [Array<Cell>]
+    def column(index)
+      rows.map { |r| r[index] || Cell.empty }
+    end
+    # Convert to 2D array of strings
+    # @return [Array<Array<String>>]
+    def to_a
+      rows.map { |row| row.map(&:text) }
+    end
+    # Convert to CSV string
+    # @param options [Hash] options for CSV generation
+    # @return [String]
+    def to_csv(**options)
+      require 'csv'
+      CSV.generate(**options) do |csv|
+        to_a.each { |row| csv << row }
+      end
+    end
+    # Convert to TSV string
+    # @return [String]
+    def to_tsv
+      to_a.map { |row| row.join("\t") }.join("\n")
+    end
+    # Convert to hash (for JSON serialization)
+    # @return [Hash]
+    def to_h
+      {
+        extraction_method: @extraction_method,
+        page_number: @page_number,
+        top: top,
+        left: left,
+        width: width,
+        height: height,
+        data: to_a
+      }
+    end
+    # Convert to JSON string
+    # @return [String]
+    def to_json(*args)
+      require 'json'
+      to_h.to_json(*args)
+    end
+    # Check if table is empty
+    # @return [Boolean]
+    def empty?
+      @cells.empty?
+    end
+    # Iterate over rows
+    # @yield [Array<Cell>] each row
+    def each_row(&)
+      rows.each(&)
+    end
+    # Iterate over cells
+    # @yield [Integer, Integer, Cell] row, col, cell
+    def each_cell
+      rows.each_with_index do |row, row_idx|
+        row.each_with_index do |cell, col_idx|
+          yield row_idx, col_idx, cell
+        end
+      end
+    end
+    def to_s
+      "Table[#{row_count}x#{col_count}](#{left}, #{top}, #{width}, #{height})"
+    end
+    def inspect
+      to_s
+    end
+    private
+    def compute_rows
+      result = Array.new(@row_count) { Array.new(@col_count) { Cell.empty } }
+      @cells.each do |(row, col), cell|
+        result[row][col] = cell
+      end
+      result
+    end
+    # Table with ruling lines - extends Table with ruling information
+    class WithRulingLines < Table
+      attr_reader :horizontal_rulings, :vertical_rulings
+      def initialize(horizontal_rulings: [], vertical_rulings: [], **kwargs)
+        super(**kwargs)
+        @horizontal_rulings = horizontal_rulings
+        @vertical_rulings = vertical_rulings
+      end
+    end
+  end
+end

data/lib/tabula/text/line.rb ADDED Viewed

@@ -0,0 +1,133 @@
+# frozen_string_literal: true
+module Tabula
+  # Represents a line of text (a row of text chunks).
+  # Used for grouping text elements that share the same vertical position.
+  class Line < Rectangle
+    attr_reader :chunks
+    def initialize
+      super(0, 0, 0, 0)
+      @chunks = []
+      @initialized = false
+    end
+    # Add a text chunk to this line
+    # @param chunk [TextChunk] chunk to add
+    def add_chunk(chunk)
+      @chunks << chunk
+      if @initialized
+        merge!(chunk)
+      else
+        @top = chunk.top
+        @left = chunk.left
+        @width = chunk.width
+        @height = chunk.height
+        @initialized = true
+      end
+      self
+    end
+    # Get chunks sorted by horizontal position
+    # Respects RTL text direction when most chunks are RTL
+    # @return [Array<TextChunk>] sorted chunks
+    def sorted_chunks
+      if rtl_dominant?
+        @chunks.sort_by(&:left).reverse
+      else
+        @chunks.sort_by(&:left)
+      end
+    end
+    # Check if this line is LTR dominant
+    def ltr_dominant?
+      return true if @chunks.empty?
+      ltr_count = @chunks.count(&:ltr_dominant?)
+      rtl_count = @chunks.count(&:rtl_dominant?)
+      ltr_count >= rtl_count
+    end
+    # Check if this line is RTL dominant
+    def rtl_dominant?
+      !ltr_dominant?
+    end
+    # Get the combined text of all chunks
+    # @param separator [String] separator between chunks
+    # @return [String]
+    def text(separator: ' ')
+      sorted_chunks.map(&:text).join(separator)
+    end
+    # Get text elements from all chunks
+    # @return [Array<TextElement>]
+    def text_elements
+      @chunks.flat_map(&:elements)
+    end
+    # Average character width in this line
+    # @return [Float]
+    def average_char_width
+      elements = text_elements
+      return 0.0 if elements.empty?
+      total_width = elements.sum(&:width)
+      total_width / elements.size
+    end
+    # Check if a position falls within a gap between chunks
+    # @param x [Float] horizontal position
+    # @param min_gap [Float] minimum gap size
+    # @return [Boolean]
+    def in_gap?(x, min_gap: nil)
+      min_gap ||= average_char_width * 0.5
+      sorted = sorted_chunks
+      sorted.each_cons(2) do |left_chunk, right_chunk|
+        gap_start = left_chunk.right
+        gap_end = right_chunk.left
+        gap_size = gap_end - gap_start
+        return true if x.between?(gap_start, gap_end) && gap_size >= min_gap
+      end
+      false
+    end
+    # Find gap positions between chunks
+    # @param min_gap [Float] minimum gap size
+    # @return [Array<Float>] gap center positions
+    def gap_positions(min_gap: nil)
+      min_gap ||= average_char_width * 2
+      gaps = []
+      sorted = sorted_chunks
+      sorted.each_cons(2) do |left_chunk, right_chunk|
+        gap_start = left_chunk.right
+        gap_end = right_chunk.left
+        gap_size = gap_end - gap_start
+        gaps << ((gap_start + gap_end) / 2.0) if gap_size >= min_gap
+      end
+      gaps
+    end
+    def empty?
+      @chunks.empty?
+    end
+    def size
+      @chunks.size
+    end
+    def to_s
+      "Line[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
+    end
+    def inspect
+      to_s
+    end
+  end
+end