RubyGems - tabula-rb - Versions diffs - 1.0.0 - Mend

tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +39 -0
data/CHANGELOG.md +59 -0
data/LICENSE +21 -0
data/README.md +176 -0
data/Rakefile +28 -0
data/exe/tabula +7 -0
data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
data/lib/tabula/algorithms/projection_profile.rb +109 -0
data/lib/tabula/cli.rb +271 -0
data/lib/tabula/configuration.rb +119 -0
data/lib/tabula/core/point.rb +60 -0
data/lib/tabula/core/rectangle.rb +218 -0
data/lib/tabula/core/ruling.rb +303 -0
data/lib/tabula/core/spatial_index.rb +120 -0
data/lib/tabula/detectors/detection_algorithm.rb +34 -0
data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
data/lib/tabula/pdf/object_extractor.rb +400 -0
data/lib/tabula/pdf/page.rb +230 -0
data/lib/tabula/pdf/text_stripper.rb +150 -0
data/lib/tabula/table/cell.rb +110 -0
data/lib/tabula/table/table.rb +184 -0
data/lib/tabula/text/line.rb +133 -0
data/lib/tabula/text/text_chunk.rb +185 -0
data/lib/tabula/text/text_element.rb +120 -0
data/lib/tabula/version.rb +5 -0
data/lib/tabula/writers/csv_writer.rb +49 -0
data/lib/tabula/writers/json_writer.rb +41 -0
data/lib/tabula/writers/markdown_writer.rb +71 -0
data/lib/tabula/writers/tsv_writer.rb +35 -0
data/lib/tabula/writers/writer.rb +39 -0
data/lib/tabula.rb +160 -0
data/mise.toml +2 -0
data/tabula-rb.gemspec +44 -0
metadata +115 -0

data/lib/tabula/text/text_chunk.rb ADDED Viewed

@@ -0,0 +1,185 @@
+# frozen_string_literal: true
+module Tabula
+  # Represents a group of text elements (typically a word or phrase).
+  # Extends Rectangle to provide bounding box functionality.
+  class TextChunk < Rectangle
+    attr_reader :elements
+    # @param element_or_rect [TextElement, Rectangle] initial element or bounds
+    def initialize(element_or_rect = nil)
+      if element_or_rect.is_a?(TextElement)
+        super(element_or_rect.top, element_or_rect.left, element_or_rect.width, element_or_rect.height)
+        @elements = [element_or_rect]
+      elsif element_or_rect.is_a?(Rectangle)
+        super(element_or_rect.top, element_or_rect.left, element_or_rect.width, element_or_rect.height)
+        @elements = []
+      elsif element_or_rect.nil?
+        super(0, 0, 0, 0)
+        @elements = []
+      else
+        raise ArgumentError, 'Expected TextElement, Rectangle, or nil'
+      end
+    end
+    # Add a text element to this chunk
+    # @param element [TextElement] element to add
+    def add(element)
+      @elements << element
+      merge!(element)
+      self
+    end
+    # Add multiple elements
+    # @param elements [Array<TextElement>] elements to add
+    def add_all(elements)
+      elements.each { |e| add(e) }
+      self
+    end
+    # Get the combined text content
+    # @param normalize [Boolean] whether to normalize whitespace
+    # @return [String] the text content
+    def text(normalize: true)
+      # Sort elements based on text direction
+      sorted = if ltr_dominant?
+                 @elements.sort_by(&:left)
+               else
+                 @elements.sort_by(&:left).reverse
+               end
+      raw = sorted.map(&:text).join
+      normalize ? raw.gsub(/\s+/, ' ').strip : raw
+    end
+    # Check if this chunk is RTL dominant
+    def rtl_dominant?
+      !ltr_dominant?
+    end
+    # Get width of space character for this chunk
+    def width_of_space
+      @elements.map(&:width_of_space).compact.first
+    end
+    # Get font name
+    def font_name
+      @elements.first&.font_name
+    end
+    # Get font size
+    def font_size
+      @elements.first&.font_size
+    end
+    # Check if this chunk contains only a single repeated character
+    # @param chars [Array<String>] characters to check for
+    # @return [Boolean]
+    def same_char?(chars)
+      return false if @elements.empty?
+      @elements.all? { |e| chars.include?(e.text) }
+    end
+    # Remove runs of identical characters
+    # @param char [String] character to squeeze
+    # @param min_run [Integer] minimum run length to squeeze
+    # @return [TextChunk] new chunk with squeezed text
+    def squeeze(char, min_run: 3)
+      return self if @elements.size < min_run
+      new_chunk = TextChunk.new(Rectangle.new(top, left, width, height))
+      run_count = 0
+      @elements.each do |element|
+        if element.text == char
+          run_count += 1
+          new_chunk.add(element) if run_count <= 1
+        else
+          run_count = 0
+          new_chunk.add(element)
+        end
+      end
+      new_chunk
+    end
+    # Check if LTR text is dominant in this chunk
+    def ltr_dominant?
+      ltr_count = @elements.count(&:ltr?)
+      rtl_count = @elements.count(&:rtl?)
+      ltr_count >= rtl_count
+    end
+    # Split this chunk at an index
+    # @param index [Integer] element index to split at
+    # @return [Array<TextChunk>] two chunks, before and after the split
+    def split_at(index)
+      return [dup, TextChunk.new] if index >= @elements.size
+      return [TextChunk.new, dup] if index <= 0
+      left_chunk = TextChunk.new
+      right_chunk = TextChunk.new
+      @elements[0...index].each { |e| left_chunk.add(e) }
+      @elements[index..].each { |e| right_chunk.add(e) }
+      [left_chunk, right_chunk]
+    end
+    # Merge with another chunk
+    # @param other [TextChunk] chunk to merge
+    # @return [TextChunk] self
+    def merge_chunk(other)
+      other.elements.each { |e| add(e) }
+      self
+    end
+    def to_s
+      "TextChunk[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
+    end
+    def inspect
+      to_s
+    end
+    def empty?
+      @elements.empty?
+    end
+    def size
+      @elements.size
+    end
+    class << self
+      # Check if all chunks contain the same repeated character
+      # @param chunks [Array<TextChunk>] chunks to check
+      # @param chars [Array<String>] characters to check for
+      # @return [Boolean]
+      def all_same_char?(chunks, chars)
+        chunks.all? { |c| c.same_char?(chars) }
+      end
+      # Group text chunks into lines
+      # @param chunks [Array<TextChunk>] chunks to group
+      # @return [Array<Line>] lines of text
+      def group_by_lines(chunks)
+        return [] if chunks.empty?
+        sorted = chunks.sort_by { |c| [c.top, c.left] }
+        lines = []
+        current_line = Line.new
+        sorted.each do |chunk|
+          unless current_line.empty? || current_line.vertically_overlaps?(chunk)
+            lines << current_line
+            current_line = Line.new
+          end
+          current_line.add_chunk(chunk)
+        end
+        lines << current_line unless current_line.empty?
+        lines
+      end
+    end
+  end
+end

data/lib/tabula/text/text_element.rb ADDED Viewed

@@ -0,0 +1,120 @@
+# frozen_string_literal: true
+module Tabula
+  # Represents a single text element (character or glyph) extracted from a PDF.
+  # Contains position, dimensions, and font information.
+  class TextElement < Rectangle
+    # Text direction constants
+    DIRECTION_LTR = 0
+    DIRECTION_RTL = 1
+    attr_reader :text, :font_name, :font_size, :width_of_space, :direction
+    # @param top [Float] top coordinate
+    # @param left [Float] left coordinate
+    # @param width [Float] width
+    # @param height [Float] height
+    # @param text [String] the text content
+    # @param font_name [String] name of the font
+    # @param font_size [Float] font size in points
+    # @param width_of_space [Float] width of space character in this font
+    # @param direction [Integer] text direction (LTR or RTL)
+    def initialize(top:, left:, width:, height:, text:, font_name: nil, font_size: nil,
+                   width_of_space: nil, direction: DIRECTION_LTR)
+      super(top, left, width, height)
+      @text = text
+      @font_name = font_name
+      @font_size = font_size&.to_f
+      @width_of_space = width_of_space&.to_f
+      @direction = direction
+    end
+    def ltr?
+      direction == DIRECTION_LTR
+    end
+    def rtl?
+      direction == DIRECTION_RTL
+    end
+    # Check if this element is whitespace
+    def whitespace?
+      text.nil? || text.strip.empty?
+    end
+    def to_s
+      "TextElement[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
+    end
+    def inspect
+      to_s
+    end
+    def ==(other)
+      return false unless other.is_a?(TextElement)
+      super && text == other.text && font_name == other.font_name &&
+        font_size == other.font_size
+    end
+    alias eql? ==
+    def hash
+      [super, text, font_name, font_size].hash
+    end
+    class << self
+      # Merge text elements into text chunks (words)
+      # @param elements [Array<TextElement>] text elements to merge
+      # @param vertical_rulings [Array<Ruling>] vertical rulings that act as word separators
+      # @return [Array<TextChunk>] merged text chunks
+      def merge_words(elements, vertical_rulings: [])
+        return [] if elements.empty?
+        chunks = []
+        current_chunk = nil
+        # Sort by top first, then by left (RTL sorting handled in text assembly)
+        sorted = elements.reject(&:whitespace?).sort_by { |e| [e.top, e.left] }
+        sorted.each do |element|
+          if current_chunk.nil?
+            current_chunk = TextChunk.new(element)
+          elsif should_merge?(current_chunk, element, vertical_rulings)
+            current_chunk.add(element)
+          else
+            chunks << current_chunk
+            current_chunk = TextChunk.new(element)
+          end
+        end
+        chunks << current_chunk if current_chunk
+        chunks
+      end
+      private
+      def should_merge?(chunk, element, vertical_rulings)
+        return false unless chunk.vertically_overlaps?(element)
+        # Check if there's a vertical ruling between them
+        return false if vertical_rulings.any? { |r| ruling_between?(chunk, element, r) }
+        # Check horizontal gap
+        gap = element.left - chunk.right
+        max_gap = [chunk.width_of_space || chunk.width, element.width_of_space || element.width].compact.max
+        max_gap ||= element.width
+        gap <= max_gap * 0.5
+      end
+      def ruling_between?(chunk, element, ruling)
+        return false unless ruling.vertical?
+        ruling_x = ruling.x1
+        ruling_x > chunk.right && ruling_x < element.left &&
+          ruling.top <= [chunk.top, element.top].min &&
+          ruling.bottom >= [chunk.bottom, element.bottom].max
+      end
+    end
+  end
+end

data/lib/tabula/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Tabula
+  # Version string of this release of the library.
+  VERSION = '1.0.0'
+end

data/lib/tabula/writers/csv_writer.rb ADDED Viewed

@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+require 'csv'
+module Tabula
+  module Writers
+    # Writes tables in CSV format
+    class CSVWriter < Writer
+      # @param separator [String] field separator (default: comma)
+      # @param quote_char [String] quote character (default: double quote)
+      # @param force_quotes [Boolean] always quote fields (default: false)
+      def initialize(separator: ',', quote_char: '"', force_quotes: false, **options)
+        super(**options)
+        @separator = separator
+        @quote_char = quote_char
+        @force_quotes = force_quotes
+      end
+      # Write tables to an IO object
+      # @param tables [Array<Table>] tables to write
+      # @param io [IO] output destination
+      def write(tables, io)
+        csv_options = {
+          col_sep: @separator,
+          quote_char: @quote_char,
+          force_quotes: @force_quotes
+        }
+        tables.each_with_index do |table, idx|
+          # Add blank line between tables
+          io.puts if idx.positive?
+          csv = CSV.new(io, **csv_options)
+          table.to_a.each { |row| csv << row }
+        end
+      end
+      # Write tables to a string
+      # @param tables [Array<Table>] tables to write
+      # @return [String] CSV formatted output
+      def self.to_string(tables, **options)
+        require 'stringio'
+        io = StringIO.new
+        new(**options).write(tables, io)
+        io.string
+      end
+    end
+  end
+end

data/lib/tabula/writers/json_writer.rb ADDED Viewed

@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+require 'json'
+module Tabula
+  module Writers
+    # Writes tables in JSON format
+    class JSONWriter < Writer
+      # @param pretty [Boolean] pretty-print JSON (default: false)
+      # @param include_metadata [Boolean] include table metadata (default: true)
+      def initialize(pretty: false, include_metadata: true, **options)
+        super(**options)
+        @pretty = pretty
+        @include_metadata = include_metadata
+      end
+      # Write tables to an IO object
+      # @param tables [Array<Table>] tables to write
+      # @param io [IO] output destination
+      def write(tables, io)
+        output = tables.map { |table| table_to_hash(table) }
+        if @pretty
+          io.puts JSON.pretty_generate(output)
+        else
+          io.puts JSON.generate(output)
+        end
+      end
+      private
+      def table_to_hash(table)
+        if @include_metadata
+          table.to_h
+        else
+          { data: table.to_a }
+        end
+      end
+    end
+  end
+end

data/lib/tabula/writers/markdown_writer.rb ADDED Viewed

@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+module Tabula
+  module Writers
+    # Writes tables in Markdown format (GitHub-flavored)
+    class MarkdownWriter < Writer
+      # @param alignment [Symbol] column alignment (:left, :center, :right, or nil for default)
+      def initialize(alignment: nil, **options)
+        super(**options)
+        @alignment = alignment
+      end
+      # Write tables to an IO object
+      # @param tables [Array<Table>] tables to write
+      # @param io [IO] output destination
+      def write(tables, io)
+        tables.each_with_index do |table, idx|
+          # Add blank line between tables
+          io.puts if idx.positive?
+          rows = table.to_a
+          next if rows.empty?
+          col_count = rows.map(&:size).max || 0
+          next if col_count.zero?
+          # Write header row (first row)
+          write_row(io, rows.first, col_count)
+          # Write separator row
+          write_separator(io, col_count)
+          # Write data rows
+          rows.drop(1).each do |row|
+            write_row(io, row, col_count)
+          end
+        end
+      end
+      private
+      def write_row(io, row, col_count)
+        cells = (0...col_count).map do |i|
+          escape_markdown(row[i].to_s)
+        end
+        io.puts "| #{cells.join(' | ')} |"
+      end
+      def write_separator(io, col_count)
+        separators = Array.new(col_count) do
+          case @alignment
+          when :left
+            ':---'
+          when :center
+            ':---:'
+          when :right
+            '---:'
+          else
+            '---'
+          end
+        end
+        io.puts "| #{separators.join(' | ')} |"
+      end
+      def escape_markdown(text)
+        # Escape pipe characters and normalize whitespace
+        text.gsub('|', '\\|').gsub(/\s+/, ' ').strip
+      end
+    end
+  end
+end

data/lib/tabula/writers/tsv_writer.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+module Tabula
+  module Writers
+    # Writes tables in TSV (Tab-Separated Values) format
+    class TSVWriter < Writer
+      # Write tables to an IO object
+      # @param tables [Array<Table>] tables to write
+      # @param io [IO] output destination
+      def write(tables, io)
+        tables.each_with_index do |table, idx|
+          # Add blank line between tables
+          io.puts if idx.positive?
+          table.to_a.each do |row|
+            # Escape tabs and newlines in cell values
+            escaped = row.map { |cell| escape_value(cell) }
+            io.puts escaped.join("\t")
+          end
+        end
+      end
+      private
+      def escape_value(value)
+        return '' if value.nil?
+        value.to_s
+             .gsub("\t", '\\t')
+             .gsub("\n", '\\n')
+             .gsub("\r", '\\r')
+      end
+    end
+  end
+end

data/lib/tabula/writers/writer.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+module Tabula
+  module Writers
+    # Base class for table output writers
+    class Writer
+      # Write tables to an IO object
+      # @param tables [Array<Table>] tables to write
+      # @param io [IO] output destination
+      # @param options [Hash] writer-specific options
+      def self.write(tables, io = $stdout, **options)
+        new(**options).write(tables, io)
+      end
+      # Write tables to a string
+      # @param tables [Array<Table>] tables to write
+      # @param options [Hash] writer-specific options
+      # @return [String] formatted output
+      def self.to_string(tables, **options)
+        require 'stringio'
+        io = StringIO.new
+        write(tables, io, **options)
+        io.string
+      end
+      # @param options [Hash] writer options
+      def initialize(**options)
+        @options = options
+      end
+      # Write tables to an IO object
+      # @param tables [Array<Table>] tables to write
+      # @param io [IO] output destination
+      def write(tables, io)
+        raise NotImplementedError, 'Subclasses must implement #write'
+      end
+    end
+  end
+end