tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf-reader'
4
+
5
+ module Tabula
6
# Extracts positioned text elements from a PDF page using pdf-reader.
# Relies on pdf-reader's PageTextReceiver for font encoding and CMap handling.
class TextStripper
  # Control characters that are still accepted as printable (tab, LF, CR).
  ALLOWED_CONTROL_CODES = [0x09, 0x0A, 0x0D].freeze

  # Code point ranges treated as right-to-left scripts.
  RTL_CODEPOINT_RANGES = [
    0x0600..0x06FF, # Arabic
    0x0750..0x077F, # Arabic
    0x08A0..0x08FF, # Arabic
    0xFB50..0xFDFF, # Arabic
    0xFE70..0xFEFF, # Arabic
    0x0590..0x05FF, # Hebrew
    0xFB1D..0xFB4F, # Hebrew
    0x0700..0x074F, # Syriac
    0x0780..0x07BF, # Thaana
    0x07C0..0x07FF  # N'Ko
  ].freeze

  # Smallest positive run width / height seen by the most recent #extract.
  # Both stay at Float::INFINITY until a positive value has been observed.
  attr_reader :min_char_width, :min_char_height

  # @param page [PDF::Reader::Page] pdf-reader page object
  def initialize(page)
    @page = page
    reset_state
  end

  # Extract text elements from the page.
  #
  # Every call starts from a clean slate, so calling this method repeatedly
  # returns the same elements instead of accumulating duplicates.
  #
  # @return [Array<TextElement>] extracted text elements
  def extract
    reset_state

    # pdf-reader already applies the rotation transformation; on pages rotated
    # by 90/270 degrees the resulting y coordinates are negative.
    # NOTE(review): pages rotated by 180 degrees take the non-rotated branch,
    # exactly as before - confirm that this is intended.
    rotated = [90, 270].include?(@page.attributes[:Rotate] || 0)
    page_height = nil

    text_runs.each do |run|
      # printable? also rejects nil and empty strings.
      next unless printable?(run.text)

      top =
        if rotated
          run.y.abs
        else
          # Flip from bottom-origin to top-origin. The page height does not
          # change between runs, so it is looked up once, and only when needed.
          page_height ||= calculate_page_height
          page_height - run.y
        end

      @text_elements << build_element(run, top)
      track_minimums(run.width, run.font_size)
    end

    @text_elements
  end

  private

  # Forget everything collected by a previous extraction.
  def reset_state
    @text_elements = []
    @min_char_width = Float::INFINITY
    @min_char_height = Float::INFINITY
  end

  # Walk the page with a PageTextReceiver and return its merged text runs.
  # merge: true combines adjacent characters into words/phrases.
  def text_runs
    receiver = PDF::Reader::PageTextReceiver.new
    receiver.page = @page
    @page.walk(receiver)

    receiver.runs(
      merge: true,
      skip_zero_width: true,
      skip_overlapping: true
    )
  end

  # Build the TextElement for one run; the run's font size doubles as its height.
  def build_element(run, top)
    direction = rtl_text?(run.text) ? TextElement::DIRECTION_RTL : TextElement::DIRECTION_LTR

    TextElement.new(
      top: top,
      left: run.x,
      width: run.width,
      height: run.font_size,
      text: run.text,
      font_name: nil, # pdf-reader doesn't expose font name in runs
      font_size: run.font_size,
      width_of_space: estimate_space_width(run),
      direction: direction
    )
  end

  # Keep track of the smallest positive width and height seen so far.
  def track_minimums(width, height)
    @min_char_width = [@min_char_width, width].min if width.positive?
    @min_char_height = [@min_char_height, height].min if height.positive?
  end

  # Page height taken from the CropBox, falling back to the MediaBox.
  def calculate_page_height
    box = @page.attributes[:CropBox] || @page.attributes[:MediaBox]
    (box[3].to_f - box[1].to_f).abs
  end

  # Check if text is printable (port of Java's isPrintable).
  # Rejects nil/empty text and any text containing a control character other
  # than tab/LF/CR (this includes NUL), DEL, or the Unicode replacement character.
  def printable?(text)
    return false if text.nil? || text.empty?

    text.each_char.none? do |char|
      code = char.ord
      (code < 0x20 && !ALLOWED_CONTROL_CODES.include?(code)) || code == 0x7F || code == 0xFFFD
    end
  end

  # Estimate width of space character based on font size.
  def estimate_space_width(run)
    # Approximate space width as 0.25 of font size (common for proportional fonts)
    run.font_size * 0.25
  end

  # Detect whether text contains at least one RTL (right-to-left) character,
  # based on the code point ranges in RTL_CODEPOINT_RANGES.
  def rtl_text?(text)
    return false if text.nil? || text.empty?

    text.each_char.any? do |char|
      code = char.ord
      RTL_CODEPOINT_RANGES.any? { |range| range.cover?(code) }
    end
  end
end
150
+ end
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
# A single table cell: a rectangle plus the text elements placed inside it.
class Cell < Rectangle
  attr_reader :text_elements
  attr_accessor :placeholder

  # @param top [Float] top coordinate
  # @param left [Float] left coordinate
  # @param width [Float] cell width
  # @param height [Float] cell height
  # @param placeholder [Boolean] whether this is a placeholder cell
  def initialize(top, left, width, height, placeholder: false)
    super(top, left, width, height)
    @placeholder = placeholder
    @text_elements = []
  end

  class << self
    # Build a cell that has the same geometry as the given rectangle.
    # @param rect [Rectangle] rectangle to convert
    # @return [Cell]
    def from_rectangle(rect)
      new(rect.top, rect.left, rect.width, rect.height)
    end

    # Build a zero-sized cell flagged as a placeholder.
    # @return [Cell]
    def empty
      new(0, 0, 0, 0, placeholder: true)
    end
  end

  # Append one text element to the cell.
  # @param element [TextElement, TextChunk] text to add
  # @return [Cell] self, to allow chaining
  def add(element)
    @text_elements.push(element)
    self
  end

  # Append several text elements, one #add call per element.
  # @param elements [Array<TextElement, TextChunk>] elements to add
  # @return [Cell] self, to allow chaining
  def add_all(elements)
    tap { elements.each { |element| add(element) } }
  end

  # Text content of the cell: elements ordered by (top, left), joined with
  # the separator, with surrounding whitespace stripped.
  # @param separator [String] separator between text elements
  # @return [String]
  def text(separator: ' ')
    ordered = @text_elements.sort_by { |element| [element.top, element.left] }
    pieces = ordered.map do |element|
      # Elements without a #text method fall back to their string form.
      element.respond_to?(:text) ? element.text : element.to_s
    end
    pieces.join(separator).strip
  end

  # Whether the cell holds any text element.
  # @return [Boolean]
  def has_text?
    @text_elements.any?
  end

  # Whether the cell holds no text elements at all.
  # @return [Boolean]
  def empty?
    @text_elements.empty?
  end

  # Whether the cell is empty or its text is whitespace only.
  # @return [Boolean]
  def blank?
    @text_elements.empty? || text.strip.empty?
  end

  # Whether this is a placeholder cell.
  # @return [Boolean]
  def placeholder?
    @placeholder
  end

  # Whether this cell spans multiple rows/columns (stub for future use).
  # @return [Boolean] always false
  def spanning?
    false
  end

  def to_s
    "Cell[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
  end

  def inspect
    to_s
  end

  # Two cells are equal when the parent class considers them equal and
  # their text content matches.
  def ==(other)
    other.is_a?(Cell) && super && text == other.text
  end
  alias eql? ==

  # Hash consistent with #==: combines the parent hash with the text.
  def hash
    [super, text].hash
  end
end
110
+ end
@@ -0,0 +1,184 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
# Represents an extracted table with rows and cells.
# Provides methods for accessing table data and converting to various formats.
class Table < Rectangle
  attr_reader :extraction_method, :page_number

  # Get number of rows
  # @return [Integer]
  attr_reader :row_count

  # Get number of columns
  # @return [Integer]
  attr_reader :col_count

  # @param extraction_method [String] method used for extraction
  # @param page_number [Integer] page number where table was found
  def initialize(extraction_method: 'unknown', page_number: nil)
    super(0, 0, 0, 0)
    @extraction_method = extraction_method
    @page_number = page_number
    @cells = {} # { [row, col] => Cell }
    @row_count = 0
    @col_count = 0
    @memoized_rows = nil
  end

  # Add a cell at a specific position.
  # Grows the row/column counts and the table bounds as needed and drops the
  # cached row grid so that #rows reflects the new cell.
  # @param row [Integer] row index (0-based)
  # @param col [Integer] column index (0-based)
  # @param cell [Cell] cell to add
  # @return [Table] self, to allow chaining
  def add(row, col, cell)
    @cells[[row, col]] = cell
    @row_count = [row + 1, @row_count].max
    @col_count = [col + 1, @col_count].max
    @memoized_rows = nil # Invalidate cache

    # Update bounds: the only cell defines them, later cells extend them.
    if @cells.size == 1
      @top = cell.top
      @left = cell.left
      @width = cell.width
      @height = cell.height
    else
      merge!(cell)
    end

    self
  end

  # Get a cell at a specific position
  # @param row [Integer] row index
  # @param col [Integer] column index
  # @return [Cell] cell at position, or empty cell if none
  def get_cell(row, col)
    @cells[[row, col]] || Cell.empty
  end

  alias [] get_cell

  # Get all rows as 2D array.
  # The grid is cached in the same variable that #add invalidates, so cells
  # added after a previous call are included.
  # @return [Array<Array<Cell>>]
  def rows
    @memoized_rows ||= compute_rows
  end

  # Get all cells as flat array
  # @return [Array<Cell>]
  def cells
    @cells.values
  end

  # Get a specific row
  # @param index [Integer] row index
  # @return [Array<Cell>] the row, or an empty array when out of range
  def row(index)
    rows[index] || []
  end

  # Get a specific column
  # @param index [Integer] column index
  # @return [Array<Cell>] one cell per row; missing positions yield empty cells
  def column(index)
    rows.map { |r| r[index] || Cell.empty }
  end

  # Convert to 2D array of strings
  # @return [Array<Array<String>>]
  def to_a
    rows.map { |row| row.map(&:text) }
  end

  # Convert to CSV string
  # @param options [Hash] options for CSV generation
  # @return [String]
  def to_csv(**options)
    require 'csv' # loaded lazily, only when CSV output is requested
    CSV.generate(**options) do |csv|
      to_a.each { |row| csv << row }
    end
  end

  # Convert to TSV string (cells joined by tabs, rows by newlines; cell text
  # is emitted as-is, without escaping).
  # @return [String]
  def to_tsv
    to_a.map { |row| row.join("\t") }.join("\n")
  end

  # Convert to hash (for JSON serialization)
  # @return [Hash]
  def to_h
    {
      extraction_method: @extraction_method,
      page_number: @page_number,
      top: top,
      left: left,
      width: width,
      height: height,
      data: to_a
    }
  end

  # Convert to JSON string
  # @return [String]
  def to_json(*args)
    require 'json' # loaded lazily, only when JSON output is requested
    to_h.to_json(*args)
  end

  # Check if table is empty
  # @return [Boolean]
  def empty?
    @cells.empty?
  end

  # Iterate over rows
  # @yield [Array<Cell>] each row
  def each_row(&)
    rows.each(&)
  end

  # Iterate over cells
  # @yield [Integer, Integer, Cell] row, col, cell
  # @return [Enumerator] when no block is given (consistent with #each_row)
  def each_cell
    return enum_for(:each_cell) unless block_given?

    rows.each_with_index do |row, row_idx|
      row.each_with_index do |cell, col_idx|
        yield row_idx, col_idx, cell
      end
    end
  end

  def to_s
    "Table[#{row_count}x#{col_count}](#{left}, #{top}, #{width}, #{height})"
  end

  def inspect
    to_s
  end

  private

  # Build the full row_count x col_count grid, filling every position that
  # has no stored cell with an empty placeholder cell.
  def compute_rows
    result = Array.new(@row_count) { Array.new(@col_count) { Cell.empty } }

    @cells.each do |(row, col), cell|
      result[row][col] = cell
    end

    result
  end

  # Table with ruling lines - extends Table with ruling information
  class WithRulingLines < Table
    attr_reader :horizontal_rulings, :vertical_rulings

    # @param horizontal_rulings [Array] horizontal ruling lines of the table
    # @param vertical_rulings [Array] vertical ruling lines of the table
    # @param kwargs [Hash] remaining keyword arguments, forwarded to Table#initialize
    def initialize(horizontal_rulings: [], vertical_rulings: [], **kwargs)
      super(**kwargs)
      @horizontal_rulings = horizontal_rulings
      @vertical_rulings = vertical_rulings
    end
  end
end
184
+ end
@@ -0,0 +1,133 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
# Represents a line of text (a row of text chunks).
# Used for grouping text elements that share the same vertical position.
class Line < Rectangle
  attr_reader :chunks

  def initialize
    super(0, 0, 0, 0)
    @chunks = []
    @initialized = false
  end

  # Add a text chunk to this line.
  # The first chunk defines the line's bounds; later chunks extend them.
  # @param chunk [TextChunk] chunk to add
  # @return [Line] self, to allow chaining
  def add_chunk(chunk)
    @chunks << chunk
    if @initialized
      merge!(chunk)
    else
      @top = chunk.top
      @left = chunk.left
      @width = chunk.width
      @height = chunk.height
      @initialized = true
    end
    self
  end

  # Get chunks in reading order: ascending by left edge, or descending
  # when most chunks are RTL.
  # @return [Array<TextChunk>] sorted chunks
  def sorted_chunks
    ordered = @chunks.sort_by(&:left)
    rtl_dominant? ? ordered.reverse : ordered
  end

  # Check if this line is LTR dominant (ties and empty lines count as LTR)
  def ltr_dominant?
    return true if @chunks.empty?

    ltr_count = @chunks.count(&:ltr_dominant?)
    rtl_count = @chunks.count(&:rtl_dominant?)
    ltr_count >= rtl_count
  end

  # Check if this line is RTL dominant
  def rtl_dominant?
    !ltr_dominant?
  end

  # Get the combined text of all chunks, in reading order
  # @param separator [String] separator between chunks
  # @return [String]
  def text(separator: ' ')
    sorted_chunks.map(&:text).join(separator)
  end

  # Get text elements from all chunks
  # @return [Array<TextElement>]
  def text_elements
    @chunks.flat_map(&:elements)
  end

  # Average width of the text elements in this line.
  # @return [Float] 0.0 when the line has no elements
  def average_char_width
    elements = text_elements
    return 0.0 if elements.empty?

    # to_f keeps the result a Float even when every width is an Integer.
    elements.sum(&:width).to_f / elements.size
  end

  # Check if a position falls within a gap between chunks.
  # Gaps are measured in spatial (left-to-right) order, so the check also
  # works for RTL-dominant lines.
  # @param x [Float] horizontal position
  # @param min_gap [Float] minimum gap size (defaults to half the average element width)
  # @return [Boolean]
  def in_gap?(x, min_gap: nil)
    min_gap ||= average_char_width * 0.5

    chunk_gaps.any? do |gap_start, gap_end|
      x.between?(gap_start, gap_end) && (gap_end - gap_start) >= min_gap
    end
  end

  # Find gap positions between chunks.
  # Gaps are measured in spatial (left-to-right) order, so RTL-dominant
  # lines report their gaps too.
  # @param min_gap [Float] minimum gap size (defaults to twice the average element width)
  # @return [Array<Float>] gap center positions, ascending
  def gap_positions(min_gap: nil)
    min_gap ||= average_char_width * 2

    chunk_gaps.filter_map do |gap_start, gap_end|
      (gap_start + gap_end) / 2.0 if (gap_end - gap_start) >= min_gap
    end
  end

  def empty?
    @chunks.empty?
  end

  def size
    @chunks.size
  end

  def to_s
    "Line[#{text.inspect}](#{left}, #{top}, #{width}, #{height})"
  end

  def inspect
    to_s
  end

  private

  # [gap_start, gap_end] pairs between horizontally adjacent chunks, always
  # in ascending left order. Reading order must not be used here: for RTL
  # lines it is descending, which would make every gap size negative.
  def chunk_gaps
    @chunks.sort_by(&:left).each_cons(2).map do |left_chunk, right_chunk|
      [left_chunk.right, right_chunk.left]
    end
  end
end
133
+ end