tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
@@ -0,0 +1,142 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
+ module Detectors
5
+ # Detects table areas using ruling line analysis.
6
+ # Suitable for PDFs with clear table borders.
7
+ class SpreadsheetDetection < DetectionAlgorithm
8
+ # Minimum cells for a valid table
9
+ MIN_CELLS = 4
10
+
11
+ # Minimum table dimension (in points)
12
+ MIN_DIMENSION = 10
13
+
14
# Locate table areas on a page from its ruling lines.
# @param page [Page] page to detect tables on
# @return [Array<Rectangle>] detected table areas; empty when the page has no
#   horizontal or no vertical rulings, or when fewer than MIN_CELLS cells are found
def detect(page)
  h_rulings = page.horizontal_rulings
  v_rulings = page.vertical_rulings
  return [] if [h_rulings, v_rulings].any?(&:empty?)

  # Cells come from ruling intersections; too few cells means no table.
  grid_cells = find_cells(h_rulings, v_rulings)
  return [] unless grid_cells.size >= MIN_CELLS

  # Merge neighbouring cells into regions and keep only the valid ones.
  find_table_regions(grid_cells).select { |region| valid_table_region?(region) }
end
33
+
34
+ private
35
+
36
# Build candidate cell rectangles from the grid spanned by ruling intersections.
#
# The distinct x and y coordinates of all intersection keys form the grid lines;
# each pair of consecutive y values and consecutive x values is a candidate cell,
# kept when valid_cell? accepts it.
#
# @param horizontal_rulings [Array] horizontal rulings of the page
# @param vertical_rulings [Array] vertical rulings of the page
# @return [Array<Rectangle>] accepted cells, row by row, left to right
def find_cells(horizontal_rulings, vertical_rulings)
  intersections = build_intersection_map(horizontal_rulings, vertical_rulings)
  return [] if intersections.empty?

  points = intersections.keys
  row_bounds = points.map(&:last).uniq.sort.each_cons(2).to_a
  col_bounds = points.map(&:first).uniq.sort.each_cons(2).to_a

  # product keeps rows as the outer iteration and columns as the inner one.
  row_bounds.product(col_bounds).each_with_object([]) do |((top, bottom), (left, right)), cells|
    next unless valid_cell?(left, right, top, bottom, intersections)

    cells << Rectangle.new(top, left, right - left, bottom - top)
  end
end
59
+
60
# Record every point where a horizontal ruling crosses a vertical ruling.
#
# @param horizontal_rulings [Array] rulings responding to intersects? and intersection_point
# @param vertical_rulings [Array] rulings tested against each horizontal ruling
# @return [Hash{Array(Float, Float) => true}] keys are [x, y] rounded to one decimal
def build_intersection_map(horizontal_rulings, vertical_rulings)
  horizontal_rulings.each_with_object({}) do |h_ruling, map|
    vertical_rulings.each do |v_ruling|
      next unless h_ruling.intersects?(v_ruling)

      crossing = h_ruling.intersection_point(v_ruling)
      # Rounding makes near-identical coordinates share one key.
      map[[crossing.x.round(1), crossing.y.round(1)]] = true if crossing
    end
  end
end
77
+
78
# Check that each of the four corners of a candidate cell lies near an intersection.
#
# @param left [Numeric] x of the left edge
# @param right [Numeric] x of the right edge
# @param top [Numeric] y of the top edge
# @param bottom [Numeric] y of the bottom edge
# @param intersections [Hash] intersection map whose keys are [x, y] pairs
# @return [Boolean] true when every corner is within 2.0 of some key on both axes
def valid_cell?(left, right, top, bottom, intersections)
  tolerance = 2.0
  near_intersection = lambda do |x, y|
    intersections.each_key.any? do |(ix, iy)|
      (x - ix).abs <= tolerance && (y - iy).abs <= tolerance
    end
  end

  # Yields (top, left), (top, right), (bottom, left), (bottom, right).
  [top, bottom].product([left, right]).all? { |y, x| near_intersection.call(x, y) }
end
94
+
95
# Group cells into connected regions and return one bounding box per region.
#
# Starting from the first unassigned cell, the region repeatedly absorbs every
# remaining cell that adjacent_to_region? reports as touching it, until no more
# cells qualify; then the next unassigned cell seeds a new region.
#
# @param cells [Array<Rectangle>] cells to group (not modified)
# @return [Array<Rectangle>] bounding box of each region, in discovery order
def find_table_regions(cells)
  pending = cells.dup
  boxes = []

  until pending.empty?
    cluster = [pending.shift]

    until (neighbours = pending.select { |candidate| adjacent_to_region?(candidate, cluster) }).empty?
      cluster.concat(neighbours)
      pending -= neighbours
    end

    boxes << Rectangle.bounding_box_of(cluster)
  end

  boxes
end
118
+
119
# Tell whether a cell touches at least one cell already in the region.
# @param cell [Rectangle] candidate cell
# @param region [Array<Rectangle>] cells collected so far
# @return [Boolean]
def adjacent_to_region?(cell, region)
  region.each do |member|
    return true if cells_adjacent?(member, cell)
  end
  false
end
122
+
123
# Decide whether two cells are neighbours.
#
# They count as neighbours when a right edge of one lies within 2.0 of the left
# edge of the other and they overlap vertically, or when a bottom edge of one lies
# within 2.0 of the top edge of the other and they overlap horizontally.
#
# @param c1 [Rectangle] first cell
# @param c2 [Rectangle] second cell
# @return [Boolean]
def cells_adjacent?(c1, c2)
  tolerance = 2.0
  touching = ->(edge_a, edge_b) { (edge_a - edge_b).abs <= tolerance }

  side_by_side = touching.call(c1.right, c2.left) || touching.call(c2.right, c1.left)
  rows_overlap = c1.vertically_overlaps?(c2, 0.5)

  stacked = touching.call(c1.bottom, c2.top) || touching.call(c2.bottom, c1.top)
  columns_overlap = c1.horizontally_overlaps?(c2, 0.5)

  (side_by_side && rows_overlap) || (stacked && columns_overlap)
end
136
+
137
# Accept a region only when both its width and height reach MIN_DIMENSION.
# @param region [Rectangle] candidate table area
# @return [Boolean]
def valid_table_region?(region)
  return false unless region.width >= MIN_DIMENSION

  region.height >= MIN_DIMENSION
end
140
+ end
141
+ end
142
+ end
@@ -0,0 +1,168 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
+ module Extractors
5
+ # Stream-mode extraction algorithm.
6
+ # Extracts tables by analyzing text positions and gaps without relying on ruling lines.
7
+ class Basic < ExtractionAlgorithm
8
# Configure the stream-mode extractor.
# @param columns [Array<Float>, nil] explicit column positions
# @param guess [Boolean] whether to guess column positions
# @param options [Hash] any remaining options, forwarded to the superclass initializer
def initialize(columns: nil, guess: true, **options)
  super(**options)
  @columns, @guess = columns, guess
end
15
+
16
# Extract at most one table from a page using text positions.
# @param page [Page] page to extract from
# @return [Array<Table>] a one-element array with the table, or an empty array when
#   the page has no text elements, chunks or lines, or the built table is empty
def extract(page)
  return [] if page.text_elements.empty?

  text_chunks = page.text_chunks
  text_lines = text_chunks.empty? ? [] : TextChunk.group_by_lines(text_chunks)
  return [] if text_lines.empty?

  separators = determine_columns(text_lines, page)
  result = build_table(text_lines, separators, page.page_number)
  return [] if result.empty?

  [result]
end
36
+
37
+ private
38
+
39
# Choose the column separator positions for a page.
#
# Priority: explicit @columns (sorted), then the x1 of the page's vertical rulings
# (sorted, de-duplicated), then positions guessed from text gaps when @guess is set.
#
# @param lines [Array] text lines, used only for guessing
# @param page [Page] page whose vertical rulings may define the columns
# @return [Array<Numeric>] separator positions; empty means a single column
def determine_columns(lines, page)
  return @columns.sort if @columns
  return page.vertical_rulings.map(&:x1).sort.uniq if page.vertical_rulings.any?
  return guess_column_positions(lines) if @guess

  []
end
54
+
55
# Guess column separators from the gaps between text chunks.
#
# Gap positions from every line are clustered with a tolerance of 5.0; a cluster
# is kept when its member count reaches 30% of the number of lines (rounded up).
#
# @param lines [Array] lines responding to gap_positions
# @return [Array<Numeric>] sorted cluster positions, or [] when there are no gaps
def guess_column_positions(lines)
  gap_xs = lines.flat_map(&:gap_positions)
  return [] if gap_xs.empty?

  threshold = (lines.size * 0.3).ceil
  cluster_positions(gap_xs, tolerance: 5.0)
    .select { |_, hits| hits >= threshold }
    .keys
    .sort
end
76
+
77
# Group nearby positions into clusters and count the members of each.
#
# Positions are sorted; a position joins the current cluster when it lies within
# +tolerance+ of the previous sorted position, otherwise it starts a new cluster.
# Each cluster is keyed by the arithmetic mean of its members.
#
# @param positions [Array<Numeric>] positions to cluster, in any order
# @param tolerance [Numeric] largest allowed step between consecutive sorted
#   positions inside one cluster
# @return [Hash{Float => Integer}] cluster mean => number of positions in it
def cluster_positions(positions, tolerance:)
  return {} if positions.empty?

  groups = positions.sort.chunk_while { |previous, current| (current - previous) <= tolerance }

  groups.each_with_object({}) do |group, clusters|
    # fdiv yields the true mean even for Integer input, where `/` would truncate.
    clusters[group.sum.fdiv(group.size)] = group.size
  end
end
101
+
102
# Create a table and fill it with one row per text line.
# @param lines [Array] text lines, in row order
# @param column_positions [Array<Numeric>] column separator positions
# @param page_number [Integer] page number stored on the table
# @return [Table] the populated table
def build_table(lines, column_positions, page_number)
  Table.new(extraction_method: name, page_number: page_number).tap do |table|
    lines.each.with_index do |line, row_idx|
      assign_chunks_to_columns(line, column_positions, table, row_idx)
    end
  end
end
111
+
112
# Add the cells of one text line to the table at the given row.
#
# With no column positions the whole line becomes a single cell in column 0;
# otherwise the line is split by column and one cell is added per column.
#
# @param line [Object] text line to place
# @param column_positions [Array<Numeric>] column separator positions
# @param table [Table] table receiving the cells
# @param row_idx [Integer] row index for this line
def assign_chunks_to_columns(line, column_positions, table, row_idx)
  return table.add(row_idx, 0, create_cell_from_line(line)) if column_positions.empty?

  split_line_by_columns(line, column_positions).each_with_index do |column_chunks, col_idx|
    table.add(row_idx, col_idx, create_cell_from_chunks(column_chunks))
  end
end
126
+
127
# Distribute the chunks of a line into per-column buckets.
#
# Column boundaries run from the line's left edge through each column position
# to infinity; a chunk goes to the column that contains its horizontal centre.
#
# @param line [Object] line responding to left and sorted_chunks
# @param column_positions [Array<Numeric>] column separator positions
# @return [Array<Array>] one array of chunks per column (column_positions.size + 1 entries)
def split_line_by_columns(line, column_positions)
  edges = [line.left, *column_positions, Float::INFINITY]
  buckets = Array.new(edges.size - 1) { [] }

  line.sorted_chunks.each_with_object(buckets) do |chunk, columns|
    midpoint = chunk.left + (chunk.width / 2.0)
    columns[find_column_index(midpoint, edges)] << chunk
  end
end
144
+
145
# Find which column interval contains x.
#
# Intervals are the consecutive pairs of +boundaries+, closed on the left and
# open on the right. When no interval matches, the last column index
# (boundaries.size - 2) is returned.
#
# @param x [Numeric] position to locate
# @param boundaries [Array<Numeric>] column boundaries
# @return [Integer] zero-based column index
def find_column_index(x, boundaries)
  hit = boundaries.each_cons(2).find_index { |(lower, upper)| x >= lower && x < upper }
  hit || boundaries.size - 2
end
151
+
152
# Build one cell covering a whole line and holding all of its chunks.
# @param line [Object] line responding to top, left, width, height and chunks
# @return [Cell] cell with the line's bounds and chunks
def create_cell_from_line(line)
  Cell.new(line.top, line.left, line.width, line.height).tap do |cell|
    line.chunks.each { |chunk| cell.add(chunk) }
  end
end
157
+
158
# Build a cell from the chunks assigned to one column of a line.
# @param chunks [Array] chunks belonging to the cell
# @return [Cell] Cell.empty when there are no chunks; otherwise a cell sized to the
#   chunks' bounding box and holding every chunk
def create_cell_from_chunks(chunks)
  return Cell.empty if chunks.empty?

  box = Rectangle.bounding_box_of(chunks)
  chunks.each_with_object(Cell.new(box.top, box.left, box.width, box.height)) do |chunk, cell|
    cell.add(chunk)
  end
end
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
+ module Extractors
5
# Base class for table extraction algorithms.
#
# Subclasses implement #extract. The class-level .extract is a shortcut that
# builds an instance from the given options and runs it on a single page.
class ExtractionAlgorithm
  # Extract tables from a page with a freshly constructed instance.
  # @param page [Page] page to extract from
  # @param options [Hash] algorithm-specific options, passed to .new
  # @return [Array<Table>] extracted tables
  def self.extract(page, **options)
    new(**options).extract(page)
  end

  # @param options [Hash] algorithm options, kept in @options
  def initialize(**options)
    @options = options
  end

  # Extract tables from a page. Subclasses must override this method.
  # @param _page [Page] page to extract from (unused in the base class)
  # @return [Array<Table>]
  # @raise [NotImplementedError] always, when called on the base implementation
  def extract(_page)
    raise NotImplementedError, 'Subclasses must implement #extract'
  end

  # Get algorithm name for table metadata: the class name without its namespace.
  # Anonymous classes have no name of their own, so the nearest named
  # superclass is used instead of failing on nil.
  # @return [String]
  def name
    named_class = self.class
    named_class = named_class.superclass until named_class.name
    named_class.name.split('::').last
  end
end
33
+ end
34
+ end
@@ -0,0 +1,299 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
+ module Extractors
5
+ # Lattice-mode extraction algorithm.
6
+ # Extracts tables by analyzing ruling lines (cell borders) in the PDF.
7
+ class Spreadsheet < ExtractionAlgorithm
8
+ # Minimum cells required for a valid table
9
+ MIN_CELLS = 4
10
+
11
+ # Magic heuristic for determining tabular content
12
+ TABULAR_RATIO_THRESHOLD = 0.65
13
+
14
# Extract tables from a page by analysing its ruling lines.
# @param page [Page] page to extract from
# @return [Array<Table>] non-empty tables, one per group of adjacent cells; empty
#   when the page lacks horizontal or vertical rulings or has fewer than MIN_CELLS cells
def extract(page)
  h_rulings = page.horizontal_rulings
  v_rulings = page.vertical_rulings
  return [] if h_rulings.empty? || v_rulings.empty?

  grid_cells = find_cells(h_rulings, v_rulings)
  return [] if grid_cells.size < MIN_CELLS

  # Each group of adjacent cells becomes its own table.
  find_spreadsheet_areas_with_cells(grid_cells)
    .map { |group| extract_table_from_cells(page, group, h_rulings, v_rulings) }
    .reject(&:empty?)
end
38
+
39
# Check if a page contains tabular content.
#
# The page is extracted with a default instance; it counts as tabular when any
# resulting table has a rows-to-columns ratio between TABULAR_RATIO_THRESHOLD
# and its reciprocal.
#
# @param page [Page] page to check
# @return [Boolean]
def self.tabular?(page)
  lower = TABULAR_RATIO_THRESHOLD
  upper = 1.0 / TABULAR_RATIO_THRESHOLD

  new.extract(page).any? do |table|
    (table.row_count.to_f / table.col_count).between?(lower, upper)
  end
end
53
+
54
+ private
55
+
56
# Find table cells row by row from the ruling lines.
#
# Row boundaries are the distinct rounded y1 values of the horizontal rulings.
# For each pair of consecutive boundaries, only vertical rulings that cover the
# row (within the configured cell tolerance) contribute column boundaries.
# A candidate cell is kept when either its four edges or its four corners
# validate.
#
# @param horizontal_rulings [Array] horizontal rulings of the page
# @param vertical_rulings [Array] vertical rulings of the page
# @return [Array<Cell>] cells in row order, left to right within a row
def find_cells(horizontal_rulings, vertical_rulings)
  tolerance = Tabula.configuration.cell_tolerance

  intersections = build_intersection_map(horizontal_rulings, vertical_rulings)
  return [] if intersections.empty?

  row_edges = horizontal_rulings.map { |ruling| ruling.y1.round(1) }.uniq.sort
  return [] if row_edges.size < 2

  row_edges.each_cons(2).each_with_object([]) do |(top, bottom), cells|
    # Only verticals reaching across this row can bound its cells.
    col_edges = vertical_rulings
                .select { |v| v.y1 <= top + tolerance && v.y2 >= bottom - tolerance }
                .map { |v| v.x1.round(1) }
                .uniq
                .sort
    next if col_edges.size < 2

    col_edges.each_cons(2) do |left, right|
      bounded =
        valid_cell_by_edges?(left, right, top, bottom, horizontal_rulings, vertical_rulings, tolerance) ||
        valid_cell_by_corners?(left, right, top, bottom, intersections, tolerance)

      cells << Cell.new(top, left, right - left, bottom - top) if bounded
    end
  end
end
95
+
96
# Collect the crossing points of horizontal and vertical rulings.
#
# @param horizontal_rulings [Array] rulings responding to intersects? and intersection_point
# @param vertical_rulings [Array] rulings tested against each horizontal ruling
# @return [Hash{Array(Float, Float) => true}] keys are [x, y], each rounded to one
#   decimal to avoid floating point issues
def build_intersection_map(horizontal_rulings, vertical_rulings)
  horizontal_rulings.product(vertical_rulings).each_with_object({}) do |(h_ruling, v_ruling), map|
    next unless h_ruling.intersects?(v_ruling)

    point = h_ruling.intersection_point(v_ruling)
    next unless point

    map[[point.x.round(1), point.y.round(1)]] = true
  end
end
114
+
115
# Validate a candidate cell by its corners.
#
# @param left [Numeric] x of the left edge
# @param right [Numeric] x of the right edge
# @param top [Numeric] y of the top edge
# @param bottom [Numeric] y of the bottom edge
# @param intersections [Hash] intersection map whose keys are [x, y] pairs
# @param tolerance [Numeric] maximum distance per axis between corner and intersection
# @return [Boolean] true when every corner has an intersection within tolerance
def valid_cell_by_corners?(left, right, top, bottom, intersections, tolerance)
  points = intersections.keys

  [[left, top], [right, top], [left, bottom], [right, bottom]].all? do |corner_x, corner_y|
    points.any? do |point_x, point_y|
      (corner_x - point_x).abs <= tolerance && (corner_y - point_y).abs <= tolerance
    end
  end
end
129
+
130
# Check if there are rulings that form all four edges of a potential cell.
#
# A horizontal ruling forms the top or bottom edge when its y1 is within
# tolerance of that edge and it reaches from left to right (within tolerance).
# A vertical ruling forms the left or right edge when its x1 is within tolerance
# of that edge and it reaches from top to bottom (within tolerance).
#
# @param left [Numeric] x of the left edge
# @param right [Numeric] x of the right edge
# @param top [Numeric] y of the top edge
# @param bottom [Numeric] y of the bottom edge
# @param horizontal_rulings [Array] rulings responding to y1, x1 and x2
# @param vertical_rulings [Array] rulings responding to x1, y1 and y2
# @param tolerance [Numeric] allowed deviation
# @return [Boolean] true only when top, bottom, left and right edges all exist
def valid_cell_by_edges?(left, right, top, bottom, horizontal_rulings, vertical_rulings, tolerance)
  horizontal_edge_at = lambda do |y|
    horizontal_rulings.any? do |h|
      (h.y1 - y).abs <= tolerance && h.x1 <= left + tolerance && h.x2 >= right - tolerance
    end
  end

  vertical_edge_at = lambda do |x|
    vertical_rulings.any? do |v|
      (v.x1 - x).abs <= tolerance && v.y1 <= top + tolerance && v.y2 >= bottom - tolerance
    end
  end

  [
    horizontal_edge_at.call(top),
    horizontal_edge_at.call(bottom),
    vertical_edge_at.call(left),
    vertical_edge_at.call(right)
  ].all?
end
162
+
163
# Return the bounding box of every group of adjacent cells.
# @param cells [Array<Cell>] cells to group
# @return [Array<Rectangle>] one bounding box per group
def find_spreadsheet_areas(cells)
  groups = find_spreadsheet_areas_with_cells(cells)
  groups.map { |group| Rectangle.bounding_box_of(group) }
end
168
+
169
# Partition cells into groups of mutually connected (adjacent) cells.
#
# The first unassigned cell seeds a group, which keeps absorbing every remaining
# cell adjacent to any of its members until none is left to absorb. A group is
# kept only when the area of its bounding box is positive.
#
# @param cells [Array<Cell>] cells to group (not modified)
# @return [Array<Array<Cell>>] groups of cells, in discovery order
def find_spreadsheet_areas_with_cells(cells)
  unassigned = cells.dup
  groups = []

  until unassigned.empty?
    group = [unassigned.shift]

    until (neighbours = unassigned.select { |candidate| adjacent?(group, candidate) }).empty?
      group.concat(neighbours)
      unassigned -= neighbours
    end

    # Drop groups whose bounding box has no area.
    groups << group if Rectangle.bounding_box_of(group).area.positive?
  end

  groups
end
195
+
196
# Tell whether a cell is adjacent to at least one cell of the region.
# @param region [Array<Cell>] cells collected so far
# @param cell [Cell] candidate cell
# @return [Boolean]
def adjacent?(region, cell)
  region.each do |member|
    return true if cells_adjacent?(member, cell)
  end
  false
end
199
+
200
# Cells are adjacent if they share an edge.
#
# A shared vertical edge means a right edge of one cell is within 2.0 of the left
# edge of the other while the cells overlap vertically; a shared horizontal edge
# means a bottom edge is within 2.0 of the other's top edge while the cells
# overlap horizontally.
#
# @param c1 [Cell] first cell
# @param c2 [Cell] second cell
# @return [Boolean]
def cells_adjacent?(c1, c2)
  tolerance = 2.0

  shares_vertical_edge =
    (c1.right - c2.left).abs <= tolerance || (c2.right - c1.left).abs <= tolerance
  rows_overlap = c1.vertically_overlaps?(c2, 0.5)

  shares_horizontal_edge =
    (c1.bottom - c2.top).abs <= tolerance || (c2.bottom - c1.top).abs <= tolerance
  columns_overlap = c1.horizontally_overlaps?(c2, 0.5)

  return true if shares_vertical_edge && rows_overlap

  shares_horizontal_edge && columns_overlap
end
214
+
215
# Build a table from an already-detected group of cells.
#
# Rulings intersecting the cells' bounding box are attached to the table. A cell's
# row index is the rank of its rounded top among all rounded tops; its column index
# is its position, ordered by left edge, among the cells whose top differs from
# its own by less than 2. Each cell is filled with the page text inside its bounds.
#
# @param page [Page] page supplying text and the page number
# @param cells [Array<Cell>] cells of one table region
# @param horizontal_rulings [Array] all horizontal rulings of the page
# @param vertical_rulings [Array] all vertical rulings of the page
# @return [Table] populated table; a plain empty Table when cells is empty
def extract_table_from_cells(page, cells, horizontal_rulings, vertical_rulings)
  return Table.new if cells.empty?

  bounds = Rectangle.bounding_box_of(cells)
  inside_bounds = ->(ruling) { ruling_in_area?(ruling, bounds) }

  table = Table::WithRulingLines.new(
    horizontal_rulings: horizontal_rulings.select(&inside_bounds),
    vertical_rulings: vertical_rulings.select(&inside_bounds),
    extraction_method: name,
    page_number: page.page_number
  )

  # Map each distinct rounded top coordinate to its row index.
  row_of = cells.map { |c| c.top.round(1) }.uniq.sort.each_with_index.to_h

  cells.each do |cell|
    row_idx = row_of[cell.top.round(1)]
    next unless row_idx

    # Column index is the cell's rank by left edge among cells on the same row.
    same_row = cells.select { |other| (other.top - cell.top).abs < 2 }.sort_by(&:left)
    col_idx = same_row.index(cell) || 0

    cell_bounds = Rectangle.from_bounds(cell.top, cell.left, cell.bottom, cell.right)
    cell.add_all(page.get_text(cell_bounds))

    table.add(row_idx, col_idx, cell)
  end

  table
end
256
+
257
# Build a table for an area from the full grid of rulings inside it.
#
# The distinct y1 values of the horizontal rulings and x1 values of the vertical
# rulings intersecting the area define the grid; every grid slot becomes a cell
# filled with the page text inside its bounds.
#
# @param page [Page] page supplying text and the page number
# @param area [Rectangle] table area
# @param horizontal_rulings [Array] all horizontal rulings of the page
# @param vertical_rulings [Array] all vertical rulings of the page
# @return [Table] populated table; a plain empty Table when the grid has fewer
#   than two row or column boundaries
def extract_table_from_area(page, area, horizontal_rulings, vertical_rulings)
  h_rulings = horizontal_rulings.select { |ruling| ruling_in_area?(ruling, area) }
  v_rulings = vertical_rulings.select { |ruling| ruling_in_area?(ruling, area) }

  row_edges = h_rulings.map(&:y1).uniq.sort
  col_edges = v_rulings.map(&:x1).uniq.sort
  return Table.new if [row_edges, col_edges].any? { |edges| edges.size < 2 }

  table = Table::WithRulingLines.new(
    horizontal_rulings: h_rulings,
    vertical_rulings: v_rulings,
    extraction_method: name,
    page_number: page.page_number
  )

  row_edges.each_cons(2).each_with_index do |(top, bottom), row_idx|
    col_edges.each_cons(2).each_with_index do |(left, right), col_idx|
      cell = Cell.new(top, left, right - left, bottom - top)
      cell.add_all(page.get_text(Rectangle.from_bounds(top, left, bottom, right)))
      table.add(row_idx, col_idx, cell)
    end
  end

  table
end
292
+
293
# Tell whether a ruling's bounds intersect an area.
# @param ruling [Object] ruling responding to top, left, bottom and right
# @param area [Rectangle] area to test against
# @return [Boolean] result of area.intersects? for the ruling's bounding rectangle
def ruling_in_area?(ruling, area)
  area.intersects?(
    Rectangle.from_bounds(ruling.top, ruling.left, ruling.bottom, ruling.right)
  )
end
297
+ end
298
+ end
299
+ end