tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
@@ -0,0 +1,303 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # A straight line segment ("ruling") taken from a PDF page, described by its
  # two endpoints (x1, y1) and (x2, y2). Used for detecting table cell
  # boundaries in lattice-mode extraction.
  #
  # Segments that are horizontal or vertical within ORIENTATION_TOLERANCE are
  # snapped onto the axis at construction time. A segment short enough to pass
  # both the horizontal and the vertical test is treated as horizontal by
  # #normalize!, #position, #start, #end and #to_s.
  class Ruling
    # Maximum |y2 - y1| (resp. |x2 - x1|) for a segment to count as
    # horizontal (resp. vertical).
    ORIENTATION_TOLERANCE = 1.0

    # Default slack added at both ends of a segment when testing intersections.
    INTERSECTION_TOLERANCE = 1.0

    attr_accessor :x1, :y1, :x2, :y2

    # Coordinates are coerced with #to_f and the segment is normalized.
    def initialize(x1, y1, x2, y2)
      @x1 = x1.to_f
      @y1 = y1.to_f
      @x2 = x2.to_f
      @y2 = y2.to_f
      normalize!
    end

    # Build a ruling from two objects responding to #x and #y.
    def self.from_points(p1, p2)
      new(p1.x, p1.y, p2.x, p2.y)
    end

    # Build a ruling running from (left, top) to (left + width, top + height).
    def self.from_bounds(top, left, width, height)
      new(left, top, left + width, top + height)
    end

    # Snap an almost-horizontal or almost-vertical segment onto its axis.
    #
    # Horizontal: both y values become their mean and the endpoints are ordered
    # so that x1 <= x2. Vertical: both x values become their mean and the
    # endpoints are ordered so that y1 <= y2. Oblique segments are untouched.
    #
    # @return [Ruling] self
    def normalize!
      if horizontal?
        mid_y = (y1 + y2) / 2.0
        @y1 = mid_y
        @y2 = mid_y
        @x1, @x2 = @x2, @x1 if x1 > x2
      elsif vertical?
        mid_x = (x1 + x2) / 2.0
        @x1 = mid_x
        @x2 = mid_x
        @y1, @y2 = @y2, @y1 if y1 > y2
      end
      self
    end

    def horizontal?
      (y2 - y1).abs <= ORIENTATION_TOLERANCE
    end

    def vertical?
      (x2 - x1).abs <= ORIENTATION_TOLERANCE
    end

    # True when the segment is neither horizontal nor vertical.
    def oblique?
      !horizontal? && !vertical?
    end

    # Coordinate perpendicular to the line: y1 for horizontal rulings,
    # x1 for everything else.
    def position
      horizontal? ? y1 : x1
    end

    # Move the ruling perpendicular to its direction. For a horizontal ruling
    # both y values are set; otherwise both x values are set.
    #
    # The value is coerced with #to_f, as in #initialize, so that a ruling
    # moved to an Integer position still hashes like an equal ruling built
    # from Floats.
    def position=(value)
      coordinate = value.to_f
      if horizontal?
        @y1 = coordinate
        @y2 = coordinate
      else
        @x1 = coordinate
        @x2 = coordinate
      end
    end

    # First coordinate along the line direction (x1 if horizontal, else y1).
    def start
      horizontal? ? x1 : y1
    end

    # Last coordinate along the line direction (x2 if horizontal, else y2).
    def end
      horizontal? ? x2 : y2
    end

    # Euclidean distance between the two endpoints.
    def length
      Math.sqrt(((x2 - x1)**2) + ((y2 - y1)**2))
    end

    def top
      [y1, y2].min
    end

    def bottom
      [y1, y2].max
    end

    def left
      [x1, x2].min
    end

    def right
      [x1, x2].max
    end

    def width
      right - left
    end

    def height
      bottom - top
    end

    # Bounding box, built as Rectangle.new(top, left, width, height).
    def bounds
      Rectangle.new(top, left, width, height)
    end

    # First endpoint as a Point.
    def p1
      Point.new(x1, y1)
    end

    # Second endpoint as a Point.
    def p2
      Point.new(x2, y2)
    end

    # Direction of the segment from (x1, y1) to (x2, y2) in degrees, computed
    # with atan2 (0 for a normalized horizontal ruling, 90 for a normalized
    # vertical one).
    def angle
      Math.atan2(y2 - y1, x2 - x1) * 180.0 / Math::PI
    end

    # Point where the infinite lines through this ruling and +other+ cross.
    #
    # Only defined for one horizontal and one non-horizontal, non-oblique
    # ruling; returns nil otherwise. The point is not checked against the
    # segment extents — see #intersects? for that.
    #
    # @return [Point, nil]
    def intersection_point(other)
      return nil if horizontal? == other.horizontal?
      return nil if oblique? || other.oblique?

      horizontal? ? Point.new(other.x1, y1) : Point.new(x1, other.y1)
    end

    # Whether the crossing point of the two rulings lies on both segments,
    # with each segment extended by +tolerance+ at both ends.
    def intersects?(other, tolerance = INTERSECTION_TOLERANCE)
      crossing = intersection_point(other)
      return false unless crossing

      if horizontal?
        crossing.x.between?(left - tolerance, right + tolerance) &&
          crossing.y.between?(other.top - tolerance, other.bottom + tolerance)
      else
        crossing.y.between?(top - tolerance, bottom + tolerance) &&
          crossing.x.between?(other.left - tolerance, other.right + tolerance)
      end
    end

    # Same test as #intersects?; kept as a separate name for cell detection.
    def nearly_intersects?(other, tolerance = INTERSECTION_TOLERANCE)
      intersects?(other, tolerance)
    end

    # Return a new ruling lengthened by +amount+ at each end. Oblique rulings
    # are extended along their own direction.
    def expand(amount)
      if horizontal?
        Ruling.new(x1 - amount, y1, x2 + amount, y2)
      elsif vertical?
        Ruling.new(x1, y1 - amount, x2, y2 + amount)
      else
        span = length
        dx = (x2 - x1) / span * amount
        dy = (y2 - y1) / span * amount
        Ruling.new(x1 - dx, y1 - dy, x2 + dx, y2 + dy)
      end
    end

    # Clip this ruling to +rect+; delegates to CohenSutherlandClipping.clip.
    def clip_to(rect)
      CohenSutherlandClipping.clip(self, rect)
    end

    # Whether +other+ has the same horizontal-ness as this ruling and its
    # perpendicular coordinate differs by less than +tolerance+.
    #
    # Only the perpendicular coordinate is compared: two rulings on the same
    # line are colinear even if their extents along the line do not overlap.
    def colinear_with?(other, tolerance = 1.0)
      return false unless horizontal? == other.horizontal?

      offset = horizontal? ? y1 - other.y1 : x1 - other.x1
      offset.abs < tolerance
    end

    # Two rulings are equal when all four coordinates are equal.
    def ==(other)
      return false unless other.is_a?(Ruling)

      x1 == other.x1 && y1 == other.y1 && x2 == other.x2 && y2 == other.y2
    end
    alias eql? ==

    def hash
      [x1, y1, x2, y2].hash
    end

    # Copy built through Ruling.new, so the copy is normalized again.
    def dup
      Ruling.new(x1, y1, x2, y2)
    end

    # e.g. "Ruling[H](0.0, 5.0) -> (10.0, 5.0)"; the tag is H, V or O (oblique).
    def to_s
      orientation =
        if horizontal?
          'H'
        elsif vertical?
          'V'
        else
          'O'
        end
      "Ruling[#{orientation}](#{x1}, #{y1}) -> (#{x2}, #{y2})"
    end

    def inspect
      to_s
    end

    class << self
      # Collect the intersection points between every horizontal/vertical pair.
      #
      # Every pair is tested, so the work is proportional to
      # horizontal_rulings.size * vertical_rulings.size. Points whose
      # coordinates agree after rounding to two decimals are reported once
      # (the first one found is kept).
      #
      # @return [Array<Point>]
      def find_intersections(horizontal_rulings, vertical_rulings)
        intersections = {}

        horizontal_rulings.each do |h|
          vertical_rulings.each do |v|
            next unless h.intersects?(v)

            point = h.intersection_point(v)
            next unless point

            # Round to avoid floating point issues
            key = [point.x.round(2), point.y.round(2)]
            intersections[key] ||= point
          end
        end

        intersections.values
      end

      # Merge rulings that lie on (nearly) the same line.
      #
      # Horizontal rulings are returned first, then vertical ones; oblique
      # rulings are dropped. Each ruling is classified exactly once — a
      # segment that passes both orientation tests counts as horizontal — so
      # it cannot appear in the result twice.
      #
      # @return [Array<Ruling>]
      def collapse_oriented_rulings(rulings, tolerance = 1.0)
        return [] if rulings.empty?

        horizontal, remainder = rulings.partition(&:horizontal?)
        vertical = remainder.select(&:vertical?)

        collapse_group(horizontal.sort_by(&:y1), tolerance) +
          collapse_group(vertical.sort_by(&:x1), tolerance)
      end

      # Clip every ruling to +rect+, discarding nil results of #clip_to.
      def crop_to_area(rulings, rect)
        rulings.filter_map { |r| r.clip_to(rect) }
      end

      private

      # +rulings+ must already be sorted by perpendicular position. Runs of
      # neighbours that are colinear (each with the one before it) are merged
      # into one ruling at the mean position, spanning the full extent of the
      # run — including any gaps between its members.
      def collapse_group(rulings, tolerance)
        return [] if rulings.empty?

        runs = rulings.slice_when { |previous, current| !previous.colinear_with?(current, tolerance) }
        runs.map { |run| merge_run(run) }
      end

      # Replace a run of colinear rulings with a single ruling.
      def merge_run(run)
        if run.first.horizontal?
          y = run.sum(&:y1) / run.size
          Ruling.new(run.map(&:left).min, y, run.map(&:right).max, y)
        else
          x = run.sum(&:x1) / run.size
          Ruling.new(x, run.map(&:top).min, x, run.map(&:bottom).max)
        end
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # Grid-based spatial index for rectangle queries.
  #
  # Each added rectangle is registered in every grid cell (a square of side
  # +cell_size+) touched by its left/right/top/bottom extent. Queries only
  # examine the rectangles registered in the cells touched by the query, and
  # then apply the exact geometric predicate to those candidates.
  class SpatialIndex
    # Shared, frozen stand-in for a grid cell that holds no rectangles, so
    # that read-only queries never insert anything into the grid.
    EMPTY_BUCKET = [].freeze
    private_constant :EMPTY_BUCKET

    # All rectangles added so far, in insertion order.
    attr_reader :rectangles

    # @param cell_size [Numeric] side length of a grid cell (default 50)
    # @raise [ArgumentError] if cell_size is not a positive number
    def initialize(cell_size: 50.0)
      unless cell_size.is_a?(Numeric) && cell_size.positive?
        raise ArgumentError, "cell_size must be a positive number, got #{cell_size.inspect}"
      end

      @cell_size = cell_size
      @grid = Hash.new { |grid, cell| grid[cell] = [] }
      @rectangles = []
    end

    # Add a rectangle to the index
    # @param rectangle [Rectangle] rectangle to add
    # @return [SpatialIndex] self
    def add(rectangle)
      @rectangles << rectangle
      cells_for(rectangle).each { |cell| @grid[cell] << rectangle }
      self
    end

    # Add multiple rectangles
    # @param rectangles [Array<Rectangle>] rectangles to add
    # @return [SpatialIndex] self
    def add_all(rectangles)
      rectangles.each { |rectangle| add(rectangle) }
      self
    end

    # Find all rectangles that intersect with the query rectangle
    # @param query [Rectangle] query rectangle
    # @return [Array<Rectangle>] indexed rectangles r for which r.intersects?(query)
    def intersects(query)
      candidate_set(query).select { |candidate| candidate.intersects?(query) }
    end

    # Find all rectangles that are fully contained within the query rectangle
    # @param query [Rectangle] query rectangle
    # @return [Array<Rectangle>] indexed rectangles r for which query.contains?(r)
    def contains(query)
      candidate_set(query).select { |candidate| query.contains?(candidate) }
    end

    # Find all rectangles that contain the query point
    # @param point [Point] query point
    # @return [Array<Rectangle>] indexed rectangles r for which r.contains_point?(point)
    def at_point(point)
      bucket(cell_for_point(point)).select { |candidate| candidate.contains_point?(point) }
    end

    # Find rectangles intersecting the query rectangle grown by +distance+ on
    # every side.
    # NOTE(review): relies on Rectangle.from_bounds accepting
    # (top, left, bottom, right) — confirm against Rectangle.
    # @param query [Rectangle] query rectangle
    # @param distance [Float] amount to grow the query by on each side
    # @return [Array<Rectangle>] nearby rectangles
    def nearby(query, distance)
      grown = Rectangle.from_bounds(
        query.top - distance,
        query.left - distance,
        query.bottom + distance,
        query.right + distance
      )
      intersects(grown)
    end

    # Bounding box of all indexed rectangles, as computed by
    # Rectangle.bounding_box_of.
    def bounds
      Rectangle.bounding_box_of(@rectangles)
    end

    # Number of indexed rectangles
    def size
      @rectangles.size
    end

    def empty?
      @rectangles.empty?
    end

    # Remove every rectangle and grid cell.
    # @return [SpatialIndex] self
    def clear
      @grid.clear
      @rectangles.clear
      self
    end

    private

    # Rectangles registered in +cell+. Uses #fetch so a lookup of an unused
    # cell does not trigger the grid's auto-creating default block.
    def bucket(cell)
      @grid.fetch(cell, EMPTY_BUCKET)
    end

    # [col, row] of the grid cell containing +point+.
    def cell_for_point(point)
      [(point.x / @cell_size).floor, (point.y / @cell_size).floor]
    end

    # Every [col, row] cell touched by the rectangle's extent, column-major.
    def cells_for(rectangle)
      cols = (rectangle.left / @cell_size).floor..(rectangle.right / @cell_size).floor
      rows = (rectangle.top / @cell_size).floor..(rectangle.bottom / @cell_size).floor

      cols.flat_map { |col| rows.map { |row| [col, row] } }
    end

    # Distinct rectangles registered in any cell touched by +query+.
    def candidate_set(query)
      cells_for(query).flat_map { |cell| bucket(cell) }.uniq
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  module Detectors
    # Base class for table detection algorithms.
    #
    # Subclasses override #detect; the class-level .detect is a convenience
    # that builds an instance from the given options and runs it on one page.
    class DetectionAlgorithm
      # Detect table areas on a page
      # @param page [Page] page to detect tables on
      # @param options [Hash] algorithm-specific options, passed to .new
      # @return [Array<Rectangle>] detected table areas
      def self.detect(page, **options)
        new(**options).detect(page)
      end

      # @param options [Hash] algorithm options, stored in @options
      def initialize(**options)
        @options = options
      end

      # Detect table areas on a page
      # @param page [Page] page to detect tables on
      # @return [Array<Rectangle>]
      # @raise [NotImplementedError] always, unless overridden by a subclass
      def detect(page)
        raise NotImplementedError, 'Subclasses must implement #detect'
      end

      # Get algorithm name: the class name without its enclosing namespaces.
      #
      # An anonymous subclass (Class.new(DetectionAlgorithm)) has no name of
      # its own, so the nearest named ancestor class is used instead of
      # failing on nil.
      # @return [String]
      def name
        named_class = self.class.ancestors.find { |ancestor| ancestor.is_a?(Class) && ancestor.name }
        named_class.name.split('::').last
      end
    end
  end
end
@@ -0,0 +1,211 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  module Detectors
    # Nurminen's table detection algorithm.
    # Based on Anssi Nurminen's master's thesis approach.
    #
    # Combines two sources: table areas reported by SpreadsheetDetection
    # (ruling based) and areas inferred from runs of text lines whose chunk
    # edges line up with frequently occurring x positions.
    class Nurminen < DetectionAlgorithm
      # Text edge types
      EDGE_LEFT = 0
      EDGE_MID = 1
      EDGE_RIGHT = 2

      # Minimum rows for a valid table
      MIN_ROWS = 2

      # A text-based table is dropped when its overlap_ratio with an already
      # accepted table reaches this value.
      OVERLAP_THRESHOLD = 0.9

      # Detect table areas on a page
      # @param page [Page] page to detect tables on
      # @return [Array<Rectangle>] ruling-based areas first, followed by the
      #   text-based areas that do not duplicate an area already in the list
      def detect(page)
        tables = []
        tables.concat(detect_from_rulings(page))

        detect_from_text(page).each do |candidate|
          tables << candidate unless overlaps_existing?(candidate, tables)
        end

        tables
      end

      private

      # Ruling-based detection, delegated to SpreadsheetDetection.
      def detect_from_rulings(page)
        SpreadsheetDetection.detect(page)
      end

      # Text-based detection. Returns [] when the page has fewer than MIN_ROWS
      # text lines, no text edges at all, or no sufficiently frequent edge
      # position.
      def detect_from_text(page)
        lines = page.text_lines
        return [] if lines.size < MIN_ROWS

        edges = find_text_edges(lines)
        # The hash always has its three keys, so test the lists themselves.
        return [] if edges.values.all?(&:empty?)

        relevant = find_relevant_edges(edges)
        return [] if relevant.empty?

        detect_tables_from_edges(lines, relevant, page)
      end

      # Build a left, centre and right TextEdge for every chunk of every line.
      # @return [Hash{Integer => Array<TextEdge>}] keyed by EDGE_LEFT/MID/RIGHT
      def find_text_edges(lines)
        edges = { EDGE_LEFT => [], EDGE_MID => [], EDGE_RIGHT => [] }

        lines.each do |line|
          line.sorted_chunks.each do |chunk|
            center = chunk.left + (chunk.width / 2.0)

            edges[EDGE_LEFT] << TextEdge.new(chunk.left, line.top, line.bottom, EDGE_LEFT)
            edges[EDGE_MID] << TextEdge.new(center, line.top, line.bottom, EDGE_MID)
            edges[EDGE_RIGHT] << TextEdge.new(chunk.right, line.top, line.bottom, EDGE_RIGHT)
          end
        end

        edges
      end

      # Cluster all edges (of every type together) by x position and keep the
      # cluster positions that occur often enough: at least twice, and at
      # least 10% of the total number of edges (rounded up).
      # @return [Array<Numeric>] mean x of each retained cluster
      def find_relevant_edges(edges)
        all_edges = edges.values.flatten
        return [] if all_edges.empty?

        clustered = cluster_edges(all_edges)
        return [] if clustered.empty?

        min_occurrences = [2, (all_edges.size * 0.1).ceil].max
        clustered.select { |_, count| count >= min_occurrences }.keys
      end

      # Group edges sorted by x: an edge joins the current cluster when it is
      # within +tolerance+ of the previous edge, otherwise it starts a new one.
      # @return [Hash{Float => Integer}] mean x of each cluster => its size
      def cluster_edges(edges, tolerance: 8.0)
        return {} if edges.empty?

        edges.sort_by(&:x)
             .slice_when { |previous, current| (current.x - previous.x).abs > tolerance }
             .each_with_object({}) do |cluster, clusters|
          # to_f keeps the mean exact even when x values are Integers.
          clusters[cluster.sum(&:x) / cluster.size.to_f] = cluster.size
        end
      end

      # Turn every run of at least MIN_ROWS consecutive aligned lines into a
      # table area. Needs at least two column positions to consider a table.
      def detect_tables_from_edges(lines, edge_positions, page)
        return [] if edge_positions.size < 2

        runs = lines.chunk do |line|
          edges_aligned_with_columns?(extract_line_edges(line), edge_positions)
        end

        runs.filter_map do |aligned, run|
          create_table_bounds(run, page) if aligned && run.size >= MIN_ROWS
        end
      end

      # Left and right x of every chunk in the line, in chunk order.
      def extract_line_edges(line)
        line.sorted_chunks.flat_map { |chunk| [chunk.left, chunk.right] }
      end

      # Whether at least +min_ratio+ (30% by default) of the line's edges lie
      # within +tolerance+ of some column position. An empty line never aligns.
      def edges_aligned_with_columns?(line_edges, column_positions, tolerance: 10.0, min_ratio: 0.3)
        return false if line_edges.empty?

        aligned_count = line_edges.count do |edge|
          column_positions.any? { |column| (edge - column).abs <= tolerance }
        end

        aligned_count >= (line_edges.size * min_ratio)
      end

      # Bounding box of the lines, padded by 2.0 on every side and clamped to
      # [0, page.height] x [0, page.width].
      # NOTE(review): relies on Rectangle.from_bounds accepting
      # (top, left, bottom, right) — confirm against Rectangle.
      # @return [Rectangle, nil] nil when there are no lines or no bounding box
      def create_table_bounds(lines, page)
        return nil if lines.empty?

        bounds = Rectangle.bounding_box_of(lines)
        return nil unless bounds

        padding = 2.0
        Rectangle.from_bounds(
          [bounds.top - padding, 0].max,
          [bounds.left - padding, 0].max,
          [bounds.bottom + padding, page.height].min,
          [bounds.right + padding, page.width].min
        )
      end

      # True when some existing table's overlap_ratio with +table+ is at least
      # OVERLAP_THRESHOLD.
      def overlaps_existing?(table, existing_tables)
        existing_tables.any? { |existing| existing.overlap_ratio(table) >= OVERLAP_THRESHOLD }
      end

      # Helper class for text edges: an x position, the top/bottom of the line
      # it came from, and its type (EDGE_LEFT, EDGE_MID or EDGE_RIGHT).
      class TextEdge
        attr_reader :x, :top, :bottom, :type

        def initialize(x, top, bottom, type)
          @x = x
          @top = top
          @bottom = bottom
          @type = type
        end
      end
    end
  end
end