tabula-rb 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +39 -0
- data/CHANGELOG.md +59 -0
- data/LICENSE +21 -0
- data/README.md +176 -0
- data/Rakefile +28 -0
- data/exe/tabula +7 -0
- data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
- data/lib/tabula/algorithms/projection_profile.rb +109 -0
- data/lib/tabula/cli.rb +271 -0
- data/lib/tabula/configuration.rb +119 -0
- data/lib/tabula/core/point.rb +60 -0
- data/lib/tabula/core/rectangle.rb +218 -0
- data/lib/tabula/core/ruling.rb +303 -0
- data/lib/tabula/core/spatial_index.rb +120 -0
- data/lib/tabula/detectors/detection_algorithm.rb +34 -0
- data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
- data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
- data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
- data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
- data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
- data/lib/tabula/pdf/object_extractor.rb +400 -0
- data/lib/tabula/pdf/page.rb +230 -0
- data/lib/tabula/pdf/text_stripper.rb +150 -0
- data/lib/tabula/table/cell.rb +110 -0
- data/lib/tabula/table/table.rb +184 -0
- data/lib/tabula/text/line.rb +133 -0
- data/lib/tabula/text/text_chunk.rb +185 -0
- data/lib/tabula/text/text_element.rb +120 -0
- data/lib/tabula/version.rb +5 -0
- data/lib/tabula/writers/csv_writer.rb +49 -0
- data/lib/tabula/writers/json_writer.rb +41 -0
- data/lib/tabula/writers/markdown_writer.rb +71 -0
- data/lib/tabula/writers/tsv_writer.rb +35 -0
- data/lib/tabula/writers/writer.rb +39 -0
- data/lib/tabula.rb +160 -0
- data/mise.toml +2 -0
- data/tabula-rb.gemspec +44 -0
- metadata +115 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Tabula
  # A straight line segment on a PDF page, stored as two Float endpoints
  # (x1, y1) -> (x2, y2). Used for detecting table cell boundaries in
  # lattice-mode extraction.
  #
  # A segment whose endpoints differ by at most ORIENTATION_TOLERANCE on the
  # y axis counts as horizontal; on the x axis, as vertical. A very short
  # segment can satisfy both tests; throughout this class the horizontal test
  # is evaluated first, so such a segment is handled as horizontal.
  class Ruling
    # Tolerance for considering lines as horizontal/vertical
    ORIENTATION_TOLERANCE = 1.0

    # Tolerance for near-intersection detection
    INTERSECTION_TOLERANCE = 1.0

    attr_accessor :x1, :y1, :x2, :y2

    # Coordinates are coerced with #to_f and the segment is normalized
    # immediately (see #normalize!).
    def initialize(x1, y1, x2, y2)
      @x1 = x1.to_f
      @y1 = y1.to_f
      @x2 = x2.to_f
      @y2 = y2.to_f
      normalize!
    end

    # Create from two points (objects responding to #x and #y).
    def self.from_points(p1, p2)
      new(p1.x, p1.y, p2.x, p2.y)
    end

    # Create the segment running from (left, top) to
    # (left + width, top + height).
    def self.from_bounds(top, left, width, height)
      new(left, top, left + width, top + height)
    end

    # Snap an almost-horizontal or almost-vertical segment onto its axis and
    # order its endpoints ascending along that axis. Oblique segments are left
    # untouched.
    #
    # @return [Ruling] self
    def normalize!
      if horizontal?
        avg_y = (y1 + y2) / 2.0
        @y1 = avg_y
        @y2 = avg_y
        # Ensure x1 < x2
        @x1, @x2 = @x2, @x1 if x1 > x2
      elsif vertical?
        avg_x = (x1 + x2) / 2.0
        @x1 = avg_x
        @x2 = avg_x
        # Ensure y1 < y2
        @y1, @y2 = @y2, @y1 if y1 > y2
      end
      self
    end

    def horizontal?
      (y2 - y1).abs <= ORIENTATION_TOLERANCE
    end

    def vertical?
      (x2 - x1).abs <= ORIENTATION_TOLERANCE
    end

    def oblique?
      !horizontal? && !vertical?
    end

    # Position perpendicular to the line (y for horizontal, x otherwise)
    def position
      horizontal? ? y1 : x1
    end

    # Move the segment perpendicular to its direction. For any segment that is
    # not horizontal (including oblique ones) both x coordinates are set.
    def position=(value)
      if horizontal?
        @y1 = value
        @y2 = value
      else
        @x1 = value
        @x2 = value
      end
    end

    # Start coordinate along the line direction (x1 for horizontal, y1 otherwise)
    def start
      horizontal? ? x1 : y1
    end

    # End coordinate along the line direction (x2 for horizontal, y2 otherwise)
    def end
      horizontal? ? x2 : y2
    end

    # Euclidean distance between the two endpoints.
    def length
      Math.sqrt(((x2 - x1)**2) + ((y2 - y1)**2))
    end

    def top
      [y1, y2].min
    end

    def bottom
      [y1, y2].max
    end

    def left
      [x1, x2].min
    end

    def right
      [x1, x2].max
    end

    def width
      right - left
    end

    def height
      bottom - top
    end

    # Get bounding rectangle
    def bounds
      Rectangle.new(top, left, width, height)
    end

    # Get start and end points as Point objects
    def p1
      Point.new(x1, y1)
    end

    def p2
      Point.new(x2, y2)
    end

    # Direction of the segment in degrees, computed with atan2 from the
    # endpoint deltas (0 for a normalized horizontal, 90 for a normalized
    # vertical segment).
    def angle
      Math.atan2(y2 - y1, x2 - x1) * 180.0 / Math::PI
    end

    # Point where the infinite lines through this ruling and +other+ cross.
    # Only defined for one horizontal and one vertical ruling.
    #
    # @return [Point, nil] nil when both share the same horizontal-ness or
    #   either ruling is oblique
    def intersection_point(other)
      return nil if horizontal? == other.horizontal?
      return nil if oblique? || other.oblique?

      if horizontal?
        Point.new(other.x1, y1)
      else
        Point.new(x1, other.y1)
      end
    end

    # Check if this ruling intersects another: the crossing point must lie
    # within both segments, each extended by +tolerance+ at both ends.
    def intersects?(other, tolerance = INTERSECTION_TOLERANCE)
      point = intersection_point(other)
      return false unless point

      if horizontal?
        x_in_self = point.x.between?(left - tolerance, right + tolerance)
        y_in_other = point.y.between?(other.top - tolerance, other.bottom + tolerance)
        x_in_self && y_in_other
      else
        y_in_self = point.y.between?(top - tolerance, bottom + tolerance)
        x_in_other = point.x.between?(other.left - tolerance, other.right + tolerance)
        y_in_self && x_in_other
      end
    end

    # Check if lines nearly intersect (for cell detection).
    # Currently identical to #intersects?.
    def nearly_intersects?(other, tolerance = INTERSECTION_TOLERANCE)
      intersects?(other, tolerance)
    end

    # Return a new ruling lengthened by +amount+ at both ends along its
    # direction. The receiver is not modified.
    def expand(amount)
      if horizontal?
        Ruling.new(x1 - amount, y1, x2 + amount, y2)
      elsif vertical?
        Ruling.new(x1, y1 - amount, x2, y2 + amount)
      else
        # For oblique lines, move each endpoint outwards along the unit
        # direction vector. length is > 0 here because an oblique segment
        # exceeds ORIENTATION_TOLERANCE on both axes.
        dx = (x2 - x1) / length * amount
        dy = (y2 - y1) / length * amount
        Ruling.new(x1 - dx, y1 - dy, x2 + dx, y2 + dy)
      end
    end

    # Clip ruling to a rectangular area
    def clip_to(rect)
      CohenSutherlandClipping.clip(self, rect)
    end

    # Check whether +other+ lies on (nearly) the same line: both rulings must
    # agree on horizontal-ness and their perpendicular positions must differ
    # by less than +tolerance+. Extent along the line is NOT compared, so two
    # colinear rulings may still be far apart.
    def colinear_with?(other, tolerance = 1.0)
      return false unless horizontal? == other.horizontal?

      if horizontal?
        (y1 - other.y1).abs < tolerance
      else
        (x1 - other.x1).abs < tolerance
      end
    end

    # Value equality on the four coordinates.
    def ==(other)
      return false unless other.is_a?(Ruling)

      x1 == other.x1 && y1 == other.y1 && x2 == other.x2 && y2 == other.y2
    end
    alias eql? ==

    def hash
      [x1, y1, x2, y2].hash
    end

    def dup
      Ruling.new(x1, y1, x2, y2)
    end

    def to_s
      orientation = if horizontal?
                      'H'
                    else
                      (vertical? ? 'V' : 'O')
                    end
      "Ruling[#{orientation}](#{x1}, #{y1}) -> (#{x2}, #{y2})"
    end

    def inspect
      to_s
    end

    class << self
      # Find all intersection points between horizontal and vertical rulings.
      # Every horizontal ruling is tested against every vertical ruling
      # (pairwise, i.e. horizontal count x vertical count checks). Points whose
      # coordinates agree after rounding to 2 decimals are reported once.
      #
      # @return [Array<Point>]
      def find_intersections(horizontal_rulings, vertical_rulings)
        intersections = {}

        horizontal_rulings.each do |h|
          vertical_rulings.each do |v|
            next unless h.intersects?(v)

            point = h.intersection_point(v)
            next unless point

            # Round to avoid floating point issues
            key = [point.x.round(2), point.y.round(2)]
            intersections[key] ||= point
          end
        end

        intersections.values
      end

      # Collapse colinear rulings that are close together.
      #
      # Rulings are merged only when they are colinear (see #colinear_with?)
      # AND overlap or lie within +tolerance+ of each other along the line, so
      # separate segments that merely share a y (or x) are kept apart. Each
      # input ruling is classified once: horizontal first, then vertical;
      # oblique rulings are dropped.
      #
      # @param rulings [Array<Ruling>]
      # @param tolerance [Float] maximum perpendicular offset and maximum gap
      #   along the line for two rulings to be merged
      # @return [Array<Ruling>] merged horizontals followed by merged verticals
      def collapse_oriented_rulings(rulings, tolerance = 1.0)
        return [] if rulings.empty?

        # partition (rather than two independent selects) keeps a tiny segment
        # that is both horizontal? and vertical? from being emitted twice.
        horizontal, others = rulings.partition(&:horizontal?)
        vertical = others.select(&:vertical?)

        collapse_group(horizontal.sort_by(&:y1), tolerance) +
          collapse_group(vertical.sort_by(&:x1), tolerance)
      end

      # Crop rulings to a rectangular area, discarding rulings for which
      # clipping returns nil/false.
      def crop_to_area(rulings, rect)
        rulings.filter_map { |r| r.clip_to(rect) }
      end

      private

      # Merge same-orientation rulings, pre-sorted by perpendicular position.
      def collapse_group(rulings, tolerance)
        return [] if rulings.empty?

        rulings
          .slice_when { |previous, current| !previous.colinear_with?(current, tolerance) }
          .flat_map { |band| merge_band(band, tolerance) }
      end

      # Within one band of colinear rulings, join those that overlap or are
      # separated by at most +tolerance+ along the line.
      def merge_band(band, tolerance)
        horizontal = band.first.horizontal?
        near, far = horizontal ? %i[left right] : %i[top bottom]

        runs = []
        reach = nil # furthest coordinate covered by the current run
        band.sort_by(&near).each do |ruling|
          if reach && ruling.public_send(near) <= reach + tolerance
            runs.last << ruling
            reach = [reach, ruling.public_send(far)].max
          else
            runs << [ruling]
            reach = ruling.public_send(far)
          end
        end

        runs.map { |run| merge_run(run, horizontal) }
      end

      # Build a single ruling spanning +run+, placed at the mean
      # perpendicular position of its members.
      def merge_run(run, horizontal)
        if horizontal
          y = run.sum(&:y1).fdiv(run.size)
          Ruling.new(run.map(&:left).min, y, run.map(&:right).max, y)
        else
          x = run.sum(&:x1).fdiv(run.size)
          Ruling.new(x, run.map(&:top).min, x, run.map(&:bottom).max)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Tabula
  # Grid-based spatial index for rectangle queries.
  #
  # The plane is divided into square cells of +cell_size+; every indexed
  # rectangle is stored in the bucket of each cell its bounds touch. A query
  # only inspects the buckets of the cells the query itself touches and then
  # filters those candidates with the exact geometric predicate.
  class SpatialIndex
    # Shared, frozen stand-in for cells nothing was ever added to. Using it in
    # queries avoids inserting empty buckets into the grid on every miss.
    EMPTY_BUCKET = [].freeze
    private_constant :EMPTY_BUCKET

    attr_reader :rectangles

    # @param cell_size [Numeric] size of grid cells (default 50); must be > 0
    # @raise [ArgumentError] if cell_size is not a positive number
    def initialize(cell_size: 50.0)
      unless cell_size.is_a?(Numeric) && cell_size.positive?
        raise ArgumentError, "cell_size must be a positive number (got #{cell_size.inspect})"
      end

      @cell_size = cell_size
      @grid = Hash.new { |h, k| h[k] = [] }
      @rectangles = []
    end

    # Add a rectangle to the index
    # @param rectangle [Rectangle] rectangle to add (must respond to
    #   #left, #right, #top and #bottom)
    # @return [SpatialIndex] self, for chaining
    def add(rectangle)
      @rectangles << rectangle
      cells_for(rectangle).each do |cell|
        @grid[cell] << rectangle
      end
      self
    end

    # Add multiple rectangles
    # @param rectangles [Array<Rectangle>] rectangles to add
    # @return [SpatialIndex] self, for chaining
    def add_all(rectangles)
      rectangles.each { |r| add(r) }
      self
    end

    # Find all rectangles that intersect with the query rectangle
    # @param query [Rectangle] query rectangle
    # @return [Array<Rectangle>] intersecting rectangles
    def intersects(query)
      candidates = candidate_set(query)
      candidates.select { |r| r.intersects?(query) }
    end

    # Find all rectangles that are fully contained within the query rectangle
    # @param query [Rectangle] query rectangle
    # @return [Array<Rectangle>] contained rectangles
    def contains(query)
      candidates = candidate_set(query)
      candidates.select { |r| query.contains?(r) }
    end

    # Find all rectangles that contain the query point
    # @param point [Point] query point
    # @return [Array<Rectangle>] rectangles containing the point
    def at_point(point)
      cell = cell_for_point(point)
      @grid.fetch(cell, EMPTY_BUCKET).select { |r| r.contains_point?(point) }
    end

    # Find rectangles within a given distance of the query rectangle
    # @param query [Rectangle] query rectangle
    # @param distance [Float] maximum distance
    # @return [Array<Rectangle>] nearby rectangles
    def nearby(query, distance)
      # The query is grown by +distance+ on every side. Rectangle.from_bounds
      # is passed (top, left, bottom, right) here.
      expanded = Rectangle.from_bounds(
        query.top - distance,
        query.left - distance,
        query.bottom + distance,
        query.right + distance
      )
      intersects(expanded)
    end

    # Compute bounding box of all indexed rectangles
    # @return [Rectangle, nil] bounding box or nil if empty
    def bounds
      Rectangle.bounding_box_of(@rectangles)
    end

    # Number of indexed rectangles
    def size
      @rectangles.size
    end

    def empty?
      @rectangles.empty?
    end

    # Clear all indexed rectangles
    # @return [SpatialIndex] self, for chaining
    def clear
      @grid.clear
      @rectangles.clear
      self
    end

    private

    # Grid cell ([col, row]) that contains +point+.
    def cell_for_point(point)
      col = (point.x / @cell_size).floor
      row = (point.y / @cell_size).floor
      [col, row]
    end

    # All grid cells ([col, row] pairs) touched by the rectangle's bounds.
    def cells_for(rectangle)
      min_col = (rectangle.left / @cell_size).floor
      max_col = (rectangle.right / @cell_size).floor
      min_row = (rectangle.top / @cell_size).floor
      max_row = (rectangle.bottom / @cell_size).floor

      cells = []
      (min_col..max_col).each do |col|
        (min_row..max_row).each do |row|
          cells << [col, row]
        end
      end
      cells
    end

    # Distinct rectangles stored in any cell the query touches. Uses fetch so
    # that looking at an unused cell does not create a bucket for it.
    def candidate_set(query)
      cells_for(query).flat_map { |cell| @grid.fetch(cell, EMPTY_BUCKET) }.uniq
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Tabula
  module Detectors
    # Abstract parent of every table detector. Concrete detectors inherit
    # from it and supply their own #detect.
    class DetectionAlgorithm
      class << self
        # One-shot helper: instantiate the detector with +options+ and run it.
        # @param page [Page] page to scan for tables
        # @param options [Hash] algorithm-specific options, passed to .new
        # @return [Array<Rectangle>] detected table areas
        def detect(page, **options)
          detector = new(**options)
          detector.detect(page)
        end
      end

      # Stores the given keyword options in @options.
      # @param options [Hash] algorithm options
      def initialize(**options)
        @options = options
      end

      # Locate table areas on +page+. This base implementation always raises.
      # @param page [Page] page to scan for tables
      # @return [Array<Rectangle>]
      # @raise [NotImplementedError] always, unless overridden
      def detect(page)
        raise NotImplementedError, 'Subclasses must implement #detect'
      end

      # Class name of the detector with any enclosing namespaces removed.
      # @return [String]
      def name
        qualified = self.class.name
        qualified.rpartition('::').last
      end
    end
  end
end
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Tabula
  module Detectors
    # Nurminen's table detection algorithm.
    # Based on Anssi Nurminen's master's thesis approach.
    # Combines ruling-based detection (delegated to SpreadsheetDetection) with
    # a text-based pass that looks for runs of consecutive text lines whose
    # chunk edges line up with frequently occurring x positions.
    class Nurminen < DetectionAlgorithm
      # Text edge types
      EDGE_LEFT = 0
      EDGE_MID = 1
      EDGE_RIGHT = 2

      # Minimum rows for a valid table
      MIN_ROWS = 2

      # Overlap threshold for duplicate detection
      OVERLAP_THRESHOLD = 0.9

      # Detect table areas on a page
      # @param page [Page] page to detect tables on
      # @return [Array<Rectangle>] ruling-based tables first, followed by
      #   text-based tables that do not duplicate an already accepted table
      def detect(page)
        tables = []

        # First, try ruling-based detection
        tables.concat(detect_from_rulings(page))

        # Then add text-based results, skipping any that overlap a table
        # accepted so far (including earlier text-based ones).
        detect_from_text(page).each do |text_table|
          tables << text_table unless overlaps_existing?(text_table, tables)
        end

        tables
      end

      private

      # Ruling-based pass, delegated to SpreadsheetDetection.
      def detect_from_rulings(page)
        SpreadsheetDetection.detect(page)
      end

      # Text-based pass over page.text_lines.
      # @return [Array<Rectangle>] empty when there are too few lines or no
      #   usable column positions
      def detect_from_text(page)
        lines = page.text_lines
        return [] if lines.size < MIN_ROWS

        # Find text edges
        edges = find_text_edges(lines)
        return [] if edges.empty?

        # Find relevant edges (frequently occurring x positions)
        relevant = find_relevant_edges(edges)
        return [] if relevant.empty?

        # Detect tables from edge patterns
        detect_tables_from_edges(lines, relevant, page)
      end

      # Collect a left, centre and right TextEdge for every chunk of every
      # line, grouped by edge type.
      # @return [Hash{Integer => Array<TextEdge>}]
      def find_text_edges(lines)
        edges = { EDGE_LEFT => [], EDGE_MID => [], EDGE_RIGHT => [] }

        lines.each do |line|
          line.sorted_chunks.each do |chunk|
            # Left edge
            edges[EDGE_LEFT] << TextEdge.new(chunk.left, line.top, line.bottom, EDGE_LEFT)

            # Center edge
            center = chunk.left + (chunk.width / 2.0)
            edges[EDGE_MID] << TextEdge.new(center, line.top, line.bottom, EDGE_MID)

            # Right edge
            edges[EDGE_RIGHT] << TextEdge.new(chunk.right, line.top, line.bottom, EDGE_RIGHT)
          end
        end

        edges
      end

      # Pool the edges of all three types, cluster them by x position and keep
      # the mean x of every cluster that holds enough edges.
      # @return [Array<Numeric>] x positions of the retained clusters
      def find_relevant_edges(edges)
        all_edges = edges.values.flatten
        return [] if all_edges.empty?

        clustered = cluster_edges(all_edges)
        return [] if clustered.empty?

        # A cluster qualifies with at least 2 edges, or 10% of all edges
        # (rounded up) when that is larger.
        min_occurrences = [2, (all_edges.size * 0.1).ceil].max
        clustered.select { |_, count| count >= min_occurrences }.keys
      end

      # Group edges sorted by x into clusters; a new cluster starts whenever
      # the gap to the previous edge exceeds +tolerance+.
      # @return [Hash{Float => Integer}] mean x of each cluster => edge count
      def cluster_edges(edges, tolerance: 8.0)
        return {} if edges.empty?

        edges
          .sort_by(&:x)
          .slice_when { |previous, current| (current.x - previous.x).abs > tolerance }
          .each_with_object({}) do |cluster, clusters|
            # fdiv keeps the mean exact even if x values are Integers
            clusters[cluster.sum(&:x).fdiv(cluster.size)] = cluster.size
          end
      end

      # Turn each run of at least MIN_ROWS consecutive aligned lines into a
      # table rectangle.
      # @param lines [Array] text lines in page order
      # @param edge_positions [Array<Numeric>] candidate column x positions
      # @return [Array<Rectangle>]
      def detect_tables_from_edges(lines, edge_positions, page)
        return [] if edge_positions.size < 2

        runs = lines.chunk do |line|
          # Symbols instead of booleans: chunk drops elements keyed by nil.
          edges_aligned_with_columns?(extract_line_edges(line), edge_positions) ? :table : :gap
        end

        runs.filter_map do |kind, run|
          create_table_bounds(run, page) if kind == :table && run.size >= MIN_ROWS
        end
      end

      # Left and right x of every chunk on the line, in chunk order.
      def extract_line_edges(line)
        line.sorted_chunks.flat_map { |chunk| [chunk.left, chunk.right] }
      end

      # Check whether enough of a line's edges sit on known column positions.
      # @param line_edges [Array<Numeric>] x positions from #extract_line_edges
      # @param column_positions [Array<Numeric>] candidate column x positions
      # @param tolerance [Float] maximum distance for an edge to count as aligned
      # @param min_ratio [Float] fraction of edges that must be aligned
      #   (default 0.3, i.e. 30%)
      # @return [Boolean] false for a line without edges
      def edges_aligned_with_columns?(line_edges, column_positions, tolerance: 10.0, min_ratio: 0.3)
        return false if line_edges.empty?

        aligned_count = line_edges.count do |edge|
          column_positions.any? { |col| (edge - col).abs <= tolerance }
        end

        aligned_count >= (line_edges.size * min_ratio)
      end

      # Bounding rectangle of +lines+, padded by 2.0 on each side and clamped
      # to [0, page.width] x [0, page.height].
      # @return [Rectangle, nil] nil when no bounding box is available
      def create_table_bounds(lines, page)
        return nil if lines.empty?

        bounds = Rectangle.bounding_box_of(lines)
        return nil unless bounds

        # Expand slightly to include full cell boundaries
        padding = 2.0
        Rectangle.from_bounds(
          [bounds.top - padding, 0].max,
          [bounds.left - padding, 0].max,
          [bounds.bottom + padding, page.height].min,
          [bounds.right + padding, page.width].min
        )
      end

      # True when +table+ overlaps any existing table by at least
      # OVERLAP_THRESHOLD (as reported by Rectangle#overlap_ratio).
      def overlaps_existing?(table, existing_tables)
        existing_tables.any? { |t| t.overlap_ratio(table) >= OVERLAP_THRESHOLD }
      end

      # Helper class for text edges: an x position, the vertical extent of the
      # line it came from, and one of the EDGE_* type constants.
      class TextEdge
        attr_reader :x, :top, :bottom, :type

        def initialize(x, top, bottom, type)
          @x = x
          @top = top
          @bottom = bottom
          @type = type
        end
      end
    end
  end
end
|