rpdfium 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +601 -1317
- data/README.md +73 -78
- data/lib/rpdfium/annotation/annotation.rb +10 -8
- data/lib/rpdfium/document.rb +49 -22
- data/lib/rpdfium/errors.rb +2 -2
- data/lib/rpdfium/form/form.rb +9 -9
- data/lib/rpdfium/image/embedded.rb +17 -16
- data/lib/rpdfium/io/png.rb +9 -9
- data/lib/rpdfium/page.rb +562 -527
- data/lib/rpdfium/raw.rb +216 -203
- data/lib/rpdfium/search/search.rb +5 -5
- data/lib/rpdfium/structure/attachment.rb +6 -6
- data/lib/rpdfium/structure/element.rb +74 -74
- data/lib/rpdfium/structure/outline.rb +2 -2
- data/lib/rpdfium/structure/tree.rb +56 -55
- data/lib/rpdfium/table/cells.rb +36 -33
- data/lib/rpdfium/table/debugger.rb +12 -12
- data/lib/rpdfium/table/edges.rb +51 -49
- data/lib/rpdfium/table/extractor.rb +35 -34
- data/lib/rpdfium/table/table.rb +65 -62
- data/lib/rpdfium/util/cluster.rb +35 -33
- data/lib/rpdfium/util/column_inference.rb +34 -32
- data/lib/rpdfium/util/label_matcher.rb +30 -30
- data/lib/rpdfium/util/text_extraction.rb +15 -15
- data/lib/rpdfium/util/word_extractor.rb +49 -48
- data/lib/rpdfium/util/word_merger.rb +25 -24
- data/lib/rpdfium/version.rb +1 -1
- data/lib/rpdfium.rb +17 -15
- metadata +1 -1
data/lib/rpdfium/table/cells.rb
CHANGED
|
@@ -2,32 +2,34 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Table
|
|
5
|
-
#
|
|
6
|
-
#
|
|
5
|
+
# Builds cells from intersections and tables from cells.
|
|
6
|
+
# Algorithms 1:1 with pdfplumber.intersections_to_cells and
|
|
7
7
|
# pdfplumber.cells_to_tables.
|
|
8
8
|
module Cells
|
|
9
9
|
module_function
|
|
10
10
|
|
|
11
|
-
#
|
|
12
|
-
# `pt = (x, y)`,
|
|
13
|
-
#
|
|
11
|
+
# "Smallest cell" search for each intersection: given a point
|
|
12
|
+
# `pt = (x, y)`, find the minimal rectangle whose 4 corners are
|
|
13
|
+
# intersections and whose 4 sides have edges connecting them.
|
|
14
14
|
#
|
|
15
|
-
#
|
|
16
|
-
# x
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
15
|
+
# The "edge connect" constraint is crucial: two intersections with
|
|
16
|
+
# the same x are not enough — they must SHARE at least one vertical
|
|
17
|
+
# edge (i.e. belong to the same continuous segment). Likewise
|
|
18
|
+
# horizontally. This avoids false positives such as "two distant
|
|
19
|
+
# columns accidentally aligned".
|
|
20
20
|
#
|
|
21
|
-
# `intersections`
|
|
22
|
-
#
|
|
21
|
+
# `intersections` is the Hash produced by
|
|
22
|
+
# Edges.edges_to_intersections, with keys `[x, y]` and values
|
|
23
|
+
# `{ v: [edges...], h: [edges...] }`.
|
|
23
24
|
def intersections_to_cells(intersections)
|
|
24
25
|
return [] if intersections.empty?
|
|
25
26
|
|
|
26
|
-
#
|
|
27
|
-
#
|
|
28
|
-
#
|
|
29
|
-
#
|
|
30
|
-
#
|
|
27
|
+
# Adjacency indices: for each edge (a Hash object, ruby
|
|
28
|
+
# identity), which intersection points does it contain?
|
|
29
|
+
# Pdfplumber does this by comparing the bbox of the edges — we
|
|
30
|
+
# have direct access to the edge objects inside
|
|
31
|
+
# `intersections[pt]`, so it suffices to use identity. For "same
|
|
32
|
+
# edge" we use `equal?` (object identity).
|
|
31
33
|
edge_ids = intersections.transform_values do |val|
|
|
32
34
|
{ v: val[:v].map(&:object_id).to_set,
|
|
33
35
|
h: val[:h].map(&:object_id).to_set }
|
|
@@ -46,9 +48,9 @@ module Rpdfium
|
|
|
46
48
|
points = intersections.keys.sort
|
|
47
49
|
npoints = points.size
|
|
48
50
|
|
|
49
|
-
#
|
|
50
|
-
# (
|
|
51
|
-
#
|
|
51
|
+
# Spatial indices: precompute points by column (same x) and by
|
|
52
|
+
# row (same y), already ordered because `points` is sorted.
|
|
53
|
+
# Allows O(log n) lookup via bsearch instead of O(n) via select.
|
|
52
54
|
by_x = Hash.new { |h, k| h[k] = [] }
|
|
53
55
|
by_y = Hash.new { |h, k| h[k] = [] }
|
|
54
56
|
points.each { |p| by_x[p[0]] << p; by_y[p[1]] << p }
|
|
@@ -57,18 +59,18 @@ module Rpdfium
|
|
|
57
59
|
points.each_with_index do |pt, i|
|
|
58
60
|
next if i == npoints - 1
|
|
59
61
|
|
|
60
|
-
#
|
|
62
|
+
# Points directly below `pt` (same x, greater y)
|
|
61
63
|
col = by_x[pt[0]]
|
|
62
64
|
below_start = col.bsearch_index { |q| q[1] > pt[1] } || col.size
|
|
63
65
|
below = col[below_start..]
|
|
64
66
|
|
|
65
|
-
#
|
|
67
|
+
# Points directly to the right of `pt` (same y, greater x)
|
|
66
68
|
row_pts = by_y[pt[1]]
|
|
67
69
|
right_start = row_pts.bsearch_index { |q| q[0] > pt[0] } || row_pts.size
|
|
68
70
|
right = row_pts[right_start..]
|
|
69
71
|
|
|
70
|
-
#
|
|
71
|
-
#
|
|
72
|
+
# Find the FIRST (== smallest, due to ordering) bottom-right
|
|
73
|
+
# whose 4 corners are present and whose edges connect.
|
|
72
74
|
found = nil
|
|
73
75
|
below.each do |b|
|
|
74
76
|
next unless edge_connects.call(pt, b)
|
|
@@ -91,13 +93,14 @@ module Rpdfium
|
|
|
91
93
|
cells
|
|
92
94
|
end
|
|
93
95
|
|
|
94
|
-
#
|
|
96
|
+
# Groups cells into tables based on shared corners.
|
|
95
97
|
#
|
|
96
|
-
#
|
|
97
|
-
#
|
|
98
|
-
#
|
|
98
|
+
# Algorithm: Union-Find (disjoint set) on the corners — O(n α(n))
|
|
99
|
+
# instead of pdfplumber's greedy fixed-point O(n²). The result is
|
|
100
|
+
# identical: two cells end up in the same group if they share at
|
|
101
|
+
# least one corner.
|
|
99
102
|
#
|
|
100
|
-
#
|
|
103
|
+
# Final filter: discard tables with a SINGLE cell (noise).
|
|
101
104
|
def cells_to_tables(cells)
|
|
102
105
|
return [] if cells.empty?
|
|
103
106
|
|
|
@@ -110,8 +113,8 @@ module Rpdfium
|
|
|
110
113
|
end
|
|
111
114
|
union = ->(a, b) { parent[find.call(a)] = find.call(b) }
|
|
112
115
|
|
|
113
|
-
#
|
|
114
|
-
#
|
|
116
|
+
# For each corner, collect the indices of the cells that share it
|
|
117
|
+
# and union them into the same component.
|
|
115
118
|
corner_to_cells = Hash.new { |h, k| h[k] = [] }
|
|
116
119
|
cells.each_with_index do |cell, idx|
|
|
117
120
|
x0, top, x1, bottom = cell
|
|
@@ -123,11 +126,11 @@ module Rpdfium
|
|
|
123
126
|
idxs.each_cons(2) { |a, b| union.call(a, b) }
|
|
124
127
|
end
|
|
125
128
|
|
|
126
|
-
#
|
|
129
|
+
# Group by the Union-Find root
|
|
127
130
|
groups = Hash.new { |h, k| h[k] = [] }
|
|
128
131
|
cells.each_with_index { |cell, i| groups[find.call(i)] << cell }
|
|
129
132
|
|
|
130
|
-
# Sort top-to-bottom, left-to-right;
|
|
133
|
+
# Sort top-to-bottom, left-to-right; filter out single-cell.
|
|
131
134
|
groups.values
|
|
132
135
|
.sort_by { |t| t.map { |c| [c[1], c[0]] }.min }
|
|
133
136
|
.reject { |t| t.size <= 1 }
|
data/lib/rpdfium/table/debugger.rb
CHANGED
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Table
|
|
5
|
-
#
|
|
6
|
-
#
|
|
5
|
+
# Generates a debug visualization: the page rendered to PNG with the
|
|
6
|
+
# detected edges and cell bboxes overlaid. Equivalent to
|
|
7
7
|
# pdfplumber.Page.to_image().debug_tablefinder().
|
|
8
8
|
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
9
|
+
# Implemented in pure Ruby: rasterizes the page via render(), then draws
|
|
10
|
+
# over the bitmap by manipulating the RGBA bytes, and finally saves to PNG.
|
|
11
11
|
module Debugger
|
|
12
12
|
module_function
|
|
13
13
|
|
|
@@ -24,19 +24,19 @@ module Rpdfium
|
|
|
24
24
|
w, h, bytes, _stride = page.render(scale: scale, output: :rgba)
|
|
25
25
|
canvas = Canvas.new(w, h, bytes)
|
|
26
26
|
|
|
27
|
-
#
|
|
28
|
-
#
|
|
27
|
+
# Draws edges. New format: each edge has orientation + x0/x1/top/bottom.
|
|
28
|
+
# A horizontal edge has top == bottom; a vertical one has x0 == x1.
|
|
29
29
|
edges.each do |e|
|
|
30
30
|
canvas.line((e[:x0] * scale).to_i, (e[:top] * scale).to_i,
|
|
31
31
|
(e[:x1] * scale).to_i, (e[:bottom] * scale).to_i, RED)
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
#
|
|
34
|
+
# Draws intersections (4px circles). They are Hashes keyed by [x, y].
|
|
35
35
|
intersections.each_key do |(x, y)|
|
|
36
36
|
canvas.dot((x * scale).to_i, (y * scale).to_i, GREEN, 4)
|
|
37
37
|
end
|
|
38
38
|
|
|
39
|
-
#
|
|
39
|
+
# Fills tables with transparent blue. Table#bbox is the tuple [x0, top, x1, bottom].
|
|
40
40
|
tables.each do |t|
|
|
41
41
|
x0, top, x1, bottom = t.bbox
|
|
42
42
|
canvas.rect_fill((x0 * scale).to_i, (top * scale).to_i,
|
|
@@ -48,15 +48,15 @@ module Rpdfium
|
|
|
48
48
|
end
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
-
#
|
|
52
|
-
#
|
|
51
|
+
# Minimal RGBA canvas for drawing over the rendering. Nothing sophisticated:
|
|
52
|
+
# Bresenham lines, dots, rect fill with simple alpha blending.
|
|
53
53
|
class Canvas
|
|
54
54
|
attr_reader :bytes, :width, :height
|
|
55
55
|
|
|
56
56
|
def initialize(width, height, rgba_bytes)
|
|
57
57
|
@width = width
|
|
58
58
|
@height = height
|
|
59
|
-
#
|
|
59
|
+
# We work on a mutable string (binstring)
|
|
60
60
|
@bytes = rgba_bytes.dup.force_encoding(Encoding::ASCII_8BIT)
|
|
61
61
|
end
|
|
62
62
|
|
|
@@ -71,7 +71,7 @@ module Rpdfium
|
|
|
71
71
|
@bytes.setbyte(idx + 2, b)
|
|
72
72
|
@bytes.setbyte(idx + 3, 255)
|
|
73
73
|
else
|
|
74
|
-
#
|
|
74
|
+
# Simple alpha blending (over operator)
|
|
75
75
|
src_a = a / 255.0
|
|
76
76
|
inv = 1 - src_a
|
|
77
77
|
@bytes.setbyte(idx, (r * src_a + @bytes.getbyte(idx) * inv).to_i)
|
data/lib/rpdfium/table/edges.rb
CHANGED
|
@@ -2,25 +2,26 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Table
|
|
5
|
-
#
|
|
6
|
-
# TableFinder.
|
|
5
|
+
# Operations on edges (horizontal/vertical segments) used by the
|
|
6
|
+
# TableFinder. A direct mapping onto `pdfplumber/table.py`.
|
|
7
7
|
#
|
|
8
|
-
#
|
|
9
|
-
# -
|
|
10
|
-
# :x0, :x1, :top, :bottom (in
|
|
11
|
-
# -
|
|
12
|
-
# -
|
|
8
|
+
# Internal conventions (aligned with pdfplumber):
|
|
9
|
+
# - Each edge is a Hash with :orientation ("v" | "h"),
|
|
10
|
+
# :x0, :x1, :top, :bottom (in top-down coordinates).
|
|
11
|
+
# - Horizontal edge: top == bottom, x0 < x1.
|
|
12
|
+
# - Vertical edge: x0 == x1, top < bottom.
|
|
13
13
|
#
|
|
14
|
-
#
|
|
15
|
-
# -
|
|
16
|
-
# -
|
|
17
|
-
# -
|
|
18
|
-
# -
|
|
14
|
+
# The edges can come from:
|
|
15
|
+
# - vector lines of the PDF (path segments)
|
|
16
|
+
# - rectangles (decomposed into 4 sides)
|
|
17
|
+
# - "implicit" lines inferred from word alignment (:text strategy)
|
|
18
|
+
# - lines specified by the user (:explicit strategy)
|
|
19
19
|
module Edges
|
|
20
20
|
module_function
|
|
21
21
|
|
|
22
|
-
# Snap: cluster
|
|
23
|
-
#
|
|
22
|
+
# Snap: cluster of near-collinear edges → common average coordinate.
|
|
23
|
+
# For horizontals it snaps the `top` (== `bottom`); for verticals the
|
|
24
|
+
# `x0`.
|
|
24
25
|
def snap_edges(edges, x_tolerance: 3.0, y_tolerance: 3.0)
|
|
25
26
|
v_edges, h_edges = edges.partition { |e| e[:orientation] == "v" }
|
|
26
27
|
|
|
@@ -42,13 +43,13 @@ module Rpdfium
|
|
|
42
43
|
end
|
|
43
44
|
end
|
|
44
45
|
|
|
45
|
-
# Join:
|
|
46
|
-
#
|
|
47
|
-
#
|
|
46
|
+
# Join: given a group of edges on the same infinite line (same top
|
|
47
|
+
# for horizontals, same x0 for verticals), merges those whose
|
|
48
|
+
# endpoints are within `tolerance`.
|
|
48
49
|
#
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
# tolerance,
|
|
50
|
+
# Exact match of pdfplumber.join_edge_group behavior: iterates sorted
|
|
51
|
+
# by minprop, extends the "current" if overlap/contiguity is within
|
|
52
|
+
# tolerance, otherwise opens a new current.
|
|
52
53
|
def join_edge_group(edges, orientation, tolerance: 3.0)
|
|
53
54
|
return [] if edges.empty?
|
|
54
55
|
|
|
@@ -68,7 +69,7 @@ module Rpdfium
|
|
|
68
69
|
joined
|
|
69
70
|
end
|
|
70
71
|
|
|
71
|
-
#
|
|
72
|
+
# Complete pipeline: snap + join. Faithful to pdfplumber.merge_edges.
|
|
72
73
|
def merge_edges(edges,
|
|
73
74
|
snap_x_tolerance: 3.0, snap_y_tolerance: 3.0,
|
|
74
75
|
join_x_tolerance: 3.0, join_y_tolerance: 3.0)
|
|
@@ -78,7 +79,7 @@ module Rpdfium
|
|
|
78
79
|
y_tolerance: snap_y_tolerance)
|
|
79
80
|
end
|
|
80
81
|
|
|
81
|
-
#
|
|
82
|
+
# Group by (orientation, "line value")
|
|
82
83
|
# h → top, v → x0
|
|
83
84
|
groups = edges.group_by do |e|
|
|
84
85
|
e[:orientation] == "h" ? ["h", e[:top]] : ["v", e[:x0]]
|
|
@@ -89,7 +90,7 @@ module Rpdfium
|
|
|
89
90
|
end
|
|
90
91
|
end
|
|
91
92
|
|
|
92
|
-
#
|
|
93
|
+
# Filters out edges that are too short.
|
|
93
94
|
def filter_edges(edges, orientation: nil, min_length: 1.0)
|
|
94
95
|
edges.reject do |e|
|
|
95
96
|
next true if orientation && e[:orientation] != orientation
|
|
@@ -110,11 +111,11 @@ module Rpdfium
|
|
|
110
111
|
DEFAULT_MIN_WORDS_VERTICAL = 3
|
|
111
112
|
DEFAULT_MIN_WORDS_HORIZONTAL = 1
|
|
112
113
|
|
|
113
|
-
#
|
|
114
|
-
#
|
|
115
|
-
#
|
|
116
|
-
#
|
|
117
|
-
#
|
|
114
|
+
# For each cluster of words aligned "at the top" (same top, within
|
|
115
|
+
# tol=1) with at least `word_threshold` members, it emits TWO
|
|
116
|
+
# horizontal edges (top and bottom of that cluster's bbox). Having
|
|
117
|
+
# the bottom in addition to the top is critical: it guarantees that
|
|
118
|
+
# the last row of each table has a closing horizontal edge.
|
|
118
119
|
def words_to_edges_h(words, word_threshold: DEFAULT_MIN_WORDS_HORIZONTAL)
|
|
119
120
|
by_top = Util::Cluster.cluster_objects(words, :top, tolerance: 1.0)
|
|
120
121
|
large = by_top.select { |g| g.size >= word_threshold }
|
|
@@ -132,14 +133,14 @@ module Rpdfium
|
|
|
132
133
|
end
|
|
133
134
|
end
|
|
134
135
|
|
|
135
|
-
#
|
|
136
|
-
# `word_threshold`
|
|
137
|
-
# cluster
|
|
138
|
-
#
|
|
136
|
+
# Three clusters of words by x: x0, x1, centerpoint. Clusters with at
|
|
137
|
+
# least `word_threshold` members are column candidates. The bboxes of
|
|
138
|
+
# each cluster are "condensed": if a bbox overlaps another already
|
|
139
|
+
# selected (more populated) one, it is discarded.
|
|
139
140
|
#
|
|
140
|
-
#
|
|
141
|
-
#
|
|
142
|
-
#
|
|
141
|
+
# For each condensed bbox I emit a vertical edge at its x0 (left of
|
|
142
|
+
# the column). In addition, I emit a final "right" edge at the max x1
|
|
143
|
+
# of all the bboxes: it visually closes the table on the right.
|
|
143
144
|
def words_to_edges_v(words, word_threshold: DEFAULT_MIN_WORDS_VERTICAL)
|
|
144
145
|
by_x0 = Util::Cluster.cluster_objects(words, :x0, tolerance: 1.0)
|
|
145
146
|
by_x1 = Util::Cluster.cluster_objects(words, :x1, tolerance: 1.0)
|
|
@@ -147,7 +148,7 @@ module Rpdfium
|
|
|
147
148
|
by_center = Util::Cluster.cluster_objects(words, center_fn, tolerance: 1.0)
|
|
148
149
|
|
|
149
150
|
clusters = by_x0 + by_x1 + by_center
|
|
150
|
-
#
|
|
151
|
+
# More populated first
|
|
151
152
|
sorted = clusters.sort_by { |c| -c.size }
|
|
152
153
|
large = sorted.select { |c| c.size >= word_threshold }
|
|
153
154
|
bboxes = large.map { |c| Util::Cluster.objects_to_bbox(c) }
|
|
@@ -157,7 +158,7 @@ module Rpdfium
|
|
|
157
158
|
end
|
|
158
159
|
return [] if condensed_bboxes.empty?
|
|
159
160
|
|
|
160
|
-
# Sort left-to-right
|
|
161
|
+
# Sort left-to-right to emit edges in geometric order.
|
|
161
162
|
condensed_rects = condensed_bboxes.map do |b|
|
|
162
163
|
{ x0: b[0], top: b[1], x1: b[2], bottom: b[3] }
|
|
163
164
|
end.sort_by { |r| r[:x0] }
|
|
@@ -170,7 +171,7 @@ module Rpdfium
|
|
|
170
171
|
acc[2] = r[:bottom] if r[:bottom] > acc[2]
|
|
171
172
|
end
|
|
172
173
|
|
|
173
|
-
#
|
|
174
|
+
# "left" edge of each column + a final "right" edge.
|
|
174
175
|
left_edges = condensed_rects.map do |r|
|
|
175
176
|
{ x0: r[:x0], x1: r[:x0], top: min_top, bottom: max_bottom, orientation: "v" }
|
|
176
177
|
end
|
|
@@ -179,18 +180,19 @@ module Rpdfium
|
|
|
179
180
|
end
|
|
180
181
|
|
|
181
182
|
# ------------------------------------------------------------------
|
|
182
|
-
#
|
|
183
|
+
# edge intersections
|
|
183
184
|
# ------------------------------------------------------------------
|
|
184
185
|
|
|
185
|
-
#
|
|
186
|
-
#
|
|
187
|
-
#
|
|
188
|
-
#
|
|
186
|
+
# For each (h, v) pair that intersects within tolerance, it records
|
|
187
|
+
# an intersection `(v.x0, h.top)` with pointers to the source edges.
|
|
188
|
+
# The value in `intersections[(x, y)] = { v: [...], h: [...] }` then
|
|
189
|
+
# allows the cell-builder to verify "edge connect".
|
|
189
190
|
#
|
|
190
|
-
#
|
|
191
|
-
#
|
|
192
|
-
#
|
|
193
|
-
#
|
|
191
|
+
# Optimization over the naïve O(|v|×|h|) loop: sorted_h is ordered by
|
|
192
|
+
# top; for each vertical edge a bsearch is used to find the first
|
|
193
|
+
# candidate h and the loop exits as soon as h[:top] exceeds
|
|
194
|
+
# v[:bottom] + y_tolerance, reducing the iterations to only the
|
|
195
|
+
# vertically relevant subset.
|
|
194
196
|
def edges_to_intersections(edges, x_tolerance: 1.0, y_tolerance: 1.0)
|
|
195
197
|
v_edges, h_edges = edges.partition { |e| e[:orientation] == "v" }
|
|
196
198
|
intersections = {}
|
|
@@ -202,11 +204,11 @@ module Rpdfium
|
|
|
202
204
|
v_top_min = v[:top] - y_tolerance
|
|
203
205
|
v_top_max = v[:bottom] + y_tolerance
|
|
204
206
|
|
|
205
|
-
#
|
|
207
|
+
# Skip all the h whose top is still below the vertical window.
|
|
206
208
|
start_idx = h_tops.bsearch_index { |t| t >= v_top_min } || sorted_h.size
|
|
207
209
|
|
|
208
210
|
sorted_h[start_idx..].each do |h|
|
|
209
|
-
#
|
|
211
|
+
# The remaining h are beyond the window: exit immediately.
|
|
210
212
|
break if h[:top] > v_top_max
|
|
211
213
|
|
|
212
214
|
next unless v[:x0] >= h[:x0] - x_tolerance
|
data/lib/rpdfium/table/extractor.rb
CHANGED
|
@@ -2,25 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Table
|
|
5
|
-
#
|
|
5
|
+
# Finds tables on a page, faithful to `pdfplumber.TableFinder`.
|
|
6
6
|
#
|
|
7
7
|
# Pipeline:
|
|
8
|
-
# 1.
|
|
8
|
+
# 1. collect candidate edges for each axis, according to strategy
|
|
9
9
|
# (`:lines` / `:lines_strict` / `:text` / `:explicit`)
|
|
10
|
-
# 2. merge_edges (snap
|
|
11
|
-
# 3. filter
|
|
12
|
-
# 4. edges_to_intersections
|
|
13
|
-
# 5. intersections_to_cells (smallest cell
|
|
14
|
-
# 6. cells_to_tables (grouping
|
|
10
|
+
# 2. merge_edges (snap collinear + join contiguous)
|
|
11
|
+
# 3. filter by minimum length
|
|
12
|
+
# 4. edges_to_intersections with tolerance
|
|
13
|
+
# 5. intersections_to_cells (smallest cell for each point)
|
|
14
|
+
# 6. cells_to_tables (grouping by shared corners)
|
|
15
15
|
#
|
|
16
|
-
# API
|
|
16
|
+
# Public API:
|
|
17
17
|
# ext = Rpdfium::Table::Extractor.new(page, **opts)
|
|
18
|
-
# ext.tables # => [Table, ...] (
|
|
19
|
-
# ext.extract # => [[[String]]] (Array
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
# ext.find # alias
|
|
23
|
-
# ext.edges # edges
|
|
18
|
+
# ext.tables # => [Table, ...] (Rpdfium::Table::Table objects)
|
|
19
|
+
# ext.extract # => [[[String]]] (Array of tables, each table
|
|
20
|
+
# is an Array of rows, each row
|
|
21
|
+
# is an Array of strings)
|
|
22
|
+
# ext.find # alias of .tables (back-compat with 0.2.x)
|
|
23
|
+
# ext.edges # refined edges
|
|
24
24
|
# ext.intersections # Hash {[x,y] => {v:[],h:[]}}
|
|
25
25
|
# ext.cells # Array<bbox>
|
|
26
26
|
class Extractor
|
|
@@ -30,7 +30,7 @@ module Rpdfium
|
|
|
30
30
|
explicit_vertical_lines: [],
|
|
31
31
|
explicit_horizontal_lines: [],
|
|
32
32
|
|
|
33
|
-
#
|
|
33
|
+
# Tolerances. The `_x_` / `_y_` inherit from the un-suffixed value.
|
|
34
34
|
snap_tolerance: 3.0,
|
|
35
35
|
snap_x_tolerance: nil,
|
|
36
36
|
snap_y_tolerance: nil,
|
|
@@ -48,16 +48,17 @@ module Rpdfium
|
|
|
48
48
|
intersection_x_tolerance: nil,
|
|
49
49
|
intersection_y_tolerance: nil,
|
|
50
50
|
|
|
51
|
-
#
|
|
52
|
-
#
|
|
51
|
+
# Text settings (passed to TextExtraction when .extract is called).
|
|
52
|
+
# The 3.0 defaults are those of pdfplumber.
|
|
53
53
|
text_x_tolerance: Util::WordExtractor::DEFAULT_X_TOLERANCE,
|
|
54
54
|
text_y_tolerance: Util::WordExtractor::DEFAULT_Y_TOLERANCE,
|
|
55
55
|
text_keep_blank_chars: false,
|
|
56
56
|
|
|
57
|
-
# Auto-fallback:
|
|
58
|
-
#
|
|
59
|
-
#
|
|
60
|
-
#
|
|
57
|
+
# Auto-fallback: if :lines produces no edges, retry with :text.
|
|
58
|
+
# We keep the flag (it was already in 0.2.x) but ONLY as a fallback,
|
|
59
|
+
# never as a "fix" for pathological layouts — consistent with
|
|
60
|
+
# pdfplumber, which does not have it (pdfplumber users know they
|
|
61
|
+
# must choose the strategy).
|
|
61
62
|
auto_fallback: true
|
|
62
63
|
}.freeze
|
|
63
64
|
|
|
@@ -71,14 +72,14 @@ module Rpdfium
|
|
|
71
72
|
validate_strategies!
|
|
72
73
|
end
|
|
73
74
|
|
|
74
|
-
#
|
|
75
|
+
# Full pipeline, builds the refined edges.
|
|
75
76
|
def edges
|
|
76
77
|
@edges ||= build_edges(@settings[:vertical_strategy],
|
|
77
78
|
@settings[:horizontal_strategy]).then do |built|
|
|
78
79
|
if built.empty? && @settings[:auto_fallback] &&
|
|
79
80
|
(@settings[:vertical_strategy] != :text ||
|
|
80
81
|
@settings[:horizontal_strategy] != :text)
|
|
81
|
-
# Fallback:
|
|
82
|
+
# Fallback: the auto-fallback is LOOSE, retry everything as :text.
|
|
82
83
|
build_edges(:text, :text)
|
|
83
84
|
else
|
|
84
85
|
built
|
|
@@ -103,7 +104,7 @@ module Rpdfium
|
|
|
103
104
|
end
|
|
104
105
|
alias find tables
|
|
105
106
|
|
|
106
|
-
#
|
|
107
|
+
# Extract the data of all tables: Array<Array<Array<String>>>.
|
|
107
108
|
def extract(**text_opts)
|
|
108
109
|
merged = {
|
|
109
110
|
x_tolerance: @settings[:text_x_tolerance],
|
|
@@ -117,7 +118,7 @@ module Rpdfium
|
|
|
117
118
|
private
|
|
118
119
|
|
|
119
120
|
def resolve_settings(s)
|
|
120
|
-
#
|
|
121
|
+
# Cascade x/y from the un-suffixed values
|
|
121
122
|
s[:snap_x_tolerance] ||= s[:snap_tolerance]
|
|
122
123
|
s[:snap_y_tolerance] ||= s[:snap_tolerance]
|
|
123
124
|
s[:join_x_tolerance] ||= s[:join_tolerance]
|
|
@@ -164,10 +165,10 @@ module Rpdfium
|
|
|
164
165
|
end
|
|
165
166
|
|
|
166
167
|
def page_words
|
|
167
|
-
#
|
|
168
|
-
#
|
|
169
|
-
# `
|
|
170
|
-
chars = @page.chars(lean: true)
|
|
168
|
+
# Generate words using our WordExtractor (consistent with the one
|
|
169
|
+
# used in Table#extract, so the thresholds match).
|
|
170
|
+
# `geometry: true`: see comment in Table#extract.
|
|
171
|
+
chars = @page.chars(lean: true, geometry: true)
|
|
171
172
|
Util::WordExtractor.new(
|
|
172
173
|
x_tolerance: @settings[:text_x_tolerance],
|
|
173
174
|
y_tolerance: @settings[:text_y_tolerance],
|
|
@@ -187,11 +188,11 @@ module Rpdfium
|
|
|
187
188
|
end
|
|
188
189
|
end
|
|
189
190
|
|
|
190
|
-
#
|
|
191
|
-
#
|
|
192
|
-
#
|
|
193
|
-
# (
|
|
194
|
-
#
|
|
191
|
+
# Converts Page's `vertical_lines` (format {x, top, bottom}) to the
|
|
192
|
+
# pdfplumber-style format expected by Edges.
|
|
193
|
+
# Note: in 0.3.0 we do NOT include rectangle sides when :strict
|
|
194
|
+
# (but at present Page does not expose them separately, a
|
|
195
|
+
# simplification that we will document).
|
|
195
196
|
def page_vertical_edges(strict: false) # rubocop:disable Lint/UnusedMethodArgument
|
|
196
197
|
prefilter = @settings[:edge_min_length_prefilter]
|
|
197
198
|
@page.vertical_lines.filter_map do |s|
|