rpdfium 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +615 -1317
- data/README.md +73 -78
- data/lib/rpdfium/annotation/annotation.rb +10 -8
- data/lib/rpdfium/document.rb +49 -22
- data/lib/rpdfium/errors.rb +2 -2
- data/lib/rpdfium/form/form.rb +9 -9
- data/lib/rpdfium/image/embedded.rb +17 -16
- data/lib/rpdfium/io/png.rb +9 -9
- data/lib/rpdfium/page.rb +561 -526
- data/lib/rpdfium/raw.rb +216 -203
- data/lib/rpdfium/search/search.rb +5 -5
- data/lib/rpdfium/structure/attachment.rb +6 -6
- data/lib/rpdfium/structure/element.rb +74 -74
- data/lib/rpdfium/structure/outline.rb +2 -2
- data/lib/rpdfium/structure/tree.rb +56 -55
- data/lib/rpdfium/table/cells.rb +36 -33
- data/lib/rpdfium/table/debugger.rb +12 -12
- data/lib/rpdfium/table/edges.rb +51 -49
- data/lib/rpdfium/table/extractor.rb +35 -34
- data/lib/rpdfium/table/table.rb +65 -62
- data/lib/rpdfium/util/cluster.rb +35 -33
- data/lib/rpdfium/util/column_inference.rb +34 -32
- data/lib/rpdfium/util/label_matcher.rb +30 -30
- data/lib/rpdfium/util/text_extraction.rb +15 -15
- data/lib/rpdfium/util/word_extractor.rb +49 -48
- data/lib/rpdfium/util/word_merger.rb +25 -24
- data/lib/rpdfium/version.rb +1 -1
- data/lib/rpdfium.rb +17 -15
- metadata +1 -1
data/lib/rpdfium/table/table.rb
CHANGED
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Table
|
|
5
|
-
#
|
|
6
|
-
#
|
|
5
|
+
# Represents a table found on a page. Exposes cells, rows,
|
|
6
|
+
# columns, bbox, and the `extract` method that returns the textual data.
|
|
7
7
|
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
8
|
+
# Each cell is a bbox `[x0, top, x1, bottom]` (top-down).
|
|
9
|
+
# A "row" is the group of cells sharing the same `top`.
|
|
10
|
+
# A "column" is the group sharing the same `x0`.
|
|
11
11
|
class Table
|
|
12
12
|
attr_reader :page, :cells
|
|
13
13
|
|
|
@@ -27,9 +27,9 @@ module Rpdfium
|
|
|
27
27
|
end
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
#
|
|
31
|
-
# in
|
|
32
|
-
#
|
|
30
|
+
# Returns the rows as Array<Array<bbox|nil>>. The "missing" cells
|
|
31
|
+
# in a row (e.g. because the table has an irregular topology) are
|
|
32
|
+
# represented as nil — consistent with pdfplumber.
|
|
33
33
|
def rows
|
|
34
34
|
rows_or_columns(:row)
|
|
35
35
|
end
|
|
@@ -38,57 +38,60 @@ module Rpdfium
|
|
|
38
38
|
rows_or_columns(:col)
|
|
39
39
|
end
|
|
40
40
|
|
|
41
|
-
#
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
#
|
|
41
|
+
# Extract data: Array<Array<String>>. For each row, for each cell,
|
|
42
|
+
# filter the page chars whose MIDPOINT lies within the cell's bbox,
|
|
43
|
+
# then reconstruct the text via Util::TextExtraction (which in turn
|
|
44
|
+
# goes through WordExtractor).
|
|
45
45
|
#
|
|
46
|
-
#
|
|
47
|
-
#
|
|
48
|
-
#
|
|
49
|
-
#
|
|
46
|
+
# This is the pdfplumber.Table.extract path — for each row it first
|
|
47
|
+
# filters the row's chars (optimization: nearly all chars from the
|
|
48
|
+
# other rows are discarded immediately), then for each cell filters
|
|
49
|
+
# again within the sub-bbox.
|
|
50
50
|
#
|
|
51
|
-
#
|
|
52
|
-
# midpoint
|
|
53
|
-
#
|
|
54
|
-
#
|
|
51
|
+
# Optimization over the naïve path: the chars are sorted by their
|
|
52
|
+
# vertical midpoint only once; for each row bsearch is used to find
|
|
53
|
+
# the candidate chars in O(log n) instead of scanning the whole
|
|
54
|
+
# array O(n) for every row.
|
|
55
55
|
#
|
|
56
|
-
#
|
|
57
|
-
# edges per
|
|
58
|
-
#
|
|
59
|
-
#
|
|
60
|
-
#
|
|
61
|
-
# caller
|
|
62
|
-
#
|
|
63
|
-
# `cell_padding`:
|
|
64
|
-
#
|
|
65
|
-
#
|
|
66
|
-
# (
|
|
67
|
-
#
|
|
68
|
-
#
|
|
69
|
-
#
|
|
56
|
+
# NOTE on the :text strategy: `words_to_edges_h` emits by design TWO
|
|
57
|
+
# edges per row (top and bottom of the cluster bbox). This means that
|
|
58
|
+
# a table detected by the text-strategy will have "real" rows
|
|
59
|
+
# interleaved with "empty" rows between the bottom-edge of row N and
|
|
60
|
+
# the top-edge of row N+1. This is identical to pdfplumber's behavior.
|
|
61
|
+
# The caller may filter via `result.reject { |row| row.all?(&:empty?) }`
|
|
62
|
+
# if it wants to drop them.
|
|
63
|
+
# `cell_padding`: extends each cell's bbox toward the left and toward
|
|
64
|
+
# the top by N points. Default 0 (= identical pdfplumber behavior).
|
|
65
|
+
# Useful for PDFs where chars protrude slightly past the cell border
|
|
66
|
+
# (e.g. the uppercase "I" of the "Intermediario" cell in a CR Banca
|
|
67
|
+
# d'Italia form has x0=24.0 but the cell border is at x=25.6 — it gets
|
|
68
|
+
# discarded by the midpoint filter, output "ntermediario:"). With
|
|
69
|
+
# `cell_padding: 2.0` the cell becomes [23.6, ..., 100, ...] and the
|
|
70
|
+
# "I" is captured.
|
|
70
71
|
#
|
|
71
|
-
# Padding
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
#
|
|
72
|
+
# Padding only on the "inner-left" and "inner-top" borders to avoid
|
|
73
|
+
# duplicating chars shared between adjacent cells (a char between
|
|
74
|
+
# cell A and cell B would end up in both if both padded on all
|
|
75
|
+
# sides).
|
|
75
76
|
def extract(x_tolerance: Util::WordExtractor::DEFAULT_X_TOLERANCE,
|
|
76
77
|
y_tolerance: Util::WordExtractor::DEFAULT_Y_TOLERANCE,
|
|
77
78
|
keep_blank_chars: false,
|
|
78
79
|
cell_padding: 0.0)
|
|
79
|
-
# `
|
|
80
|
-
# angle
|
|
81
|
-
#
|
|
82
|
-
#
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
80
|
+
# `geometry: true`: the strongest lean mode — on top of skipping
|
|
81
|
+
# font/weight/angle/hyphen/unicode-error it also drops the per-char
|
|
82
|
+
# origin read and emits a minimal hash. It keeps only the fields the
|
|
83
|
+
# table/word pipeline reads, cutting both FFI roundtrips and hash
|
|
84
|
+
# allocation. On tables with thousands of chars this is the dominant
|
|
85
|
+
# cost of extract_tables. See Page#chars.
|
|
86
|
+
chars = @page.chars(lean: true, geometry: true)
|
|
87
|
+
|
|
88
|
+
# Sort by vertical midpoint once; build a parallel array of vmid
|
|
89
|
+
# for bsearch. Cost: O(n log n) one-time.
|
|
87
90
|
sorted_chars = chars.sort_by { |c| (c[:top] + c[:bottom]) / 2.0 }
|
|
88
91
|
vmids = sorted_chars.map { |c| (c[:top] + c[:bottom]) / 2.0 }
|
|
89
92
|
|
|
90
|
-
#
|
|
91
|
-
# (
|
|
93
|
+
# Instantiate WordExtractor ONCE and reuse it for all cells
|
|
94
|
+
# (a table may have dozens of cells; avoid allocations).
|
|
92
95
|
word_extractor = Util::WordExtractor.new(
|
|
93
96
|
x_tolerance: x_tolerance,
|
|
94
97
|
y_tolerance: y_tolerance,
|
|
@@ -118,8 +121,8 @@ module Rpdfium
|
|
|
118
121
|
|
|
119
122
|
private
|
|
120
123
|
|
|
121
|
-
#
|
|
122
|
-
#
|
|
124
|
+
# "Inlined" version of Util::TextExtraction.extract_text that reuses
|
|
125
|
+
# a pre-existing WordExtractor instead of creating one every time.
|
|
123
126
|
def extract_text_with(chars, word_extractor, y_tolerance)
|
|
124
127
|
words = word_extractor.extract_words(chars)
|
|
125
128
|
return "" if words.empty?
|
|
@@ -132,15 +135,15 @@ module Rpdfium
|
|
|
132
135
|
|
|
133
136
|
def pad_cell_bbox(bbox, padding)
|
|
134
137
|
x0, top, x1, bottom = bbox
|
|
135
|
-
#
|
|
136
|
-
#
|
|
138
|
+
# Extend only the "inner-left" and "inner-top" borders to avoid
|
|
139
|
+
# capturing chars from the adjacent cell to the right/below.
|
|
137
140
|
[x0 - padding, top - padding, x1, bottom]
|
|
138
141
|
end
|
|
139
142
|
|
|
140
|
-
# Test "char midpoint
|
|
141
|
-
#
|
|
142
|
-
#
|
|
143
|
-
#
|
|
143
|
+
# Test "char midpoint inside bbox" — exactly like pdfplumber.
|
|
144
|
+
# The char's midpoint (not the bbox extremes) is the criterion:
|
|
145
|
+
# a char straddling the border is assigned to the cell in which it
|
|
146
|
+
# has more "visual weight".
|
|
144
147
|
def char_in_bbox?(char, bbox)
|
|
145
148
|
x0, top, x1, bottom = bbox
|
|
146
149
|
h_mid = (char[:x0] + char[:x1]) / 2.0
|
|
@@ -159,15 +162,15 @@ module Rpdfium
|
|
|
159
162
|
end
|
|
160
163
|
end
|
|
161
164
|
|
|
162
|
-
#
|
|
163
|
-
# axis 1 = top (
|
|
164
|
-
#
|
|
165
|
+
# Reconstructs rows or columns. axis 0 = x (for row clustering antiaxis=top),
|
|
166
|
+
# axis 1 = top (for column clustering antiaxis=x0). Uses the invariant key
|
|
167
|
+
# as "anchor" and the variable key as the internal ordering.
|
|
165
168
|
def rows_or_columns(kind)
|
|
166
|
-
#
|
|
167
|
-
#
|
|
169
|
+
# For row: sortBy = top, antiaxis = x0
|
|
170
|
+
# For col: sortBy = x0, antiaxis = top
|
|
168
171
|
sort_idx, group_idx = kind == :row ? [1, 0] : [0, 1]
|
|
169
172
|
|
|
170
|
-
#
|
|
173
|
+
# All distinct x0 (for row) or top (for col), sorted
|
|
171
174
|
all_keys = @cells.map { |c| c[group_idx] }.uniq.sort
|
|
172
175
|
|
|
173
176
|
# Group by sort_idx
|
data/lib/rpdfium/util/cluster.rb
CHANGED
|
@@ -2,30 +2,31 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Util
|
|
5
|
-
#
|
|
6
|
-
#
|
|
5
|
+
# 1D clustering primitives used throughout the table pipeline.
|
|
6
|
+
# Direct mapping onto `pdfplumber.utils.clustering` (cluster_list,
|
|
7
7
|
# cluster_objects, make_cluster_dict).
|
|
8
8
|
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# `tolerance` (
|
|
14
|
-
#
|
|
9
|
+
# KEY PROPERTY: these clusters are "1D agglomerative single-linkage":
|
|
10
|
+
# two values end up in the same cluster if they are within
|
|
11
|
+
# `tolerance` of any value in the cluster. NOT only of the
|
|
12
|
+
# center/mean. As a result, chains of close values can extend the
|
|
13
|
+
# cluster well beyond `tolerance` (this is exactly pdfplumber's
|
|
14
|
+
# behavior, on which its edge/intersection heuristics rely).
|
|
15
15
|
module Cluster
|
|
16
16
|
module_function
|
|
17
17
|
|
|
18
|
-
#
|
|
19
|
-
#
|
|
18
|
+
# Groups scalar values into clusters. The values within the same
|
|
19
|
+
# cluster are within `tolerance` of at least one other value of
|
|
20
|
+
# the cluster.
|
|
20
21
|
#
|
|
21
|
-
#
|
|
22
|
+
# Example:
|
|
22
23
|
# cluster_list([1.0, 1.5, 2.0, 5.0], tolerance: 1.0)
|
|
23
24
|
# #=> [[1.0, 1.5, 2.0], [5.0]]
|
|
24
25
|
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
# pdfplumber,
|
|
28
|
-
#
|
|
26
|
+
# NOTE: "Stepping stone" chains: [1, 2, 3, 4] with tol=1 form a
|
|
27
|
+
# SINGLE cluster, even though 1 and 4 are 3 apart. This is
|
|
28
|
+
# pdfplumber's behavior, documented in its issues as potentially
|
|
29
|
+
# surprising but intentional. We keep it identical.
|
|
29
30
|
def cluster_list(values, tolerance: 0)
|
|
30
31
|
return [] if values.empty?
|
|
31
32
|
|
|
@@ -41,22 +42,23 @@ module Rpdfium
|
|
|
41
42
|
clusters
|
|
42
43
|
end
|
|
43
44
|
|
|
44
|
-
#
|
|
45
|
-
#
|
|
45
|
+
# Groups objects (Hash) into clusters based on an extraction
|
|
46
|
+
# function `key_fn` (or a Hash key symbol) and a tolerance.
|
|
46
47
|
#
|
|
47
|
-
#
|
|
48
|
+
# Example:
|
|
48
49
|
# cluster_objects(words, ->(w) { w[:top] }, tolerance: 1)
|
|
49
50
|
# cluster_objects(words, :top, tolerance: 1) # syntactic sugar
|
|
50
51
|
def cluster_objects(objects, key_fn, tolerance: 0, presorted: false)
|
|
51
52
|
return [] if objects.empty?
|
|
52
53
|
|
|
53
|
-
# Fast path
|
|
54
|
-
#
|
|
54
|
+
# Fast path for the most common Symbol case (:top, :x0, :bottom):
|
|
55
|
+
# direct Hash[symbol] access is ~2x faster than the lambda call.
|
|
55
56
|
if key_fn.is_a?(Symbol)
|
|
56
|
-
#
|
|
57
|
-
# (
|
|
58
|
-
#
|
|
59
|
-
# cluster_objects
|
|
57
|
+
# If the caller guarantees that the input is already sorted by
|
|
58
|
+
# key_fn (e.g. because it comes from a lexicographic sort
|
|
59
|
+
# [key_fn, ...]) the internal sort can be skipped. A significant
|
|
60
|
+
# saving when cluster_objects is called in a loop over many
|
|
61
|
+
# small rows.
|
|
60
62
|
sorted = presorted ? objects : objects.sort_by { |o| o[key_fn] }
|
|
61
63
|
first = sorted.first
|
|
62
64
|
last_key = first[key_fn]
|
|
@@ -78,7 +80,7 @@ module Rpdfium
|
|
|
78
80
|
return clusters
|
|
79
81
|
end
|
|
80
82
|
|
|
81
|
-
#
|
|
83
|
+
# Generic path with a callable accessor
|
|
82
84
|
accessor = key_fn
|
|
83
85
|
sorted = presorted ? objects : objects.sort_by { |o| accessor.call(o) }
|
|
84
86
|
last_key = accessor.call(sorted.first)
|
|
@@ -96,8 +98,8 @@ module Rpdfium
|
|
|
96
98
|
clusters
|
|
97
99
|
end
|
|
98
100
|
|
|
99
|
-
# bbox = [x0, top, x1, bottom] (top-down).
|
|
100
|
-
#
|
|
101
|
+
# bbox = [x0, top, x1, bottom] (top-down). Returns the bbox that
|
|
102
|
+
# encloses all the passed objects. Uses min/max of x0/top/x1/bottom.
|
|
101
103
|
def objects_to_bbox(objects)
|
|
102
104
|
objects.each_with_object(
|
|
103
105
|
[Float::INFINITY, Float::INFINITY, -Float::INFINITY, -Float::INFINITY]
|
|
@@ -109,16 +111,16 @@ module Rpdfium
|
|
|
109
111
|
end
|
|
110
112
|
end
|
|
111
113
|
|
|
112
|
-
#
|
|
113
|
-
# edge
|
|
114
|
+
# Variant that returns a Hash instead of a tuple — handy in the
|
|
115
|
+
# edge context where we need to mix bbox+orientation.
|
|
114
116
|
def objects_to_rect(objects)
|
|
115
117
|
x0, top, x1, bottom = objects_to_bbox(objects)
|
|
116
118
|
{ x0: x0, top: top, x1: x1, bottom: bottom,
|
|
117
119
|
width: x1 - x0, height: bottom - top }
|
|
118
120
|
end
|
|
119
121
|
|
|
120
|
-
# bbox
|
|
121
|
-
# get_bbox_overlap:
|
|
122
|
+
# Overlapping bbox. No overlap => nil. Matches pdfplumber's
|
|
123
|
+
# get_bbox_overlap: returns the intersection bbox, or nil.
|
|
122
124
|
def bbox_overlap(a, b)
|
|
123
125
|
ax0, atop, ax1, abot = a
|
|
124
126
|
bx0, btop, bx1, bbot = b
|
|
@@ -133,8 +135,8 @@ module Rpdfium
|
|
|
133
135
|
[x0, top, x1, bot]
|
|
134
136
|
end
|
|
135
137
|
|
|
136
|
-
# True
|
|
137
|
-
#
|
|
138
|
+
# True if two bboxes overlap (touching at just a point does not count; there must
|
|
139
|
+
# be positive area).
|
|
138
140
|
def bbox_overlaps?(a, b)
|
|
139
141
|
!bbox_overlap(a, b).nil?
|
|
140
142
|
end
|
data/lib/rpdfium/util/column_inference.rb
CHANGED
|
@@ -2,26 +2,28 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Util
|
|
5
|
-
#
|
|
5
|
+
# Inference of data columns on non-tabular PDFs.
|
|
6
6
|
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
7
|
+
# Identifies groups of words that belong to the same vertical
|
|
8
|
+
# "column" of a layout (e.g. a column of amounts in a pre-printed
|
|
9
|
+
# form) even when no lines are drawn.
|
|
10
10
|
#
|
|
11
|
-
#
|
|
11
|
+
# The algorithm operates in three passes:
|
|
12
12
|
#
|
|
13
|
-
# 1. **Cluster
|
|
14
|
-
# (left-aligned)
|
|
15
|
-
#
|
|
13
|
+
# 1. **Cluster by X coordinate** — groups words with the same x0
|
|
14
|
+
# (left-aligned) or x1 (right-aligned, typical of numbers) within
|
|
15
|
+
# the configurable tolerance.
|
|
16
16
|
#
|
|
17
|
-
# 2. **
|
|
18
|
-
#
|
|
19
|
-
# > 40pt),
|
|
20
|
-
#
|
|
17
|
+
# 2. **Split by vertical gaps** — if two consecutive words in a
|
|
18
|
+
# group have an "anomalous" vertical gap (> 3x the median, or
|
|
19
|
+
# > 40pt), they are separated into distinct columns. Resolves
|
|
20
|
+
# cases such as "fiscal code at the top + table below" that share
|
|
21
|
+
# the same X.
|
|
21
22
|
#
|
|
22
|
-
# 3. **
|
|
23
|
-
#
|
|
24
|
-
#
|
|
23
|
+
# 3. **Filter by density** — a "true" column has regularly
|
|
24
|
+
# equispaced values (coefficient of variation of the gaps <
|
|
25
|
+
# threshold). Excludes false positives such as isolated values
|
|
26
|
+
# that happen to be aligned by chance.
|
|
25
27
|
#
|
|
26
28
|
# @example
|
|
27
29
|
# inference = Rpdfium::Util::ColumnInference.new(
|
|
@@ -31,8 +33,8 @@ module Rpdfium
|
|
|
31
33
|
# )
|
|
32
34
|
# columns = inference.infer(words)
|
|
33
35
|
# # => [
|
|
34
|
-
# # [word1, word2, ..., word12], # 12
|
|
35
|
-
# # [word1, word2, ..., word12] # 12
|
|
36
|
+
# # [word1, word2, ..., word12], # 12 amounts in column 1
|
|
37
|
+
# # [word1, word2, ..., word12] # 12 codes in column 2
|
|
36
38
|
# # ]
|
|
37
39
|
class ColumnInference
|
|
38
40
|
DEFAULT_X_TOLERANCE = 3.0
|
|
@@ -53,27 +55,27 @@ module Rpdfium
|
|
|
53
55
|
@gap_absolute = gap_absolute
|
|
54
56
|
end
|
|
55
57
|
|
|
56
|
-
#
|
|
57
|
-
# x1 (right-align)
|
|
58
|
-
#
|
|
58
|
+
# Infers the columns from the supplied words. Uses both x0
|
|
59
|
+
# (left-align) and x1 (right-align) as alignment criteria, returns
|
|
60
|
+
# the union of the identified columns.
|
|
59
61
|
#
|
|
60
|
-
# @param words [Array<Hash>]
|
|
61
|
-
# @return [Array<Array<Hash>>] array
|
|
62
|
-
#
|
|
62
|
+
# @param words [Array<Hash>] words with :x0, :x1, :top
|
|
63
|
+
# @return [Array<Array<Hash>>] array of columns, each one an array
|
|
64
|
+
# of words ordered by ascending :top
|
|
63
65
|
def infer(words)
|
|
64
66
|
return [] if words.empty?
|
|
65
67
|
|
|
66
68
|
by_x0 = cluster_by(words, :x0)
|
|
67
69
|
by_x1 = cluster_by(words, :x1)
|
|
68
70
|
|
|
69
|
-
#
|
|
70
|
-
#
|
|
71
|
-
#
|
|
71
|
+
# Union: a word may appear in more than one column. It is the
|
|
72
|
+
# caller's responsibility to decide how to handle this (e.g.
|
|
73
|
+
# prefer the first column, or the largest one). Here we return all.
|
|
72
74
|
(by_x0 + by_x1)
|
|
73
75
|
end
|
|
74
76
|
|
|
75
|
-
#
|
|
76
|
-
# @param coord [Symbol] :x0
|
|
77
|
+
# Clusters words by a specific coordinate.
|
|
78
|
+
# @param coord [Symbol] :x0 or :x1
|
|
77
79
|
def cluster_by(words, coord)
|
|
78
80
|
sorted = words.sort_by { |v| v[coord] }
|
|
79
81
|
x_groups = []
|
|
@@ -116,10 +118,10 @@ module Rpdfium
|
|
|
116
118
|
columns
|
|
117
119
|
end
|
|
118
120
|
|
|
119
|
-
#
|
|
120
|
-
#
|
|
121
|
-
#
|
|
122
|
-
#
|
|
121
|
+
# A column is "dense enough" if it has at least min_size values
|
|
122
|
+
# and the coefficient of variation (std_dev/mean) of the vertical
|
|
123
|
+
# gaps is below the threshold. Low CV = regular spacing = a true
|
|
124
|
+
# repetitive column (vs. scattered values accidentally aligned).
|
|
123
125
|
def dense_enough?(col_values)
|
|
124
126
|
return false if col_values.size < @min_size
|
|
125
127
|
|
data/lib/rpdfium/util/label_matcher.rb
CHANGED
|
@@ -2,29 +2,29 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Util
|
|
5
|
-
#
|
|
6
|
-
# (F24,
|
|
7
|
-
#
|
|
5
|
+
# Associates semantic labels with values placed on PDFs of filled-in
|
|
6
|
+
# forms (F24, VAT communications, Modello 770) where template and data
|
|
7
|
+
# coexist as graphic text in different fonts.
|
|
8
8
|
#
|
|
9
|
-
#
|
|
9
|
+
# Base strategy:
|
|
10
10
|
#
|
|
11
|
-
# 1. **Cluster**
|
|
12
|
-
#
|
|
11
|
+
# 1. **Cluster** the template words into "coherent labels": words that
|
|
12
|
+
# are geometrically close form a single label.
|
|
13
13
|
#
|
|
14
|
-
# 2. **
|
|
15
|
-
# - `:col` — label
|
|
16
|
-
# - `:row` — label
|
|
14
|
+
# 2. **For each value** search for:
|
|
15
|
+
# - `:col` — the label ABOVE in the same column
|
|
16
|
+
# - `:row` — the label TO THE LEFT in the same row
|
|
17
17
|
#
|
|
18
|
-
# 3. (
|
|
19
|
-
#
|
|
20
|
-
# ST)
|
|
21
|
-
#
|
|
18
|
+
# 3. (Optional) **Column reassignment**: uses `ColumnInference` to
|
|
19
|
+
# identify repetitive columns (e.g. ST2..ST13 of the 770 Quadro
|
|
20
|
+
# ST) and propagates the canonical header to all the values in the
|
|
21
|
+
# column, overriding the `col_max_dy` limit.
|
|
22
22
|
#
|
|
23
|
-
# @example
|
|
23
|
+
# @example basic usage
|
|
24
24
|
# matcher = Rpdfium::Util::LabelMatcher.new
|
|
25
25
|
# matcher.match(value_words, anchor_words)
|
|
26
26
|
#
|
|
27
|
-
# @example
|
|
27
|
+
# @example with repetitive tables (header at the top of the column)
|
|
28
28
|
# matcher = Rpdfium::Util::LabelMatcher.new(
|
|
29
29
|
# column_inference: Rpdfium::Util::ColumnInference.new
|
|
30
30
|
# )
|
|
@@ -60,11 +60,11 @@ module Rpdfium
|
|
|
60
60
|
@column_inference = column_inference
|
|
61
61
|
end
|
|
62
62
|
|
|
63
|
-
#
|
|
63
|
+
# Computes the label → value associations.
|
|
64
64
|
#
|
|
65
|
-
# @param values [Array<Hash>]
|
|
66
|
-
# @param anchors [Array<Hash>]
|
|
67
|
-
# @return [Array<Hash>]
|
|
65
|
+
# @param values [Array<Hash>] words of the "data" layer
|
|
66
|
+
# @param anchors [Array<Hash>] words of the "template" layer
|
|
67
|
+
# @return [Array<Hash>] one per value: { value:, labels: { col:, row: }, geometry: }
|
|
68
68
|
def match(values, anchors)
|
|
69
69
|
labels = cluster_anchors(anchors)
|
|
70
70
|
|
|
@@ -74,7 +74,7 @@ module Rpdfium
|
|
|
74
74
|
{ value: v, col: col, row: row }
|
|
75
75
|
end
|
|
76
76
|
|
|
77
|
-
#
|
|
77
|
+
# Optional reassignment for repetitive columns
|
|
78
78
|
prelim = reassign_by_columns(prelim, labels, values) if @column_inference
|
|
79
79
|
|
|
80
80
|
prelim.map do |entry|
|
|
@@ -92,8 +92,8 @@ module Rpdfium
|
|
|
92
92
|
end
|
|
93
93
|
end
|
|
94
94
|
|
|
95
|
-
#
|
|
96
|
-
#
|
|
95
|
+
# Reconstructs the labels from the cluster of template words.
|
|
96
|
+
# Exposed publicly for inspection/debug.
|
|
97
97
|
def cluster_anchors(anchor_words)
|
|
98
98
|
remaining = anchor_words.dup
|
|
99
99
|
groups = []
|
|
@@ -145,10 +145,10 @@ module Rpdfium
|
|
|
145
145
|
end
|
|
146
146
|
|
|
147
147
|
def find_col_label(value, labels)
|
|
148
|
-
#
|
|
149
|
-
#
|
|
150
|
-
#
|
|
151
|
-
#
|
|
148
|
+
# For "wide" words (wider than most labels, typically because
|
|
149
|
+
# they result from the merge of a string that spans several
|
|
150
|
+
# template columns) use the left edge: the correct label is the
|
|
151
|
+
# one below which the value STARTS.
|
|
152
152
|
value_width = value[:x1] - value[:x0]
|
|
153
153
|
anchor_point =
|
|
154
154
|
if value_width > WIDE_VALUE_THRESHOLD
|
|
@@ -175,14 +175,14 @@ module Rpdfium
|
|
|
175
175
|
end.max_by { |l| l[:x1] }
|
|
176
176
|
end
|
|
177
177
|
|
|
178
|
-
#
|
|
179
|
-
#
|
|
180
|
-
#
|
|
178
|
+
# Identifies data columns and propagates the canonical header
|
|
179
|
+
# printed at the top of the column to ALL the values of the column.
|
|
180
|
+
# Uses the @column_inference provided to the constructor.
|
|
181
181
|
def reassign_by_columns(prelim, labels, values)
|
|
182
182
|
columns = @column_inference.infer(values)
|
|
183
183
|
return prelim if columns.empty?
|
|
184
184
|
|
|
185
|
-
#
|
|
185
|
+
# Sort larger columns first (more statistical evidence)
|
|
186
186
|
sorted_columns = columns.sort_by { |c| -c.size }
|
|
187
187
|
|
|
188
188
|
column_headers = {}
|
data/lib/rpdfium/util/text_extraction.rb
CHANGED
|
@@ -2,19 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Util
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
5
|
+
# "Linear" text extraction from a collection of chars, layout=False.
|
|
6
|
+
# Equivalent of pdfplumber.utils.text.chars_to_textmap in the variant
|
|
7
|
+
# without preservation of the graphic layout.
|
|
8
8
|
#
|
|
9
|
-
#
|
|
10
|
-
# 1.
|
|
11
|
-
# 2. Cluster
|
|
12
|
-
# 3.
|
|
13
|
-
# 4.
|
|
9
|
+
# Algorithm:
|
|
10
|
+
# 1. Extract words with WordExtractor (same tolerances).
|
|
11
|
+
# 2. Cluster words by `top` with y_tolerance → logical lines.
|
|
12
|
+
# 3. For each line, sort by x0 and join with a single space.
|
|
13
|
+
# 4. Join the lines with "\n".
|
|
14
14
|
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
15
|
+
# NOTE on a subtlety: pdfplumber allows using an x_tolerance different
|
|
16
|
+
# from y_tolerance both for word-extraction and for line-clustering.
|
|
17
|
+
# We replicate this flexibility.
|
|
18
18
|
module TextExtraction
|
|
19
19
|
module_function
|
|
20
20
|
|
|
@@ -34,12 +34,12 @@ module Rpdfium
|
|
|
34
34
|
).extract_words(chars)
|
|
35
35
|
return "" if words.empty?
|
|
36
36
|
|
|
37
|
-
# Cluster
|
|
38
|
-
#
|
|
39
|
-
#
|
|
37
|
+
# Cluster the WORDS by top: final output lines.
|
|
38
|
+
# Uses the "line" y_tolerance — pdfplumber here uses the same
|
|
39
|
+
# y_tolerance passed in, consistent with how extract_text behaves.
|
|
40
40
|
line_clusters = Cluster.cluster_objects(words, :top, tolerance: y_tolerance)
|
|
41
41
|
|
|
42
|
-
#
|
|
42
|
+
# For each output line, join with a single space.
|
|
43
43
|
line_clusters.map do |line_words|
|
|
44
44
|
line_words.sort_by { |w| w[:x0] }.map { |w| w[:text] }.join(" ")
|
|
45
45
|
end.join("\n")
|