rpdfium 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +615 -1317
- data/README.md +73 -78
- data/lib/rpdfium/annotation/annotation.rb +10 -8
- data/lib/rpdfium/document.rb +49 -22
- data/lib/rpdfium/errors.rb +2 -2
- data/lib/rpdfium/form/form.rb +9 -9
- data/lib/rpdfium/image/embedded.rb +17 -16
- data/lib/rpdfium/io/png.rb +9 -9
- data/lib/rpdfium/page.rb +561 -526
- data/lib/rpdfium/raw.rb +216 -203
- data/lib/rpdfium/search/search.rb +5 -5
- data/lib/rpdfium/structure/attachment.rb +6 -6
- data/lib/rpdfium/structure/element.rb +74 -74
- data/lib/rpdfium/structure/outline.rb +2 -2
- data/lib/rpdfium/structure/tree.rb +56 -55
- data/lib/rpdfium/table/cells.rb +36 -33
- data/lib/rpdfium/table/debugger.rb +12 -12
- data/lib/rpdfium/table/edges.rb +51 -49
- data/lib/rpdfium/table/extractor.rb +35 -34
- data/lib/rpdfium/table/table.rb +65 -62
- data/lib/rpdfium/util/cluster.rb +35 -33
- data/lib/rpdfium/util/column_inference.rb +34 -32
- data/lib/rpdfium/util/label_matcher.rb +30 -30
- data/lib/rpdfium/util/text_extraction.rb +15 -15
- data/lib/rpdfium/util/word_extractor.rb +49 -48
- data/lib/rpdfium/util/word_merger.rb +25 -24
- data/lib/rpdfium/version.rb +1 -1
- data/lib/rpdfium.rb +17 -15
- metadata +1 -1
data/lib/rpdfium/page.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
4
|
+
# Page wrapper. Lazy-loads the TextPage. All returned coordinates are
|
|
5
|
+
# in the page's "top-down" space: (0,0) is at the top left, x grows
|
|
6
|
+
# toward the right, y toward the bottom. PDFium uses "bottom-up" — the
|
|
7
|
+
# conversion happens here once and for all.
|
|
8
8
|
class Page
|
|
9
9
|
attr_reader :document, :index
|
|
10
10
|
|
|
@@ -15,10 +15,10 @@ module Rpdfium
|
|
|
15
15
|
raise PageError, "Could not load page #{index}" if handle.null?
|
|
16
16
|
|
|
17
17
|
@text_page = nil
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
# @document
|
|
21
|
-
# Page (FPDF_ClosePage
|
|
18
|
+
# State shared with the finalizer: idempotent on close, survives GC
|
|
19
|
+
# without making a double FPDF_ClosePage call. Holding a reference to
|
|
20
|
+
# @document guarantees that the Document is not collected before the
|
|
21
|
+
# Page (FPDF_ClosePage requires the Document still alive).
|
|
22
22
|
@state = { handle: handle, closed: false }
|
|
23
23
|
ObjectSpace.define_finalizer(self, self.class.finalizer(@state))
|
|
24
24
|
end
|
|
@@ -37,12 +37,12 @@ module Rpdfium
|
|
|
37
37
|
@state[:handle]
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
# =====
|
|
40
|
+
# ===== Geometry =====
|
|
41
41
|
|
|
42
42
|
def width; Raw.FPDF_GetPageWidthF(@state[:handle]); end
|
|
43
43
|
def height; Raw.FPDF_GetPageHeightF(@state[:handle]); end
|
|
44
44
|
|
|
45
|
-
#
|
|
45
|
+
# Rotation in degrees: 0/90/180/270
|
|
46
46
|
def rotation
|
|
47
47
|
[0, 90, 180, 270][Raw.FPDFPage_GetRotation(@state[:handle])] || 0
|
|
48
48
|
end
|
|
@@ -53,10 +53,10 @@ module Rpdfium
|
|
|
53
53
|
|
|
54
54
|
BOX_FUNCTIONS = {
|
|
55
55
|
media: :FPDFPage_GetMediaBox,
|
|
56
|
-
crop:
|
|
56
|
+
crop: :FPDFPage_GetCropBox,
|
|
57
57
|
bleed: :FPDFPage_GetBleedBox,
|
|
58
|
-
trim:
|
|
59
|
-
art:
|
|
58
|
+
trim: :FPDFPage_GetTrimBox,
|
|
59
|
+
art: :FPDFPage_GetArtBox
|
|
60
60
|
}.freeze
|
|
61
61
|
|
|
62
62
|
def box(kind = :crop)
|
|
@@ -71,19 +71,20 @@ module Rpdfium
|
|
|
71
71
|
right: r.read_float, top: t.read_float }
|
|
72
72
|
end
|
|
73
73
|
|
|
74
|
-
#
|
|
75
|
-
# [x0, top, x1, bottom] in
|
|
76
|
-
#
|
|
77
|
-
#
|
|
74
|
+
# pdfplumber-compatible accessors. Return the box as the tuple
|
|
75
|
+
# [x0, top, x1, bottom] in top-down coordinates (the same system
|
|
76
|
+
# used by chars, edges, table cells). Return nil if the box is not
|
|
77
|
+
# defined in the PDF (e.g. ArtBox or BleedBox are often absent).
|
|
78
78
|
#
|
|
79
|
-
#
|
|
80
|
-
# crop = page.cropbox # → [0.0, 0.0, 595.28, 841.88]
|
|
81
|
-
# crop != [0, 0, page.width, page.height] # PDF
|
|
79
|
+
# Usage example:
|
|
80
|
+
# crop = page.cropbox # → [0.0, 0.0, 595.28, 841.88] or nil
|
|
81
|
+
# crop != [0, 0, page.width, page.height] # PDF has an explicit crop
|
|
82
82
|
def mediabox; box_to_topdown(box(:media)); end
|
|
83
83
|
|
|
84
|
-
# PDF spec 14.11.2:
|
|
85
|
-
#
|
|
86
|
-
#
|
|
84
|
+
# PDF spec 14.11.2: if CropBox is absent, the default is MediaBox. The
|
|
85
|
+
# cropbox is the "visible" area of the page; for PDFs from business
|
|
86
|
+
# software it often coincides with the MediaBox. pdfplumber performs the
|
|
87
|
+
# fallback automatically.
|
|
87
88
|
def cropbox
|
|
88
89
|
box_to_topdown(box(:crop)) || mediabox
|
|
89
90
|
end
|
|
@@ -92,112 +93,120 @@ module Rpdfium
|
|
|
92
93
|
def trimbox; box_to_topdown(box(:trim)); end
|
|
93
94
|
def artbox; box_to_topdown(box(:art)); end
|
|
94
95
|
|
|
95
|
-
# =====
|
|
96
|
+
# ===== Text ("simple" version) =====
|
|
96
97
|
|
|
97
98
|
def text
|
|
98
99
|
tp = text_page
|
|
99
100
|
n = tp.char_count
|
|
100
|
-
return
|
|
101
|
+
return '' if n.zero?
|
|
101
102
|
|
|
102
103
|
buf = FFI::MemoryPointer.new(:ushort, n + 1)
|
|
103
104
|
Raw.FPDFText_GetText(tp.handle, 0, n, buf)
|
|
104
|
-
buf.read_bytes((n + 1) * 2)
|
|
105
|
-
.encode("UTF-8", invalid: :replace, undef: :replace)
|
|
106
|
-
.delete("\x00")
|
|
105
|
+
decode_utf16le(buf.read_bytes((n + 1) * 2), replace: true)
|
|
107
106
|
end
|
|
108
107
|
|
|
109
|
-
#
|
|
110
|
-
#
|
|
108
|
+
# Extracts the text inside an arbitrary bbox (top-down coords).
|
|
109
|
+
# Useful for "read the header of this cell".
|
|
111
110
|
def text_in_bbox(left:, top:, right:, bottom:)
|
|
112
111
|
tp = text_page
|
|
113
112
|
h = height
|
|
114
|
-
#
|
|
113
|
+
# Convert to bottom-up for PDFium
|
|
115
114
|
pdf_top = h - top
|
|
116
115
|
pdf_bottom = h - bottom
|
|
117
|
-
# PDFium
|
|
116
|
+
# PDFium wants: left, top, right, bottom where top > bottom (PDF coords)
|
|
118
117
|
# Probe size:
|
|
119
118
|
n = Raw.FPDFText_GetBoundedText(
|
|
120
119
|
tp.handle, left, pdf_top, right, pdf_bottom, FFI::Pointer::NULL, 0
|
|
121
120
|
)
|
|
122
|
-
return
|
|
121
|
+
return '' if n <= 0
|
|
123
122
|
|
|
124
123
|
buf = FFI::MemoryPointer.new(:ushort, n)
|
|
125
124
|
Raw.FPDFText_GetBoundedText(
|
|
126
125
|
tp.handle, left, pdf_top, right, pdf_bottom, buf, n
|
|
127
126
|
)
|
|
128
|
-
buf.read_bytes(n * 2)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
#
|
|
134
|
-
|
|
135
|
-
#
|
|
136
|
-
# :
|
|
137
|
-
# :
|
|
138
|
-
# :
|
|
139
|
-
# :
|
|
140
|
-
# :
|
|
141
|
-
# :
|
|
142
|
-
# :
|
|
143
|
-
#
|
|
144
|
-
#
|
|
145
|
-
#
|
|
146
|
-
#
|
|
147
|
-
#
|
|
148
|
-
#
|
|
149
|
-
# :
|
|
150
|
-
# :hyphen true se trattino di sillabazione
|
|
151
|
-
# :unicode_error true se PDFium non ha potuto mapparlo
|
|
127
|
+
decode_utf16le(buf.read_bytes(n * 2), replace: true)
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
# ===== Characters (char-level) =====
|
|
131
|
+
|
|
132
|
+
# Returns every char with rich metadata:
|
|
133
|
+
# :char string (1 codepoint)
|
|
134
|
+
# :x0,:x1 horizontal bbox
|
|
135
|
+
# :top,:bottom vertical bbox (top-down: top < bottom)
|
|
136
|
+
# :origin_x, :origin_y glyph insertion point (top-down)
|
|
137
|
+
# :angle glyph rotation angle (radians)
|
|
138
|
+
# :fontsize size in points
|
|
139
|
+
# :font font name (if available)
|
|
140
|
+
# :weight weight (e.g. 400=regular, 700=bold)
|
|
141
|
+
# :render_mode rendering mode (fill/stroke/invisible). Read via
|
|
142
|
+
# the text object that contains the char (PDFium no
|
|
143
|
+
# longer exposes a char-level API after chromium/6611).
|
|
144
|
+
# nil on old PDFium builds that do not support the
|
|
145
|
+
# char→object lookup.
|
|
146
|
+
# :generated true if inserted by PDFium (e.g. synthetic spaces)
|
|
147
|
+
# :hyphen true if a hyphenation hyphen
|
|
148
|
+
# :unicode_error true if PDFium could not map it
|
|
152
149
|
#
|
|
153
|
-
# `loose: true` (DEFAULT)
|
|
154
|
-
#
|
|
155
|
-
#
|
|
156
|
-
#
|
|
157
|
-
#
|
|
158
|
-
#
|
|
159
|
-
# baseline.
|
|
160
|
-
#
|
|
161
|
-
#
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
150
|
+
# `loose: true` (DEFAULT) uses FPDFText_GetLooseCharBox: all chars on
|
|
151
|
+
# the same logical line share the same vertical bbox (top/bottom),
|
|
152
|
+
# proportional to the font size rather than to the individual glyph.
|
|
153
|
+
# This is exactly the behavior of pdfminer.six/pdfplumber, and the only
|
|
154
|
+
# one that lets the midpoint test in Table#extract also capture
|
|
155
|
+
# punctuation chars (`.`, `,`) along with the numbers aligned to the
|
|
156
|
+
# baseline. With `loose: false` you get the "tight" bbox of the single
|
|
157
|
+
# glyph, useful for fine layout measurements but wrong for the table
|
|
158
|
+
# cell filter.
|
|
159
|
+
# `geometry: true` is a stronger form of `lean` reserved for the
|
|
160
|
+
# table/word pipeline: on top of `lean` it ALSO skips the per-char
|
|
161
|
+
# origin (FPDFText_GetCharOrigin) and the text-object lookup
|
|
162
|
+
# (FPDFText_GetTextObject + GetFont/GetFontSize/GetTextRenderMode/
|
|
163
|
+
# GetText), and emits a 6-key hash (char, x0, x1, top, bottom,
|
|
164
|
+
# generated) instead of the full one. Those are exactly the fields the
|
|
165
|
+
# WordExtractor / Table pipeline reads; cutting the rest removes ~3 FFI
|
|
166
|
+
# roundtrips per char and a large amount of hash allocation, which on a
|
|
167
|
+
# page with thousands of chars is the dominant cost of extract_tables.
|
|
168
|
+
# Unlike `lean` (which keeps the full hash shape, just with nil/false
|
|
169
|
+
# metadata), `geometry` changes the hash shape, so it is NOT a drop-in
|
|
170
|
+
# for general char consumers — only for the geometry-only pipeline.
|
|
171
|
+
def chars(loose: true, inject_spaces: true, lean: false, geometry: false)
|
|
172
|
+
# Cache: chars() is called once by Table#extract and then again by
|
|
173
|
+
# WordExtractor (going through Extractor#page_words if
|
|
174
|
+
# vertical/horizontal_strategy is :text). Each call costs O(n) FFI
|
|
175
|
+
# roundtrips per char — expensive on pages with thousands of chars.
|
|
176
|
+
cache_key = [loose, inject_spaces, lean, geometry]
|
|
168
177
|
@chars_cache ||= {}
|
|
169
178
|
return @chars_cache[cache_key] if @chars_cache.key?(cache_key)
|
|
170
179
|
|
|
171
|
-
raw = compute_chars(loose: loose, lean: lean)
|
|
180
|
+
raw = geometry ? compute_geometry_chars(loose: loose) : compute_chars(loose: loose, lean: lean)
|
|
172
181
|
result = inject_spaces ? rebuild_word_separators(raw) : raw
|
|
173
182
|
@chars_cache[cache_key] = result
|
|
174
183
|
end
|
|
175
184
|
|
|
176
|
-
#
|
|
177
|
-
#
|
|
178
|
-
#
|
|
179
|
-
#
|
|
185
|
+
# Rebuilds the spaces that separate words based on the GEOMETRY of the
|
|
186
|
+
# "real" chars, completely discarding PDFium's synthetic spaces (which
|
|
187
|
+
# are unreliable: PDFium emits them aggressively even between digits of
|
|
188
|
+
# numbers like "2.895,26").
|
|
180
189
|
#
|
|
181
|
-
#
|
|
182
|
-
# 1.
|
|
183
|
-
#
|
|
184
|
-
# 2. Cluster
|
|
185
|
-
# 3.
|
|
186
|
-
#
|
|
187
|
-
#
|
|
188
|
-
# (bbox
|
|
190
|
+
# Algorithm:
|
|
191
|
+
# 1. Filter out all :generated chars (typically synthetic spaces
|
|
192
|
+
# with a degenerate bbox).
|
|
193
|
+
# 2. Cluster the remaining chars by row (top tolerance 1pt).
|
|
194
|
+
# 3. Within each row, sort by x0 and for each consecutive pair
|
|
195
|
+
# compute gap = next.x0 - prev.x1 and char_w = (prev.w + next.w) / 2.
|
|
196
|
+
# If gap > 0.275 × char_w → insert a new synthetic space
|
|
197
|
+
# (bbox normalized to the top/bottom of the chars).
|
|
189
198
|
#
|
|
190
|
-
#
|
|
191
|
-
#
|
|
192
|
-
#
|
|
193
|
-
# dataset
|
|
194
|
-
#
|
|
195
|
-
#
|
|
199
|
+
# Threshold 0.275: tuned empirically on a real TeamSystem PDF.
|
|
200
|
+
# Measured distribution: intra-word gap max ratio 0.24, inter-word
|
|
201
|
+
# gap min ratio 0.31. Classification 100% correct on the training
|
|
202
|
+
# dataset (1400 intra + 663 inter cases). pdfminer.six uses 0.1
|
|
203
|
+
# internally (`word_margin`) but with additional info from the font
|
|
204
|
+
# advance, not available from PDFium.
|
|
196
205
|
def rebuild_word_separators(chars)
|
|
197
206
|
reals = chars.reject { |c| c[:generated] }
|
|
198
207
|
return chars if reals.empty?
|
|
199
208
|
|
|
200
|
-
# Cluster
|
|
209
|
+
# Cluster by row, preserving the top ordering
|
|
201
210
|
sorted_top = reals.sort_by { |c| c[:top] }
|
|
202
211
|
rows = []
|
|
203
212
|
sorted_top.each do |c|
|
|
@@ -216,19 +225,19 @@ module Rpdfium
|
|
|
216
225
|
if prev
|
|
217
226
|
gap = c[:x0] - prev[:x1]
|
|
218
227
|
|
|
219
|
-
#
|
|
220
|
-
#
|
|
221
|
-
# →
|
|
228
|
+
# Signal from the PDF content stream: prev.text_obj_ends_with_space.
|
|
229
|
+
# If prev does NOT end a token (false), the gap is internal
|
|
230
|
+
# kerning → never insert a space.
|
|
222
231
|
#
|
|
223
|
-
#
|
|
224
|
-
# -
|
|
225
|
-
# -
|
|
226
|
-
#
|
|
232
|
+
# If prev ends a token (true), it may be:
|
|
233
|
+
# - a real word end (relatively large geometric gap)
|
|
234
|
+
# - a syntactic token end (e.g. between digits and punctuation
|
|
235
|
+
# of a number "2", "."), with a small gap.
|
|
227
236
|
#
|
|
228
|
-
#
|
|
229
|
-
#
|
|
230
|
-
#
|
|
231
|
-
#
|
|
237
|
+
# We discriminate with the geometric threshold combined with the
|
|
238
|
+
# typographic "context": if the pair (prev_char, curr_char) looks
|
|
239
|
+
# like a numeric context (digits + punctuation), we use a higher
|
|
240
|
+
# threshold; otherwise the normal threshold.
|
|
232
241
|
obj_signal_present = prev.key?(:text_obj_ends_with_space)
|
|
233
242
|
obj_says_continues = obj_signal_present && !prev[:text_obj_ends_with_space]
|
|
234
243
|
|
|
@@ -246,11 +255,11 @@ module Rpdfium
|
|
|
246
255
|
result
|
|
247
256
|
end
|
|
248
257
|
|
|
249
|
-
# True
|
|
250
|
-
#
|
|
251
|
-
#
|
|
252
|
-
#
|
|
253
|
-
#
|
|
258
|
+
# True if the pair (prev_char, curr_char) is a "numeric" context:
|
|
259
|
+
# digit-punctuation, punctuation-digit, or digit-digit. In these
|
|
260
|
+
# cases a modest gap is probably kerning internal to the number, not
|
|
261
|
+
# a word boundary. A higher threshold avoids splitting numbers like
|
|
262
|
+
# "2.895,26" into "2 . 895 , 26".
|
|
254
263
|
NUMERIC_PUNCT = %w[. , ].freeze
|
|
255
264
|
|
|
256
265
|
def numeric_context?(prev_char, curr_char)
|
|
@@ -261,23 +270,22 @@ module Rpdfium
|
|
|
261
270
|
prev_num && curr_num
|
|
262
271
|
end
|
|
263
272
|
|
|
264
|
-
#
|
|
265
|
-
#
|
|
266
|
-
#
|
|
267
|
-
#
|
|
273
|
+
# Returns the "reference" width for computing the gap/width ratio.
|
|
274
|
+
# Prefers the advance (more stable than the bbox for chars with
|
|
275
|
+
# post-applied kerning). If either char lacks an advance, falls back
|
|
276
|
+
# to the max of the bbox widths.
|
|
268
277
|
def best_reference_width(a, b)
|
|
269
278
|
a_adv = a[:advance]
|
|
270
279
|
b_adv = b[:advance]
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
end
|
|
280
|
+
|
|
281
|
+
return [a_adv, b_adv].max if a_adv && b_adv
|
|
282
|
+
|
|
283
|
+
[(a[:x1] - a[:x0]), (b[:x1] - b[:x0])].max
|
|
276
284
|
end
|
|
277
285
|
|
|
278
286
|
def build_synthetic_space(prev, c)
|
|
279
287
|
{
|
|
280
|
-
char:
|
|
288
|
+
char: ' ', codepoint: 32,
|
|
281
289
|
x0: prev[:x1], x1: c[:x0],
|
|
282
290
|
top: prev[:top], bottom: prev[:bottom],
|
|
283
291
|
origin_x: prev[:x1], origin_y: prev[:origin_y],
|
|
@@ -294,21 +302,15 @@ module Rpdfium
|
|
|
294
302
|
n = tp.char_count
|
|
295
303
|
return [] if n.zero?
|
|
296
304
|
|
|
297
|
-
#
|
|
298
|
-
h = height
|
|
299
|
-
w = width
|
|
305
|
+
# Page geometry after applying the PDF rotation.
|
|
300
306
|
page_rotation = rotation
|
|
301
|
-
|
|
302
|
-
raw_w, raw_h = case page_rotation
|
|
303
|
-
when 90, 270 then [h, w]
|
|
304
|
-
else [w, h]
|
|
305
|
-
end
|
|
307
|
+
raw_w, raw_h = rotated_dimensions(page_rotation)
|
|
306
308
|
|
|
307
309
|
result = Array.new(n)
|
|
308
310
|
|
|
309
|
-
#
|
|
310
|
-
# MemoryPointer.new
|
|
311
|
-
# char
|
|
311
|
+
# FFI buffers reused across all loop iterations.
|
|
312
|
+
# MemoryPointer.new is non-trivial (~µs each); allocating O(n) of them
|
|
313
|
+
# per char is the main cost of compute_chars after the FFI calls.
|
|
312
314
|
l = FFI::MemoryPointer.new(:double)
|
|
313
315
|
r = FFI::MemoryPointer.new(:double)
|
|
314
316
|
b = FFI::MemoryPointer.new(:double)
|
|
@@ -332,11 +334,11 @@ module Rpdfium
|
|
|
332
334
|
origin_x_raw = ox.read_double
|
|
333
335
|
origin_y_raw = oy.read_double
|
|
334
336
|
|
|
335
|
-
# Font name:
|
|
337
|
+
# Font name: skipped in lean mode (1 FFI call saved per char).
|
|
336
338
|
font_name = nil
|
|
337
339
|
unless lean
|
|
338
340
|
n_bytes = Raw.FPDFText_GetFontInfo(tp_handle, i, font_buf, 256, flags_buf)
|
|
339
|
-
font_name = font_buf.read_bytes(n_bytes - 1).force_encoding(
|
|
341
|
+
font_name = font_buf.read_bytes(n_bytes - 1).force_encoding(Encoding::UTF_8.to_s) if n_bytes > 1
|
|
340
342
|
end
|
|
341
343
|
|
|
342
344
|
cp = Raw.FPDFText_GetUnicode(tp_handle, i)
|
|
@@ -351,9 +353,9 @@ module Rpdfium
|
|
|
351
353
|
fetch_text_obj_info(text_obj, tp, text_obj_cache,
|
|
352
354
|
fs_buf: fs_buf, text_buf: text_obj_text_buf)
|
|
353
355
|
|
|
354
|
-
# Advance: 2 FFI per char (GetGlyphWidth + GetMatrix). In lean
|
|
355
|
-
# mode
|
|
356
|
-
#
|
|
356
|
+
# Advance: 2 FFI calls per char (GetGlyphWidth + GetMatrix). In lean
|
|
357
|
+
# mode we skip it — best_reference_width falls back to bbox-width
|
|
358
|
+
# which works just as well for the word-boundary discriminant.
|
|
357
359
|
advance = if lean
|
|
358
360
|
nil
|
|
359
361
|
else
|
|
@@ -366,30 +368,30 @@ module Rpdfium
|
|
|
366
368
|
x0, x1, y_top, y_bot,
|
|
367
369
|
origin_x_raw, origin_y_raw)
|
|
368
370
|
|
|
369
|
-
# In lean mode
|
|
371
|
+
# In lean mode we skip 5 FFI calls per char:
|
|
370
372
|
# GetCharAngle, GetFontWeight, IsHyphen, HasUnicodeMapError,
|
|
371
|
-
# (
|
|
372
|
-
#
|
|
373
|
-
# (
|
|
374
|
-
#
|
|
373
|
+
# (and the GetFontSize fallback if font_size_for_obj is nil).
|
|
374
|
+
# On pages with thousands of chars the saving is significant
|
|
375
|
+
# (tens of ms). The metadata come out nil/false, which is the
|
|
376
|
+
# neutral value for the internal text/tables/words pipeline.
|
|
375
377
|
result[i] =
|
|
376
378
|
if lean
|
|
377
379
|
{
|
|
378
|
-
char:
|
|
380
|
+
char: safe_codepoint(cp),
|
|
379
381
|
codepoint: cp,
|
|
380
|
-
x0:
|
|
381
|
-
x1:
|
|
382
|
-
top:
|
|
383
|
-
bottom:
|
|
382
|
+
x0: td_x0,
|
|
383
|
+
x1: td_x1,
|
|
384
|
+
top: td_top,
|
|
385
|
+
bottom: td_bottom,
|
|
384
386
|
origin_x: td_ox,
|
|
385
387
|
origin_y: td_oy,
|
|
386
|
-
angle:
|
|
388
|
+
angle: nil,
|
|
387
389
|
fontsize: font_size_for_obj,
|
|
388
|
-
font:
|
|
389
|
-
weight:
|
|
390
|
-
render_mode:
|
|
391
|
-
generated:
|
|
392
|
-
hyphen:
|
|
390
|
+
font: nil,
|
|
391
|
+
weight: nil,
|
|
392
|
+
render_mode: rm,
|
|
393
|
+
generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
|
|
394
|
+
hyphen: false,
|
|
393
395
|
unicode_error: false,
|
|
394
396
|
advance: advance,
|
|
395
397
|
text_obj_id: text_obj && !text_obj.null? ? text_obj.address : nil,
|
|
@@ -397,21 +399,21 @@ module Rpdfium
|
|
|
397
399
|
}
|
|
398
400
|
else
|
|
399
401
|
{
|
|
400
|
-
char:
|
|
402
|
+
char: safe_codepoint(cp),
|
|
401
403
|
codepoint: cp,
|
|
402
|
-
x0:
|
|
403
|
-
x1:
|
|
404
|
-
top:
|
|
405
|
-
bottom:
|
|
404
|
+
x0: td_x0,
|
|
405
|
+
x1: td_x1,
|
|
406
|
+
top: td_top,
|
|
407
|
+
bottom: td_bottom,
|
|
406
408
|
origin_x: td_ox,
|
|
407
409
|
origin_y: td_oy,
|
|
408
|
-
angle:
|
|
410
|
+
angle: Raw.FPDFText_GetCharAngle(tp_handle, i),
|
|
409
411
|
fontsize: font_size_for_obj || Raw.FPDFText_GetFontSize(tp_handle, i),
|
|
410
|
-
font:
|
|
411
|
-
weight:
|
|
412
|
-
render_mode:
|
|
413
|
-
generated:
|
|
414
|
-
hyphen:
|
|
412
|
+
font: font_name,
|
|
413
|
+
weight: Raw.FPDFText_GetFontWeight(tp_handle, i),
|
|
414
|
+
render_mode: rm,
|
|
415
|
+
generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
|
|
416
|
+
hyphen: Raw.FPDFText_IsHyphen(tp_handle, i) == 1,
|
|
415
417
|
unicode_error: Raw.FPDFText_HasUnicodeMapError(tp_handle, i) == 1,
|
|
416
418
|
advance: advance,
|
|
417
419
|
text_obj_id: text_obj && !text_obj.null? ? text_obj.address : nil,
|
|
@@ -422,67 +424,137 @@ module Rpdfium
|
|
|
422
424
|
result
|
|
423
425
|
end
|
|
424
426
|
|
|
425
|
-
#
|
|
427
|
+
# Minimal char extraction for the table/word pipeline. See `chars`
|
|
428
|
+
# `geometry:` for the rationale. Compared to compute_chars(lean: true)
|
|
429
|
+
# this skips, per char: FPDFText_GetCharOrigin (origin is never read by
|
|
430
|
+
# the pipeline) and the per-char angle/font/weight/render-mode reads,
|
|
431
|
+
# the page rotation is applied inline (no origin, no intermediate
|
|
432
|
+
# 6-tuple allocation), and the result hash carries only the fields the
|
|
433
|
+
# WordExtractor / Table / rebuild_word_separators path reads.
|
|
426
434
|
#
|
|
427
|
-
#
|
|
428
|
-
#
|
|
429
|
-
#
|
|
435
|
+
# `text_obj_ends_with_space` is intentionally KEPT: rebuild_word_separators
|
|
436
|
+
# uses it as the content-stream "token end" signal that distinguishes a
|
|
437
|
+
# word boundary from internal numeric kerning (e.g. "2.895,26"). Dropping
|
|
438
|
+
# it would change word splitting on PDFs that rely on that signal, so the
|
|
439
|
+
# GetTextObject lookup stays (its info tuple is cached per text object).
|
|
440
|
+
def compute_geometry_chars(loose:)
|
|
441
|
+
tp = text_page
|
|
442
|
+
n = tp.char_count
|
|
443
|
+
return [] if n.zero?
|
|
444
|
+
|
|
445
|
+
page_rotation = rotation
|
|
446
|
+
raw_w, raw_h = rotated_dimensions(page_rotation)
|
|
447
|
+
|
|
448
|
+
result = Array.new(n)
|
|
449
|
+
|
|
450
|
+
# FFI buffers reused across all iterations (see compute_chars).
|
|
451
|
+
l = FFI::MemoryPointer.new(:double)
|
|
452
|
+
r = FFI::MemoryPointer.new(:double)
|
|
453
|
+
b = FFI::MemoryPointer.new(:double)
|
|
454
|
+
t = FFI::MemoryPointer.new(:double)
|
|
455
|
+
rect = Raw::FS_RECTF.new
|
|
456
|
+
fs_buf = FFI::MemoryPointer.new(:float)
|
|
457
|
+
text_obj_text_buf = FFI::MemoryPointer.new(:uint8, TEXT_OBJ_INITIAL_BUF_BYTES)
|
|
458
|
+
text_obj_cache = {}
|
|
459
|
+
tp_handle = tp.handle
|
|
460
|
+
|
|
461
|
+
n.times do |i|
|
|
462
|
+
x0, x1, y_top, y_bot = read_char_bbox(tp, i, loose, l, r, b, t, rect)
|
|
463
|
+
|
|
464
|
+
text_obj = begin
|
|
465
|
+
Raw.FPDFText_GetTextObject(tp_handle, i)
|
|
466
|
+
rescue Rpdfium::LoadError
|
|
467
|
+
nil
|
|
468
|
+
end
|
|
469
|
+
_, _, _, ends_with_space =
|
|
470
|
+
fetch_text_obj_info(text_obj, tp, text_obj_cache,
|
|
471
|
+
fs_buf: fs_buf, text_buf: text_obj_text_buf)
|
|
472
|
+
|
|
473
|
+
# Inline page-rotation → top-down coords (mirror of
|
|
474
|
+
# apply_page_rotation_to_char, dropping the origin outputs).
|
|
475
|
+
td_x0, td_x1, td_top, td_bottom =
|
|
476
|
+
case page_rotation
|
|
477
|
+
when 90 then [y_bot, y_top, x0, x1]
|
|
478
|
+
when 180 then [raw_w - x1, raw_w - x0, y_bot, y_top]
|
|
479
|
+
when 270 then [raw_h - y_top, raw_h - y_bot, raw_w - x1, raw_w - x0]
|
|
480
|
+
else # 0, nil, or non-multiple-of-90 fallback
|
|
481
|
+
[x0, x1, raw_h - y_top, raw_h - y_bot]
|
|
482
|
+
end
|
|
483
|
+
|
|
484
|
+
result[i] = {
|
|
485
|
+
char: safe_codepoint(Raw.FPDFText_GetUnicode(tp_handle, i)),
|
|
486
|
+
x0: td_x0,
|
|
487
|
+
x1: td_x1,
|
|
488
|
+
top: td_top,
|
|
489
|
+
bottom: td_bottom,
|
|
490
|
+
generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
|
|
491
|
+
text_obj_ends_with_space: ends_with_space
|
|
492
|
+
}
|
|
493
|
+
end
|
|
494
|
+
result
|
|
495
|
+
end
|
|
496
|
+
|
|
497
|
+
# Applies the page rotation to a char's coordinates.
|
|
430
498
|
#
|
|
431
|
-
#
|
|
432
|
-
#
|
|
433
|
-
#
|
|
499
|
+
# Input: raw PDFium coords (bottom-up, pre-rotation) of a bbox
|
|
500
|
+
# `[x0, x1, y_top, y_bot]` (with y_top > y_bot because bottom-up) and
|
|
501
|
+
# of an origin point.
|
|
434
502
|
#
|
|
435
|
-
#
|
|
436
|
-
#
|
|
437
|
-
#
|
|
438
|
-
# rotazione per allineare al rendering.
|
|
503
|
+
# Output: top-down coords in the post-rotation page system, in the
|
|
504
|
+
# standard rpdfium convention: `[x0, x1, top, bottom]` with
|
|
505
|
+
# `top < bottom`. Consistent with pdfplumber.
|
|
439
506
|
#
|
|
440
|
-
#
|
|
441
|
-
#
|
|
442
|
-
#
|
|
443
|
-
#
|
|
444
|
-
#
|
|
507
|
+
# PDFium convention: GetRotation = N means the displayed page is
|
|
508
|
+
# rotated by N*90° clockwise relative to the raw content stream
|
|
509
|
+
# system. PDFium returns the coords in the raw system; we apply the
|
|
510
|
+
# rotation to align with the rendering.
|
|
511
|
+
#
|
|
512
|
+
# Case 0°: identity + bottom-up→top-down.
|
|
513
|
+
# Case 90° CW: a bbox wide in x becomes tall in y. The raw x_min (left)
|
|
514
|
+
# coincides with the top of the post-rotation system.
|
|
515
|
+
# Case 180°: flips both axes.
|
|
516
|
+
# Case 270° CW: a bbox wide in x becomes tall in y, but flipped vertically.
|
|
445
517
|
def apply_page_rotation_to_char(rotation, raw_w, raw_h,
|
|
446
518
|
x0, x1, y_top, y_bot,
|
|
447
519
|
origin_x, origin_y)
|
|
448
520
|
case rotation
|
|
449
521
|
when 0, nil
|
|
450
|
-
#
|
|
522
|
+
# No rotation. Standard bottom-up → top-down.
|
|
451
523
|
# page_h_post == raw_h.
|
|
452
524
|
[x0, x1, raw_h - y_top, raw_h - y_bot,
|
|
453
525
|
origin_x, raw_h - origin_y]
|
|
454
526
|
|
|
455
527
|
when 90
|
|
456
|
-
# 90° CW.
|
|
457
|
-
#
|
|
528
|
+
# 90° CW. Post-rotation dimensions: w=raw_h, h=raw_w.
|
|
529
|
+
# Transform: x_post = y_raw, y_post = raw_w - x_raw (bottom-up).
|
|
458
530
|
# In top-down: top = x_min_raw, bottom = x_max_raw.
|
|
459
|
-
new_x0 = y_bot #
|
|
460
|
-
new_x1 = y_top #
|
|
461
|
-
new_top = x0 #
|
|
462
|
-
new_bottom = x1 #
|
|
531
|
+
new_x0 = y_bot # small y_raw → small x_post
|
|
532
|
+
new_x1 = y_top # large y_raw → large x_post
|
|
533
|
+
new_top = x0 # small x_raw → small top (high)
|
|
534
|
+
new_bottom = x1 # large x_raw → large bottom (low)
|
|
463
535
|
new_ox = origin_y
|
|
464
536
|
new_oy = origin_x # top-down origin_y = x_raw
|
|
465
537
|
[new_x0, new_x1, new_top, new_bottom, new_ox, new_oy]
|
|
466
538
|
|
|
467
539
|
when 180
|
|
468
|
-
# 180°.
|
|
469
|
-
#
|
|
540
|
+
# 180°. Post-rotation dimensions: unchanged (raw_w × raw_h).
|
|
541
|
+
# Transform: x_post = raw_w - x_raw, y_post = raw_h - y_raw.
|
|
470
542
|
# In top-down: top = y_bot_raw, bottom = y_top_raw.
|
|
471
543
|
new_x0 = raw_w - x1
|
|
472
544
|
new_x1 = raw_w - x0
|
|
473
|
-
new_top = y_bot # bottom
|
|
474
|
-
new_bottom = y_top # top
|
|
545
|
+
new_top = y_bot # raw bottom → td top (high)
|
|
546
|
+
new_bottom = y_top # raw top → td bottom (low)
|
|
475
547
|
new_ox = raw_w - origin_x
|
|
476
|
-
|
|
477
|
-
#
|
|
548
|
+
# Origin in top-down post-180°: the y axis is already flipped by
|
|
549
|
+
# the rotation, so origin_y carries over unchanged.
|
|
478
550
|
new_oy = origin_y
|
|
479
551
|
[new_x0, new_x1, new_top, new_bottom, new_ox, new_oy]
|
|
480
552
|
|
|
481
553
|
when 270
|
|
482
|
-
# 270° CW (= 90° CCW).
|
|
483
|
-
#
|
|
554
|
+
# 270° CW (= 90° CCW). Post-rotation dimensions: w=raw_h, h=raw_w.
|
|
555
|
+
# Transform: x_post = raw_h - y_raw, y_post = x_raw (bottom-up).
|
|
484
556
|
# In top-down: top = raw_w - x_max_raw, bottom = raw_w - x_min_raw.
|
|
485
|
-
new_x0 = raw_h - y_top #
|
|
557
|
+
new_x0 = raw_h - y_top # large y → small x_post
|
|
486
558
|
new_x1 = raw_h - y_bot
|
|
487
559
|
new_top = raw_w - x1
|
|
488
560
|
new_bottom = raw_w - x0
|
|
@@ -491,22 +563,22 @@ module Rpdfium
|
|
|
491
563
|
[new_x0, new_x1, new_top, new_bottom, new_ox, new_oy]
|
|
492
564
|
|
|
493
565
|
else
|
|
494
|
-
#
|
|
495
|
-
#
|
|
496
|
-
#
|
|
566
|
+
# Non-standard rotation (not a multiple of 90°): fall back to
|
|
567
|
+
# the pre-rotation behavior. This should never happen for
|
|
568
|
+
# well-formed PDFs.
|
|
497
569
|
[x0, x1, raw_h - y_top, raw_h - y_bot,
|
|
498
570
|
origin_x, raw_h - origin_y]
|
|
499
571
|
end
|
|
500
572
|
end
|
|
501
573
|
|
|
502
|
-
# Cache lookup
|
|
574
|
+
# Cache lookup for a text object. Returns a tuple:
|
|
503
575
|
# [render_mode, font_handle, font_size, ends_with_space]
|
|
504
576
|
#
|
|
505
|
-
# `ends_with_space`
|
|
506
|
-
#
|
|
507
|
-
#
|
|
508
|
-
#
|
|
509
|
-
#
|
|
577
|
+
# `ends_with_space` indicates whether the text of the entire text object
|
|
578
|
+
# ends with a space (a "token end" signal declared by the PDF). It is a
|
|
579
|
+
# property of the object, not of the single char, so it can be computed
|
|
580
|
+
# once and cached together with the other fields — this avoids one
|
|
581
|
+
# FPDFTextObj_GetText call for every char that shares the obj.
|
|
510
582
|
def fetch_text_obj_info(text_obj, tp, cache, fs_buf:, text_buf:)
|
|
511
583
|
return [nil, nil, nil, nil] if text_obj.nil? || text_obj.null?
|
|
512
584
|
|
|
@@ -522,18 +594,24 @@ module Rpdfium
|
|
|
522
594
|
end
|
|
523
595
|
|
|
524
596
|
obj_text = read_text_obj_text_fast(text_obj, tp, text_buf)
|
|
525
|
-
ends_with_space = obj_text&.end_with?(
|
|
597
|
+
ends_with_space = obj_text&.end_with?(' ')
|
|
526
598
|
|
|
527
599
|
tuple = [rm, font_handle, font_size, ends_with_space]
|
|
528
600
|
cache[addr] = tuple
|
|
529
601
|
tuple
|
|
530
602
|
end
|
|
531
603
|
|
|
532
|
-
#
|
|
533
|
-
#
|
|
534
|
-
#
|
|
535
|
-
#
|
|
536
|
-
#
|
|
604
|
+
# Reads the text of a PDF text object, reusing the caller-provided
|
|
605
|
+
# buffer instead of allocating one per call.
|
|
606
|
+
#
|
|
607
|
+
# C signature: `unsigned long FPDFTextObj_GetText(FPDF_PAGEOBJECT,
|
|
608
|
+
# FPDF_TEXTPAGE, FPDF_WCHAR* buffer, unsigned long length)` — length in
|
|
609
|
+
# BYTES, the return is the total number of bytes needed (including the
|
|
610
|
+
# null terminator), even if the buffer is too small.
|
|
611
|
+
#
|
|
612
|
+
# For 99% of text objs the initial 256-byte buffer is enough; in the
|
|
613
|
+
# rare case PDFium requires more space, a larger buffer is allocated on
|
|
614
|
+
# demand (rare path, OK to allocate).
|
|
537
615
|
def read_text_obj_text_fast(text_obj, tp, buf)
|
|
538
616
|
return nil if text_obj.nil? || text_obj.null?
|
|
539
617
|
|
|
@@ -542,7 +620,7 @@ module Rpdfium
|
|
|
542
620
|
return nil if needed < 2
|
|
543
621
|
|
|
544
622
|
if needed > TEXT_OBJ_INITIAL_BUF_BYTES
|
|
545
|
-
#
|
|
623
|
+
# Rare path: text obj with > 128 chars. Allocate a dedicated buffer.
|
|
546
624
|
big_buf = FFI::MemoryPointer.new(:uint8, needed)
|
|
547
625
|
needed = Raw.FPDFTextObj_GetText(text_obj, tp.handle, big_buf, needed)
|
|
548
626
|
return nil if needed < 2
|
|
@@ -550,23 +628,20 @@ module Rpdfium
|
|
|
550
628
|
payload_bytes = needed - 2
|
|
551
629
|
return nil if payload_bytes <= 0
|
|
552
630
|
|
|
553
|
-
return big_buf.read_bytes(payload_bytes)
|
|
554
|
-
.force_encoding("UTF-16LE")
|
|
555
|
-
.encode("UTF-8")
|
|
556
|
-
.delete("\u0000")
|
|
631
|
+
return decode_utf16le(big_buf.read_bytes(payload_bytes))
|
|
557
632
|
end
|
|
558
633
|
|
|
559
634
|
payload_bytes = needed - 2
|
|
560
635
|
return nil if payload_bytes <= 0
|
|
561
636
|
|
|
562
|
-
buf.read_bytes(payload_bytes)
|
|
563
|
-
.force_encoding("UTF-16LE")
|
|
564
|
-
.encode("UTF-8")
|
|
565
|
-
.delete("\u0000")
|
|
637
|
+
decode_utf16le(buf.read_bytes(payload_bytes))
|
|
566
638
|
end
|
|
567
639
|
|
|
568
|
-
#
|
|
569
|
-
#
|
|
640
|
+
# Computes the glyph advance in page coordinates for a specific char.
|
|
641
|
+
# Formula: glyph_width(font, codepoint, font_size) × |CTM.a|. Reuses the
|
|
642
|
+
# caller-provided gw_buf and matrix instead of allocating per char.
|
|
643
|
+
# Returns nil if the advance is not computable (font unavailable, or
|
|
644
|
+
# PDFium build without FPDFFont_GetGlyphWidth).
|
|
570
645
|
def compute_glyph_advance_fast(font, codepoint, font_size, tp_handle,
|
|
571
646
|
char_index, gw_buf, matrix)
|
|
572
647
|
return nil if font.nil? || font_size.nil?
|
|
@@ -580,7 +655,7 @@ module Rpdfium
|
|
|
580
655
|
|
|
581
656
|
glyph_w_font_units = gw_buf.read_float
|
|
582
657
|
|
|
583
|
-
# CTM scale:
|
|
658
|
+
# CTM scale: reuse the matrix in-place.
|
|
584
659
|
scale = if Raw.FPDFText_GetMatrix(tp_handle, char_index, matrix) == 1
|
|
585
660
|
matrix[:a].abs
|
|
586
661
|
else
|
|
@@ -589,148 +664,101 @@ module Rpdfium
|
|
|
589
664
|
glyph_w_font_units * scale
|
|
590
665
|
end
|
|
591
666
|
|
|
592
|
-
#
|
|
593
|
-
#
|
|
594
|
-
#
|
|
595
|
-
# fetch
|
|
667
|
+
# Initial buffer size for FPDFTextObj_GetText: 256 bytes = 128 UTF-16 chars.
|
|
668
|
+
# Empirically sufficient for ~99% of real text objects (single words or
|
|
669
|
+
# short phrases). When a text obj is larger, we fall back to the correct
|
|
670
|
+
# probe-then-fetch.
|
|
596
671
|
TEXT_OBJ_INITIAL_BUF_BYTES = 256
|
|
597
672
|
|
|
598
|
-
# Legge il testo di un text object PDF.
|
|
599
|
-
#
|
|
600
|
-
# Firma C: `unsigned long FPDFTextObj_GetText(FPDF_PAGEOBJECT, FPDF_TEXTPAGE,
|
|
601
|
-
# FPDF_WCHAR* buffer, unsigned long length)` — length in BYTE, return è
|
|
602
|
-
# il numero di byte totali necessari (incluso null terminator), anche se
|
|
603
|
-
# il buffer è troppo piccolo. Pattern: prova con buffer stack-friendly,
|
|
604
|
-
# se PDFium ne richiede di più rialloca.
|
|
605
|
-
def read_text_obj_text_from(text_obj, tp, _char_index_unused = nil)
|
|
606
|
-
return nil if text_obj.nil? || text_obj.null?
|
|
607
|
-
|
|
608
|
-
# Prima tentativo: buffer fisso da 256 byte. Risolve il 99% dei casi.
|
|
609
|
-
buf = FFI::MemoryPointer.new(:uint8, TEXT_OBJ_INITIAL_BUF_BYTES)
|
|
610
|
-
needed = Raw.FPDFTextObj_GetText(text_obj, tp.handle, buf,
|
|
611
|
-
TEXT_OBJ_INITIAL_BUF_BYTES)
|
|
612
|
-
return nil if needed < 2
|
|
613
|
-
|
|
614
|
-
# Se PDFium ne vuole più di quanto allocato, rialloca esatto.
|
|
615
|
-
if needed > TEXT_OBJ_INITIAL_BUF_BYTES
|
|
616
|
-
buf = FFI::MemoryPointer.new(:uint8, needed)
|
|
617
|
-
needed = Raw.FPDFTextObj_GetText(text_obj, tp.handle, buf, needed)
|
|
618
|
-
return nil if needed < 2
|
|
619
|
-
end
|
|
620
|
-
|
|
621
|
-
# Clamp difensivo: non leggo mai più di quanto allocato.
|
|
622
|
-
buf_capacity = buf.size
|
|
623
|
-
payload_bytes = [needed - 2, buf_capacity - 2].min
|
|
624
|
-
return nil if payload_bytes <= 0
|
|
625
|
-
|
|
626
|
-
buf.read_bytes(payload_bytes)
|
|
627
|
-
.force_encoding("UTF-16LE")
|
|
628
|
-
.encode("UTF-8")
|
|
629
|
-
.delete("\u0000")
|
|
630
|
-
end
|
|
631
|
-
|
|
632
|
-
# Calcola l'advance del glifo in coordinate pagina, per un char
|
|
633
|
-
# specifico identificato da (text_page, char_index).
|
|
634
|
-
# Formula: glyph_width(font, codepoint, font_size) × |CTM.a|.
|
|
635
|
-
# Ritorna nil se l'advance non è calcolabile (font non disponibile,
|
|
636
|
-
# PDFium che non supporta l'API).
|
|
637
|
-
def compute_glyph_advance(font, codepoint, font_size, tp, char_index)
|
|
638
|
-
return nil if font.nil? || font_size.nil?
|
|
639
|
-
|
|
640
|
-
gw_buf = FFI::MemoryPointer.new(:float)
|
|
641
|
-
ok = begin
|
|
642
|
-
Raw.FPDFFont_GetGlyphWidth(font, codepoint, font_size, gw_buf)
|
|
643
|
-
rescue Rpdfium::LoadError
|
|
644
|
-
return nil # FPDFFont_GetGlyphWidth non disponibile in build vecchi
|
|
645
|
-
end
|
|
646
|
-
return nil if ok == 0
|
|
647
|
-
|
|
648
|
-
glyph_w_font_units = gw_buf.read_float
|
|
649
|
-
scale = char_ctm_scale_x(tp, char_index) || 1.0
|
|
650
|
-
glyph_w_font_units * scale
|
|
651
|
-
end
|
|
652
|
-
|
|
653
|
-
# Calcola la scala orizzontale del CTM per un char specifico.
|
|
654
|
-
def char_ctm_scale_x(tp, char_index)
|
|
655
|
-
mat = Raw::FS_MATRIX.new
|
|
656
|
-
return nil if Raw.FPDFText_GetMatrix(tp.handle, char_index, mat) == 0
|
|
657
|
-
|
|
658
|
-
mat[:a].abs
|
|
659
|
-
end
|
|
660
|
-
|
|
661
673
|
# ===== Form-aware extraction =====
|
|
662
674
|
#
|
|
663
|
-
#
|
|
664
|
-
#
|
|
665
|
-
#
|
|
666
|
-
#
|
|
667
|
-
#
|
|
675
|
+
# "Filled form" PDFs (F24, Comunicazione IVA, 770, etc.) are output PDFs
|
|
676
|
+
# where the pre-printed template and the entered values coexist as
|
|
677
|
+
# graphical text — no AcroForm, no PDF/UA tag. The geometric table
|
|
678
|
+
# extraction pipeline sees the whole form and produces noise (template
|
|
679
|
+
# labels mixed in with the data).
|
|
668
680
|
#
|
|
669
|
-
#
|
|
670
|
-
#
|
|
671
|
-
# (
|
|
672
|
-
#
|
|
673
|
-
#
|
|
681
|
+
# The robust strategy on these PDFs is to separate the chars by "role"
|
|
682
|
+
# using font/height, which typically differ between the template
|
|
683
|
+
# (proportional fonts, various sizes) and the data entered by the
|
|
684
|
+
# business software (a single font, typically Courier or Helvetica,
|
|
685
|
+
# a single size).
|
|
674
686
|
#
|
|
675
|
-
#
|
|
687
|
+
# Classic F24 example:
|
|
676
688
|
# Template: Futura-Light, Futura-Bold, Futura-Heavy, Times-Bold
|
|
677
|
-
#
|
|
689
|
+
# Data: Courier 10.0
|
|
678
690
|
#
|
|
679
|
-
# page.font_inventory # →
|
|
691
|
+
# page.font_inventory # → sees all the (font, height)
|
|
680
692
|
# page.chars_where(font: /Courier/i)
|
|
681
|
-
# # →
|
|
682
|
-
# page.lines(font: /Courier/i) # →
|
|
693
|
+
# # → only the chars of the entered data
|
|
694
|
+
# page.lines(font: /Courier/i) # → data text line by line
|
|
683
695
|
|
|
684
|
-
#
|
|
696
|
+
# Distribution of chars by (font, visual height, weight).
|
|
685
697
|
#
|
|
686
|
-
#
|
|
698
|
+
# Returns an Array of Hash sorted by descending count:
|
|
687
699
|
# [{ font:, height:, weight:, count:, sample: }, ...]
|
|
688
700
|
#
|
|
689
|
-
# `height`
|
|
690
|
-
#
|
|
691
|
-
#
|
|
692
|
-
#
|
|
701
|
+
# `height` is the visual height of the char in points (bottom - top),
|
|
702
|
+
# more reliable than `fontsize`, which PDFium normalizes to 1.0 when the
|
|
703
|
+
# real size is in the CTM matrix (a common case on forms generated with
|
|
704
|
+
# scaling).
|
|
693
705
|
#
|
|
694
|
-
# `sample`
|
|
706
|
+
# `sample` is the first 40 chars of that group, in document order, for
|
|
707
|
+
# inspection.
|
|
695
708
|
#
|
|
696
|
-
#
|
|
697
|
-
#
|
|
698
|
-
#
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
709
|
+
# Heights are bucketed within `height_tolerance` (single-linkage, per
|
|
710
|
+
# font+weight) rather than rounded to a fixed precision. A round glyph
|
|
711
|
+
# whose loose box overshoots the cap line by a fraction of a point
|
|
712
|
+
# ("O", "S", "C"...) would otherwise land in a spurious one-glyph group
|
|
713
|
+
# (e.g. "O" at h=6.6 split off from the rest of the line at h=6.5,
|
|
714
|
+
# producing garbled samples like "CDICE FISCALE" with every "O"
|
|
715
|
+
# missing). Clustering keeps each logical size in a single group.
|
|
716
|
+
#
|
|
717
|
+
# Use it to choose the `chars_where` filter: typically the font with the
|
|
718
|
+
# most chars is the template, and the minority fonts (a single size,
|
|
719
|
+
# often monospace) are the data.
|
|
720
|
+
def font_inventory(height_tolerance: 0.5)
|
|
721
|
+
real = chars.reject { |c| c[:generated] }
|
|
722
|
+
# Tag with document position so the cluster (which gets reordered by
|
|
723
|
+
# height) can be put back in reading order for the sample.
|
|
724
|
+
indexed = real.each_with_index.to_a
|
|
725
|
+
|
|
726
|
+
by_font_weight = indexed.group_by { |(c, _i)| [c[:font], c[:weight]] }
|
|
727
|
+
|
|
728
|
+
by_font_weight.flat_map do |(font, weight), pairs|
|
|
729
|
+
height_of = ->(p) { p[0][:bottom] - p[0][:top] }
|
|
730
|
+
Util::Cluster.cluster_objects(pairs, height_of, tolerance: height_tolerance).map do |cluster|
|
|
731
|
+
mean_h = cluster.sum { |p| height_of.call(p) } / cluster.size.to_f
|
|
732
|
+
ordered = cluster.sort_by { |(_c, i)| i }
|
|
733
|
+
{
|
|
734
|
+
font: font,
|
|
735
|
+
height: mean_h.round(1),
|
|
736
|
+
weight: weight,
|
|
737
|
+
count: cluster.size,
|
|
738
|
+
sample: ordered.first(40).map { |(c, _i)| c[:char] }.join
|
|
739
|
+
}
|
|
740
|
+
end
|
|
712
741
|
end.sort_by { |g| -g[:count] }
|
|
713
742
|
end
|
|
714
743
|
|
|
715
|
-
#
|
|
716
|
-
#
|
|
744
|
+
# Generic char filter. Returns the chars that match ALL the specified
|
|
745
|
+
# predicates (intersection, not union).
|
|
717
746
|
#
|
|
718
|
-
#
|
|
719
|
-
# font: String
|
|
720
|
-
# height: Float (
|
|
721
|
-
# weight: Integer
|
|
722
|
-
# bbox: [left, top, right, bottom] in
|
|
723
|
-
# where: block
|
|
747
|
+
# Supported arguments:
|
|
748
|
+
# font: exact String, Array<String>, or Regexp
|
|
749
|
+
# height: Float (single value), Range, Array<Float>
|
|
750
|
+
# weight: Integer or Range
|
|
751
|
+
# bbox: [left, top, right, bottom] in the page's top-down coords
|
|
752
|
+
# where: block that receives the char hash, must return truthy
|
|
724
753
|
#
|
|
725
|
-
#
|
|
726
|
-
# in AND.
|
|
754
|
+
# All parameters are optional; the ones passed are combined with AND.
|
|
727
755
|
#
|
|
728
|
-
#
|
|
756
|
+
# Typically combined with WordExtractor to extract "clean" text:
|
|
729
757
|
#
|
|
730
758
|
# data_chars = page.chars_where(font: /Courier/i)
|
|
731
759
|
# words = Rpdfium::Util::WordExtractor.new.extract_words(data_chars)
|
|
732
760
|
#
|
|
733
|
-
#
|
|
761
|
+
# or used as a building block for custom pipelines.
|
|
734
762
|
def chars_where(font: nil, height: nil, weight: nil, bbox: nil, where: nil, **char_opts)
|
|
735
763
|
cs = chars(**char_opts)
|
|
736
764
|
|
|
@@ -749,29 +777,29 @@ module Rpdfium
|
|
|
749
777
|
end
|
|
750
778
|
end
|
|
751
779
|
|
|
752
|
-
#
|
|
753
|
-
#
|
|
754
|
-
# left-to-right).
|
|
755
|
-
#
|
|
780
|
+
# Groups the filtered chars into logical rows and returns an Array of
|
|
781
|
+
# strings (one per row, top-to-bottom, chars within the row
|
|
782
|
+
# left-to-right). Convenient when the PDF is a filled form and you
|
|
783
|
+
# want only the entered values as clean rows.
|
|
756
784
|
#
|
|
757
|
-
#
|
|
785
|
+
# F24 example:
|
|
758
786
|
#
|
|
759
787
|
# page.lines(font: /Courier/i)
|
|
760
|
-
# # => ["Soggetto:
|
|
761
|
-
# # "0 2
|
|
762
|
-
# # "
|
|
788
|
+
# # => ["Soggetto: Azienda S.R.L. ( 01234567890 )",
|
|
789
|
+
# # "0 1 2 3 4 5 6 7 8 9 0",
|
|
790
|
+
# # "Azienda S.R.L.",
|
|
763
791
|
# # "1001 11 2021 499,81 0,00",
|
|
764
792
|
# # "1712 12 2021 32,46 0,00",
|
|
765
793
|
# # "1701 11 2021 0,00 295,89",
|
|
766
794
|
# # "532,27 295,89 236,38",
|
|
767
795
|
# # ...]
|
|
768
796
|
#
|
|
769
|
-
#
|
|
770
|
-
# `x_tolerance`
|
|
797
|
+
# The filter parameters are the same as `chars_where`. The
|
|
798
|
+
# `x_tolerance` and `y_tolerance` parameters control the WordExtractor.
|
|
771
799
|
#
|
|
772
|
-
#
|
|
773
|
-
#
|
|
774
|
-
def lines(x_tolerance: 3.0, y_tolerance: 3.0, separator:
|
|
800
|
+
# The inter-word separator is two spaces (for readability on forms with
|
|
801
|
+
# spaced fields); change it with `separator:`.
|
|
802
|
+
def lines(x_tolerance: 3.0, y_tolerance: 3.0, separator: ' ',
|
|
775
803
|
font: nil, height: nil, weight: nil, bbox: nil, where: nil,
|
|
776
804
|
**char_opts)
|
|
777
805
|
cs = chars_where(font: font, height: height, weight: weight,
|
|
@@ -779,44 +807,44 @@ module Rpdfium
|
|
|
779
807
|
return [] if cs.empty?
|
|
780
808
|
|
|
781
809
|
we = Util::WordExtractor.new(x_tolerance: x_tolerance,
|
|
782
|
-
|
|
810
|
+
y_tolerance: y_tolerance)
|
|
783
811
|
words = we.extract_words(cs)
|
|
784
812
|
return [] if words.empty?
|
|
785
813
|
|
|
786
|
-
# Cluster
|
|
814
|
+
# Cluster by top (with tolerance), then sort by x0 within the row
|
|
787
815
|
rows = Util::Cluster.cluster_objects(words, :top, tolerance: y_tolerance)
|
|
788
816
|
rows.map do |row_words|
|
|
789
817
|
row_words.sort_by { |w| w[:x0] }.map { |w| w[:text] }.join(separator)
|
|
790
818
|
end
|
|
791
819
|
end
|
|
792
820
|
|
|
793
|
-
#
|
|
794
|
-
#
|
|
795
|
-
# template
|
|
821
|
+
# Associates the template's semantic labels with the values entered on
|
|
822
|
+
# the page. For filled forms (F24, Comunicazione IVA, 770, etc.) where
|
|
823
|
+
# the template and the data are both static text but in different fonts.
|
|
796
824
|
#
|
|
797
|
-
# @param data_font [String, Regexp, Array] font
|
|
798
|
-
#
|
|
799
|
-
#
|
|
800
|
-
#
|
|
801
|
-
#
|
|
802
|
-
# template
|
|
825
|
+
# @param data_font [String, Regexp, Array] font of the entered "data"
|
|
826
|
+
# layer. Typically Courier (F24, 770) or Helvetica (Comunicazione IVA).
|
|
827
|
+
# See `Page#font_inventory` to identify it.
|
|
828
|
+
# Associates the template's semantic labels with the values entered on
|
|
829
|
+
# the page. A primitive for structured extraction from filled forms
|
|
830
|
+
# where template and data coexist as graphical text in different fonts.
|
|
803
831
|
#
|
|
804
|
-
# **
|
|
805
|
-
# output
|
|
806
|
-
# `Util::ColumnInference`,
|
|
807
|
-
#
|
|
832
|
+
# **For advanced cases** (repetitive tables, merging of multi-cell
|
|
833
|
+
# words, structured output) compose with `Util::WordMerger`,
|
|
834
|
+
# `Util::ColumnInference`, and configure the `Util::LabelMatcher`
|
|
835
|
+
# appropriately — see the examples in the docs.
|
|
808
836
|
#
|
|
809
|
-
# @param data_font [String, Regexp, Array] font
|
|
810
|
-
# @param template_font [String, Regexp, Array, nil] font
|
|
811
|
-
# "template".
|
|
812
|
-
# @param data_filter [Proc, nil]
|
|
813
|
-
# @param matcher [LabelMatcher, nil]
|
|
814
|
-
#
|
|
815
|
-
# @param x_tolerance, y_tolerance [Float]
|
|
816
|
-
# @param char_opts [Hash] kwargs
|
|
817
|
-
# false`
|
|
837
|
+
# @param data_font [String, Regexp, Array] font of the "data" layer.
|
|
838
|
+
# @param template_font [String, Regexp, Array, nil] font of the
|
|
839
|
+
# "template" layer. If nil, uses all chars that are NOT in `data_font`.
|
|
840
|
+
# @param data_filter [Proc, nil] optional filter on the value text.
|
|
841
|
+
# @param matcher [LabelMatcher, nil] preconfigured instance. If nil,
|
|
842
|
+
# creates one with the defaults.
|
|
843
|
+
# @param x_tolerance, y_tolerance [Float] tolerances for WordExtractor.
|
|
844
|
+
# @param char_opts [Hash] kwargs passed to `#chars` (e.g. `inject_spaces:
|
|
845
|
+
# false` for box-based forms).
|
|
818
846
|
#
|
|
819
|
-
# @return [Array<Hash>]
|
|
847
|
+
# @return [Array<Hash>] one per value:
|
|
820
848
|
# { value:, labels: { col:, row: }, geometry: {...} }
|
|
821
849
|
def label_value_pairs(data_font:, template_font: nil,
|
|
822
850
|
data_filter: nil, matcher: nil,
|
|
@@ -848,14 +876,14 @@ module Rpdfium
|
|
|
848
876
|
cs = chars(**char_opts)
|
|
849
877
|
return [] if cs.empty?
|
|
850
878
|
|
|
851
|
-
#
|
|
879
|
+
# Group into rows by y
|
|
852
880
|
rows = group_consecutive(cs.sort_by { |c| [c[:top], c[:x0]] }) do |a, b|
|
|
853
881
|
(a[:top] - b[:top]).abs <= y_tolerance
|
|
854
882
|
end
|
|
855
883
|
|
|
856
884
|
rows.flat_map do |row|
|
|
857
885
|
sorted = row.sort_by { |c| c[:x0] }
|
|
858
|
-
#
|
|
886
|
+
# Split on gap > x_tolerance or explicit space
|
|
859
887
|
word_groups = []
|
|
860
888
|
buf = []
|
|
861
889
|
sorted.each do |c|
|
|
@@ -875,44 +903,41 @@ module Rpdfium
|
|
|
875
903
|
end
|
|
876
904
|
end
|
|
877
905
|
|
|
878
|
-
# =====
|
|
906
|
+
# ===== Vector lines (REAL path segments) =====
|
|
879
907
|
|
|
880
|
-
#
|
|
881
|
-
#
|
|
882
|
-
# :x0,:y0,:x1,:y1
|
|
883
|
-
# :stroke_width
|
|
884
|
-
# :horizontal/:vertical
|
|
908
|
+
# Extracts all the line segments (LINETO) of the path objects.
|
|
909
|
+
# Returns Array<Hash>:
|
|
910
|
+
# :x0,:y0,:x1,:y1 endpoints (top-down)
|
|
911
|
+
# :stroke_width stroke width
|
|
912
|
+
# :horizontal/:vertical derived for convenience
|
|
885
913
|
#
|
|
886
|
-
#
|
|
887
|
-
#
|
|
888
|
-
# (
|
|
914
|
+
# For tables, mainly the "pure" horizontal and vertical segments are of
|
|
915
|
+
# interest. Beziers and oblique segments are ignored by default
|
|
916
|
+
# (pass `include_curves: true` to get them as the bbox of their points).
|
|
889
917
|
#
|
|
890
|
-
#
|
|
891
|
-
#
|
|
892
|
-
#
|
|
893
|
-
#
|
|
894
|
-
#
|
|
918
|
+
# Descends recursively into Form XObjects applying their transformation
|
|
919
|
+
# matrix. Many PDFs (TeamSystem, Zucchetti, Excel templates) encapsulate
|
|
920
|
+
# the entire page in a Form XObject — without the descent, we would see
|
|
921
|
+
# zero lines here even though the page is visually full of
|
|
922
|
+
# borders/separators. Behavior aligned with pdfminer.six (and therefore
|
|
895
923
|
# pdfplumber).
|
|
896
|
-
# `include_curves` true:
|
|
897
|
-
# `include_dashed` true:
|
|
898
|
-
# Default: false.
|
|
899
|
-
#
|
|
900
|
-
#
|
|
924
|
+
# `include_curves` true: includes Beziers as segments (with the :curve flag).
|
|
925
|
+
# `include_dashed` true: includes dashed lines (with the :dashed flag).
|
|
926
|
+
# Default: false. Dashed lines are often non-visual "guides" in print
|
|
927
|
+
# templates and confuse table cell detection. Those who want them
|
|
928
|
+
# explicitly (e.g. full drawing extraction) pass true.
|
|
901
929
|
def line_segments(include_curves: false, include_dashed: false)
|
|
902
|
-
# Cache
|
|
903
|
-
#
|
|
904
|
-
#
|
|
905
|
-
#
|
|
930
|
+
# Cache by parameters: line_segments is typically called twice per
|
|
931
|
+
# page (by horizontal_lines AND by vertical_lines), and iterates all
|
|
932
|
+
# the path objects of the page via FFI — expensive on PDFs with rich
|
|
933
|
+
# graphics (e.g. CR Banca d'Italia: ~500-1000 path objs per page).
|
|
906
934
|
cache_key = [include_curves, include_dashed]
|
|
907
935
|
@line_segments_cache ||= {}
|
|
908
936
|
return @line_segments_cache[cache_key] if @line_segments_cache.key?(cache_key)
|
|
909
937
|
|
|
910
938
|
out = []
|
|
911
939
|
page_rotation = rotation
|
|
912
|
-
raw_w, raw_h =
|
|
913
|
-
when 90, 270 then [height, width]
|
|
914
|
-
else [width, height]
|
|
915
|
-
end
|
|
940
|
+
raw_w, raw_h = rotated_dimensions(page_rotation)
|
|
916
941
|
ctx = { rotation: page_rotation, raw_w: raw_w, raw_h: raw_h }
|
|
917
942
|
collect_line_segments(@state[:handle], identity_matrix, ctx,
|
|
918
943
|
include_curves, out, page_object: false)
|
|
@@ -935,15 +960,15 @@ module Rpdfium
|
|
|
935
960
|
end
|
|
936
961
|
end
|
|
937
962
|
|
|
938
|
-
#
|
|
963
|
+
# Identity matrix in PDF space: [1, 0, 0, 1, 0, 0]
|
|
939
964
|
# (a, b, c, d, e, f) → (x', y') = (a*x + c*y + e, b*x + d*y + f)
|
|
940
965
|
def identity_matrix
|
|
941
966
|
{ a: 1.0, b: 0.0, c: 0.0, d: 1.0, e: 0.0, f: 0.0 }
|
|
942
967
|
end
|
|
943
968
|
|
|
944
|
-
#
|
|
945
|
-
#
|
|
946
|
-
#
|
|
969
|
+
# Composes two PDF affine transforms: applies `child` BEFORE `parent`
|
|
970
|
+
# in PDF space (pdfminer.six "apply_matrix_norm" notation).
|
|
971
|
+
# Equivalent to: result = parent * child (col-major).
|
|
947
972
|
def compose_matrix(parent, child)
|
|
948
973
|
{
|
|
949
974
|
a: parent[:a] * child[:a] + parent[:c] * child[:b],
|
|
@@ -968,10 +993,10 @@ module Rpdfium
|
|
|
968
993
|
e: mat[:e], f: mat[:f] }
|
|
969
994
|
end
|
|
970
995
|
|
|
971
|
-
#
|
|
972
|
-
#
|
|
973
|
-
# FPDF_PAGEOBJECT
|
|
974
|
-
# form xobject.
|
|
996
|
+
# Iterates the objects of a page or of a Form XObject, recursively
|
|
997
|
+
# applying the transformation matrix. `parent` = handle (FPDF_PAGE at the
|
|
998
|
+
# root or FPDF_PAGEOBJECT for form xobjects). `page_object: true` if
|
|
999
|
+
# parent is a form xobject.
|
|
975
1000
|
def collect_line_segments(parent, ctm, rotation_ctx, include_curves, out, page_object:)
|
|
976
1001
|
n = if page_object
|
|
977
1002
|
Raw.FPDFFormObj_CountObjects(parent)
|
|
@@ -992,7 +1017,7 @@ module Rpdfium
|
|
|
992
1017
|
when Raw::PAGEOBJ_PATH
|
|
993
1018
|
extract_path_segments(obj, ctm, rotation_ctx, include_curves, out)
|
|
994
1019
|
when Raw::PAGEOBJ_FORM
|
|
995
|
-
#
|
|
1020
|
+
# Descend into the form xobject composing its matrix with the CTM
|
|
996
1021
|
child_ctm = compose_matrix(ctm, read_object_matrix(obj))
|
|
997
1022
|
collect_line_segments(obj, child_ctm, rotation_ctx, include_curves, out,
|
|
998
1023
|
page_object: true)
|
|
@@ -1052,11 +1077,11 @@ module Rpdfium
|
|
|
1052
1077
|
end
|
|
1053
1078
|
end
|
|
1054
1079
|
|
|
1055
|
-
# FPDFPageObj_GetIsActive:
|
|
1056
|
-
#
|
|
1057
|
-
#
|
|
1058
|
-
# Fallback:
|
|
1059
|
-
# (
|
|
1080
|
+
# FPDFPageObj_GetIsActive: returns true if the page object is marked
|
|
1081
|
+
# active (visible). On PDFs without Optional Content it is always-true;
|
|
1082
|
+
# on PDFs with disabled layers, some objs may be inactive.
|
|
1083
|
+
# Fallback: if the binding is missing or fails, we consider it active
|
|
1084
|
+
# (behavior equivalent to the pre-0.3.6 version).
|
|
1060
1085
|
def object_active?(obj)
|
|
1061
1086
|
active_buf = FFI::MemoryPointer.new(:int)
|
|
1062
1087
|
return true if Raw.FPDFPageObj_GetIsActive(obj, active_buf) == 0
|
|
@@ -1066,9 +1091,8 @@ module Rpdfium
|
|
|
1066
1091
|
true
|
|
1067
1092
|
end
|
|
1068
1093
|
|
|
1069
|
-
# FPDFPageObj_GetDashCount:
|
|
1070
|
-
#
|
|
1071
|
-
# alternati on/off).
|
|
1094
|
+
# FPDFPageObj_GetDashCount: number of elements in the dash array. 0 =
|
|
1095
|
+
# solid line, > 0 = dashed line (with N elements alternating on/off).
|
|
1072
1096
|
def read_dash_count(obj)
|
|
1073
1097
|
Raw.FPDFPageObj_GetDashCount(obj)
|
|
1074
1098
|
rescue Rpdfium::LoadError
|
|
@@ -1077,7 +1101,7 @@ module Rpdfium
|
|
|
1077
1101
|
|
|
1078
1102
|
public
|
|
1079
1103
|
|
|
1080
|
-
#
|
|
1104
|
+
# Horizontal lines: dy ~ 0 within tolerance
|
|
1081
1105
|
def horizontal_lines(tolerance: 0.5)
|
|
1082
1106
|
line_segments.select { |s| (s[:y0] - s[:y1]).abs <= tolerance }
|
|
1083
1107
|
.map { |s| { y: (s[:y0] + s[:y1]) / 2.0,
|
|
@@ -1086,7 +1110,7 @@ module Rpdfium
|
|
|
1086
1110
|
stroke_width: s[:stroke_width] } }
|
|
1087
1111
|
end
|
|
1088
1112
|
|
|
1089
|
-
#
|
|
1113
|
+
# Vertical lines: dx ~ 0 within tolerance
|
|
1090
1114
|
def vertical_lines(tolerance: 0.5)
|
|
1091
1115
|
line_segments.select { |s| (s[:x0] - s[:x1]).abs <= tolerance }
|
|
1092
1116
|
.map { |s| { x: (s[:x0] + s[:x1]) / 2.0,
|
|
@@ -1095,8 +1119,8 @@ module Rpdfium
|
|
|
1095
1119
|
stroke_width: s[:stroke_width] } }
|
|
1096
1120
|
end
|
|
1097
1121
|
|
|
1098
|
-
# Compat
|
|
1099
|
-
# rectangles
|
|
1122
|
+
# Compat with the first version: bbox of the path objects (useful for
|
|
1123
|
+
# rectangles drawn as thin borders).
|
|
1100
1124
|
def vector_rects
|
|
1101
1125
|
n = Raw.FPDFPage_CountObjects(@state[:handle])
|
|
1102
1126
|
h = height
|
|
@@ -1121,20 +1145,20 @@ module Rpdfium
|
|
|
1121
1145
|
|
|
1122
1146
|
# ===== Marked Content (PDF tagged) =====
|
|
1123
1147
|
|
|
1124
|
-
#
|
|
1125
|
-
# stream
|
|
1126
|
-
# Content ID).
|
|
1127
|
-
#
|
|
1128
|
-
#
|
|
1129
|
-
# tag
|
|
1148
|
+
# Iterates all the marked content of the page (BDC/BMC operators of the
|
|
1149
|
+
# PDF content stream) grouping the page objects by their mcid (Marked
|
|
1150
|
+
# Content ID). Useful for "tagged" PDFs (PDF/UA, exports from
|
|
1151
|
+
# Word/InDesign): an mcid ≥ 0 identifies a semantic unit (paragraph,
|
|
1152
|
+
# span, figure), and all the objects with the same mcid belong to the
|
|
1153
|
+
# same structure tag.
|
|
1130
1154
|
#
|
|
1131
|
-
#
|
|
1132
|
-
# mcid -1 (
|
|
1155
|
+
# Returns a Hash { mcid (Integer) => Array<page_object_handle> }.
|
|
1156
|
+
# mcid -1 (the page objects without marked content) is OMITTED.
|
|
1133
1157
|
#
|
|
1134
|
-
#
|
|
1135
|
-
#
|
|
1136
|
-
#
|
|
1137
|
-
#
|
|
1158
|
+
# On non-tagged PDFs (e.g. most PDFs from Italian business software)
|
|
1159
|
+
# the Hash is empty. On tagged PDFs it is the source of truth for
|
|
1160
|
+
# semantically grouping chars/words — more reliable than any geometric
|
|
1161
|
+
# heuristic.
|
|
1138
1162
|
def marked_content_regions
|
|
1139
1163
|
out = Hash.new { |h, k| h[k] = [] }
|
|
1140
1164
|
walk_page_objects do |obj, _ctm|
|
|
@@ -1144,9 +1168,9 @@ module Rpdfium
|
|
|
1144
1168
|
out
|
|
1145
1169
|
end
|
|
1146
1170
|
|
|
1147
|
-
#
|
|
1148
|
-
#
|
|
1149
|
-
#
|
|
1171
|
+
# Iterates all the marks (BMC/BDC operators) with their names and
|
|
1172
|
+
# parameters. Returns Array<Hash> with { obj_handle, mark_name, params }.
|
|
1173
|
+
# For tagged PDFs, the common mark_names are: "P" (paragraph),
|
|
1150
1174
|
# "Span", "Artifact", "Figure", "TR" (table row), "TD" (table cell).
|
|
1151
1175
|
def marked_content_inventory
|
|
1152
1176
|
out = []
|
|
@@ -1168,15 +1192,15 @@ module Rpdfium
|
|
|
1168
1192
|
|
|
1169
1193
|
# ===== Links (annotation links + positional hit-test) =====
|
|
1170
1194
|
|
|
1171
|
-
# Hit-test:
|
|
1172
|
-
# in
|
|
1173
|
-
#
|
|
1195
|
+
# Hit-test: returns the link annotation that contains the point (x, y)
|
|
1196
|
+
# in the page's top-down coordinates. Returns an Annotation instance
|
|
1197
|
+
# or nil.
|
|
1174
1198
|
#
|
|
1175
|
-
#
|
|
1176
|
-
# (
|
|
1177
|
-
#
|
|
1199
|
+
# More efficient than iterating `links` when starting from a coordinate
|
|
1200
|
+
# (e.g. mapping a click on the rendering → the link URL). pdfplumber has
|
|
1201
|
+
# no direct equivalent.
|
|
1178
1202
|
def link_at(x, y)
|
|
1179
|
-
# PDFium
|
|
1203
|
+
# PDFium uses bottom-up coords; convert
|
|
1180
1204
|
pdf_y = height - y
|
|
1181
1205
|
link_handle = Raw.FPDFLink_GetLinkAtPoint(@state[:handle],
|
|
1182
1206
|
x.to_f, pdf_y.to_f)
|
|
@@ -1185,9 +1209,9 @@ module Rpdfium
|
|
|
1185
1209
|
annot_handle = Raw.FPDFLink_GetAnnot(@state[:handle], link_handle)
|
|
1186
1210
|
return nil if annot_handle.null?
|
|
1187
1211
|
|
|
1188
|
-
# Annotation
|
|
1189
|
-
#
|
|
1190
|
-
# rect
|
|
1212
|
+
# Annotation requires an index in the page; we do not have it directly
|
|
1213
|
+
# here. We iterate the page's annotations and find the one with the
|
|
1214
|
+
# rect containing the point. For most PDFs this is O(small).
|
|
1191
1215
|
annotations.find { |a| a.subtype == :link && annotation_contains?(a, x, y) }
|
|
1192
1216
|
end
|
|
1193
1217
|
|
|
@@ -1206,19 +1230,19 @@ module Rpdfium
|
|
|
1206
1230
|
out
|
|
1207
1231
|
end
|
|
1208
1232
|
|
|
1209
|
-
# =====
|
|
1233
|
+
# ===== Annotations =====
|
|
1210
1234
|
|
|
1211
1235
|
def annotations
|
|
1212
1236
|
n = Raw.FPDFPage_GetAnnotCount(@state[:handle])
|
|
1213
1237
|
Array.new(n) { |i| Annotation.new(self, i) }
|
|
1214
1238
|
end
|
|
1215
1239
|
|
|
1216
|
-
#
|
|
1240
|
+
# Link annotations only (clickable, external or internal)
|
|
1217
1241
|
def links
|
|
1218
1242
|
annotations.select { |a| a.subtype == :link }
|
|
1219
1243
|
end
|
|
1220
1244
|
|
|
1221
|
-
#
|
|
1245
|
+
# Form widgets only
|
|
1222
1246
|
def form_fields
|
|
1223
1247
|
return [] unless @document.has_forms?
|
|
1224
1248
|
|
|
@@ -1228,25 +1252,25 @@ module Rpdfium
|
|
|
1228
1252
|
|
|
1229
1253
|
# ===== Struct Tree (PDF tagged) =====
|
|
1230
1254
|
|
|
1231
|
-
# Struct tree
|
|
1232
|
-
#
|
|
1233
|
-
#
|
|
1234
|
-
# (Document → P, H1, Table, TR, TH, TD, Figure,
|
|
1255
|
+
# Struct tree of the page (PDF/UA / Tagged PDF). Returns nil if the
|
|
1256
|
+
# page is not tagged. For PDFs from Word/LibreOffice/InDesign exports
|
|
1257
|
+
# with accessibility tags enabled, it exposes the logical structure
|
|
1258
|
+
# (Document → P, H1, Table, TR, TH, TD, Figure, etc.).
|
|
1235
1259
|
#
|
|
1236
|
-
#
|
|
1260
|
+
# Usage modes:
|
|
1237
1261
|
#
|
|
1238
|
-
# #
|
|
1262
|
+
# # Automatic lifecycle (RAII via finalizer):
|
|
1239
1263
|
# tree = page.struct_tree
|
|
1240
1264
|
# tree&.walk { |el| puts el.type }
|
|
1241
1265
|
#
|
|
1242
|
-
# #
|
|
1266
|
+
# # Deterministic lifecycle (close at end of block):
|
|
1243
1267
|
# page.struct_tree do |tree|
|
|
1244
1268
|
# tree.tables.each { |t| ... }
|
|
1245
1269
|
# end
|
|
1246
1270
|
#
|
|
1247
|
-
#
|
|
1248
|
-
# Banca d'Italia, StructTreeRoot
|
|
1249
|
-
#
|
|
1271
|
+
# On non-tagged PDFs it returns nil. On "tagged but empty" PDFs (e.g. CR
|
|
1272
|
+
# Banca d'Italia, StructTreeRoot present but with placeholder elements),
|
|
1273
|
+
# it returns a Tree with `Tree#empty? == true`.
|
|
1250
1274
|
def struct_tree
|
|
1251
1275
|
tree = Structure::Tree.for_page(self)
|
|
1252
1276
|
if block_given?
|
|
@@ -1262,9 +1286,10 @@ module Rpdfium
|
|
|
1262
1286
|
|
|
1263
1287
|
# ===== Rendering =====
|
|
1264
1288
|
|
|
1265
|
-
# Render a bitmap. `output`
|
|
1266
|
-
#
|
|
1267
|
-
#
|
|
1289
|
+
# Render to a bitmap. `output` can be :rgba (default), :bgra, :gray.
|
|
1290
|
+
# Returns [w, h, bytes, stride] where bytes is a binary string of stride * h bytes.
|
|
1291
|
+
# If include_forms is true and the document has forms, it overlays the
|
|
1292
|
+
# widgets.
|
|
1268
1293
|
def render(scale: 2.0, rotate: 0, output: :rgba,
|
|
1269
1294
|
include_annotations: false, include_forms: false,
|
|
1270
1295
|
background: 0xFFFFFFFF)
|
|
@@ -1276,7 +1301,7 @@ module Rpdfium
|
|
|
1276
1301
|
format = output == :gray ? Raw::FPDFBitmap_Gray : Raw::FPDFBitmap_BGRA
|
|
1277
1302
|
|
|
1278
1303
|
bitmap = Raw.FPDFBitmap_CreateEx(w, h, format, FFI::Pointer::NULL, 0)
|
|
1279
|
-
raise Error,
|
|
1304
|
+
raise Error, 'Bitmap allocation failed' if bitmap.null?
|
|
1280
1305
|
|
|
1281
1306
|
begin
|
|
1282
1307
|
Raw.FPDFBitmap_FillRect(bitmap, 0, 0, w, h, background)
|
|
@@ -1288,8 +1313,8 @@ module Rpdfium
|
|
|
1288
1313
|
end
|
|
1289
1314
|
stride = Raw.FPDFBitmap_GetStride(bitmap)
|
|
1290
1315
|
buf = Raw.FPDFBitmap_GetBuffer(bitmap)
|
|
1291
|
-
#
|
|
1292
|
-
# In BGRA
|
|
1316
|
+
# The stride may exceed w*bpp due to alignment padding.
|
|
1317
|
+
# In BGRA it is almost always w*4, but we respect it for safety.
|
|
1293
1318
|
bytes = buf.read_bytes(stride * h)
|
|
1294
1319
|
[w, h, bytes, stride]
|
|
1295
1320
|
ensure
|
|
@@ -1297,7 +1322,7 @@ module Rpdfium
|
|
|
1297
1322
|
end
|
|
1298
1323
|
end
|
|
1299
1324
|
|
|
1300
|
-
#
|
|
1325
|
+
# Direct rendering to a PNG file. Uses Rpdfium::IO::PNG (pure Ruby, zero deps).
|
|
1301
1326
|
def render_to_png(path, **opts)
|
|
1302
1327
|
w, h, bytes, stride = render(output: :rgba, **opts)
|
|
1303
1328
|
Rpdfium::IO::PNG.write(path, w, h, bytes, stride: stride)
|
|
@@ -1328,7 +1353,7 @@ module Rpdfium
|
|
|
1328
1353
|
|
|
1329
1354
|
private
|
|
1330
1355
|
|
|
1331
|
-
# Match helper
|
|
1356
|
+
# Match helper for the `font:` parameter of chars_where/lines.
|
|
1332
1357
|
def font_matches?(actual_font, pattern)
|
|
1333
1358
|
return false if actual_font.nil?
|
|
1334
1359
|
|
|
@@ -1340,9 +1365,9 @@ module Rpdfium
|
|
|
1340
1365
|
end
|
|
1341
1366
|
end
|
|
1342
1367
|
|
|
1343
|
-
# Match helper
|
|
1344
|
-
#
|
|
1345
|
-
# numeric
|
|
1368
|
+
# Match helper for numeric parameters (`height:`, `weight:`).
|
|
1369
|
+
# Accepts a single value, Range, or Array<Numeric>. For a single
|
|
1370
|
+
# numeric value it uses a 0.05 tolerance (useful for height in points).
|
|
1346
1371
|
def range_matches?(actual, spec)
|
|
1347
1372
|
return false if actual.nil?
|
|
1348
1373
|
|
|
@@ -1354,13 +1379,13 @@ module Rpdfium
|
|
|
1354
1379
|
end
|
|
1355
1380
|
end
|
|
1356
1381
|
|
|
1357
|
-
#
|
|
1358
|
-
#
|
|
1359
|
-
#
|
|
1360
|
-
#
|
|
1361
|
-
#
|
|
1362
|
-
#
|
|
1363
|
-
#
|
|
1382
|
+
# Converts a PDFium box {left, bottom, right, top} in bottom-up coords
|
|
1383
|
+
# to the top-down tuple [x0, top, x1, bottom] used by the rest of the
|
|
1384
|
+
# library. Returns nil if the box is nil (box absent on the PDF).
|
|
1385
|
+
# Iterates all the page objects of the page recursively (descending
|
|
1386
|
+
# into Form XObjects), passing each (obj, current_ctm) to the block.
|
|
1387
|
+
# Same walk logic as collect_line_segments but abstracted — useful for
|
|
1388
|
+
# other obj-level operations (marked content, etc).
|
|
1364
1389
|
def walk_page_objects(handle = @state[:handle], ctm = identity_matrix,
|
|
1365
1390
|
is_form: false, &block)
|
|
1366
1391
|
n = is_form ? Raw.FPDFFormObj_CountObjects(handle) : Raw.FPDFPage_CountObjects(handle)
|
|
@@ -1399,16 +1424,13 @@ module Rpdfium
|
|
|
1399
1424
|
needed = out_len.read_ulong
|
|
1400
1425
|
return nil if needed < 2
|
|
1401
1426
|
|
|
1402
|
-
# Clamp:
|
|
1403
|
-
#
|
|
1404
|
-
# clamp → IndexError
|
|
1427
|
+
# Clamp: if needed exceeds the buffer, read only what was allocated
|
|
1428
|
+
# (and accept that the string is truncated: the case is pathological).
|
|
1429
|
+
# Without the clamp → IndexError on exceptionally long mark names.
|
|
1405
1430
|
payload_bytes = [needed - 2, buf_bytes - 2].min
|
|
1406
1431
|
return nil if payload_bytes <= 0
|
|
1407
1432
|
|
|
1408
|
-
name_buf.read_bytes(payload_bytes)
|
|
1409
|
-
.force_encoding("UTF-16LE")
|
|
1410
|
-
.encode("UTF-8")
|
|
1411
|
-
.delete("\u0000")
|
|
1433
|
+
decode_utf16le(name_buf.read_bytes(payload_bytes))
|
|
1412
1434
|
end
|
|
1413
1435
|
|
|
1414
1436
|
def read_mark_params(mark)
|
|
@@ -1418,7 +1440,7 @@ module Rpdfium
|
|
|
1418
1440
|
key = read_mark_param_key(mark, pi)
|
|
1419
1441
|
next if key.nil? || key.empty?
|
|
1420
1442
|
|
|
1421
|
-
#
|
|
1443
|
+
# Value type: 0=Null, 1=Int, 2=String, 3=Blob, 4=Dict (ignored)
|
|
1422
1444
|
type = Raw.FPDFPageObjMark_GetParamValueType(mark, key)
|
|
1423
1445
|
params[key] = case type
|
|
1424
1446
|
when 1 then read_mark_param_int(mark, key)
|
|
@@ -1442,10 +1464,7 @@ module Rpdfium
|
|
|
1442
1464
|
payload_bytes = [needed - 2, buf_bytes - 2].min
|
|
1443
1465
|
return nil if payload_bytes <= 0
|
|
1444
1466
|
|
|
1445
|
-
key_buf.read_bytes(payload_bytes)
|
|
1446
|
-
.force_encoding("UTF-16LE")
|
|
1447
|
-
.encode("UTF-8")
|
|
1448
|
-
.delete("\u0000")
|
|
1467
|
+
decode_utf16le(key_buf.read_bytes(payload_bytes))
|
|
1449
1468
|
end
|
|
1450
1469
|
|
|
1451
1470
|
def read_mark_param_int(mark, key)
|
|
@@ -1469,10 +1488,7 @@ module Rpdfium
|
|
|
1469
1488
|
payload_bytes = [needed - 2, buf_bytes - 2].min
|
|
1470
1489
|
return nil if payload_bytes <= 0
|
|
1471
1490
|
|
|
1472
|
-
val_buf.read_bytes(payload_bytes)
|
|
1473
|
-
.force_encoding("UTF-16LE")
|
|
1474
|
-
.encode("UTF-8")
|
|
1475
|
-
.delete("\u0000")
|
|
1491
|
+
decode_utf16le(val_buf.read_bytes(payload_bytes))
|
|
1476
1492
|
end
|
|
1477
1493
|
|
|
1478
1494
|
def annotation_contains?(annot, x, y)
|
|
@@ -1490,13 +1506,37 @@ module Rpdfium
|
|
|
1490
1506
|
box[:right], page_h - box[:bottom]]
|
|
1491
1507
|
end
|
|
1492
1508
|
|
|
1509
|
+
# Width/height pair of the page as seen once the PDF rotation is applied.
# A quarter turn (90 or 270) swaps the two sides; any other value leaves
# them in their natural order. Shared by the char and line-segment pipelines.
#
# @param rot [Integer, nil] rotation in degrees; defaults to this page's `rotation`
# @return [Array] `[height, width]` for 90/270, `[width, height]` otherwise
def rotated_dimensions(rot = rotation)
  quarter_turn = [90, 270].include?(rot)
  quarter_turn ? [height, width] : [width, height]
end
|
|
1517
|
+
|
|
1518
|
+
# Array#pack directive that emits one UTF-8 encoded character per integer.
CODEPOINT_CHAR = 'U'.freeze

# Turns a Unicode codepoint into a one-character UTF-8 string, yielding an
# empty string for anything that cannot be represented: NUL, values above
# U+10FFFF, the UTF-16 surrogate range U+D800..U+DFFF, and whatever makes
# `pack` raise RangeError/ArgumentError (e.g. negative numbers).
#
# @param cp [Integer] codepoint value
# @return [String] the character, or '' when the codepoint is unusable
def safe_codepoint(cp)
  if cp.zero? || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)
    ''
  else
    [cp].pack(CODEPOINT_CHAR)
  end
rescue RangeError, ArgumentError
  ''
end
|
|
1528
|
+
|
|
1529
|
+
# PDFium returns text as UTF-16LE byte buffers padded with NUL. This is the
# single decode path used by every text getter (page text, bounded text,
# text objects, marked-content names/params): it transcodes the bytes to
# UTF-8 and strips every NUL character.
#
# The input string is never modified, so frozen or shared buffers are safe
# to pass.
#
# @param bytes [String] raw UTF-16LE bytes (the string's own encoding tag is ignored)
# @param replace [Boolean] when true, invalid/undefined sequences are
#   replaced (U+FFFD) instead of raising — used for whole-page text where a
#   single bad glyph must not abort the extraction
# @return [String] UTF-8 text without NUL characters
# @raise [EncodingError] on invalid or undefined sequences when `replace` is false
def decode_utf16le(bytes, replace: false)
  opts = replace ? { invalid: :replace, undef: :replace } : {}

  # Naming the source encoding makes #encode read the bytes as UTF-16LE
  # without retagging the caller's string; force_encoding would mutate it
  # in place and raise FrozenError on frozen input.
  bytes.encode(Encoding::UTF_8, Encoding::UTF_16LE, **opts)
       .delete("\u0000")
end
|
|
1501
1541
|
|
|
1502
1542
|
def read_stroke_width(obj)
|
|
@@ -1506,10 +1546,10 @@ module Rpdfium
|
|
|
1506
1546
|
buf.read_float
|
|
1507
1547
|
end
|
|
1508
1548
|
|
|
1509
|
-
#
|
|
1510
|
-
#
|
|
1511
|
-
#
|
|
1512
|
-
#
|
|
1549
|
+
# Builds a segment from the pair of endpoints in the raw PDFium space
|
|
1550
|
+
# (bottom-up, pre-rotation). Applies the page rotation to return
|
|
1551
|
+
# top-down coords in the post-rotation system, consistent with the
|
|
1552
|
+
# system used by `chars`.
|
|
1513
1553
|
def build_segment(x0, y0, x1, y1, rotation_ctx, stroke_width, dashed: false)
|
|
1514
1554
|
r = rotation_ctx[:rotation]
|
|
1515
1555
|
raw_w = rotation_ctx[:raw_w]
|
|
@@ -1526,24 +1566,19 @@ module Rpdfium
|
|
|
1526
1566
|
}
|
|
1527
1567
|
end
|
|
1528
1568
|
|
|
1529
|
-
#
|
|
1530
|
-
#
|
|
1569
|
+
# Maps one point from the raw PDFium system (bottom-up, pre-rotation) to the
# page's top-down, post-rotation system.
#
# @param rotation [Integer, nil] page rotation in degrees
# @param raw_w [Numeric] page width before rotation
# @param raw_h [Numeric] page height before rotation
# @param x [Numeric] raw x coordinate
# @param y [Numeric] raw y coordinate
# @return [Array] `[x, y]` in the rotated top-down system
def apply_page_rotation_to_point(rotation, raw_w, raw_h, x, y)
  return [y, x] if rotation == 90                    # 90° CW
  return [raw_w - x, y] if rotation == 180
  return [raw_h - y, raw_w - x] if rotation == 270

  # 0, nil and any unrecognised value: plain bottom-up → top-down flip.
  [x, raw_h - y]
end
|
|
1545
1580
|
|
|
1546
|
-
#
|
|
1581
|
+
# Groups consecutive elements if a block considers them equivalent.
|
|
1547
1582
|
def group_consecutive(arr)
|
|
1548
1583
|
groups = []
|
|
1549
1584
|
current = []
|
|
@@ -1561,14 +1596,14 @@ module Rpdfium
|
|
|
1561
1596
|
|
|
1562
1597
|
# Builds a word hash from a sequence of char hashes.
#
# The text is the concatenation of every `:char`; the horizontal extent
# runs from the first char's `:x0` to the last char's `:x1`; the vertical
# extent is the smallest `:top` and the largest `:bottom`; `:fontsize` and
# `:font` are taken from the first char. The input array itself is stored
# under `:chars`.
#
# @param chars [Array<Hash>] char hashes, expected to be non-empty
# @return [Hash] word with keys :text, :x0, :x1, :top, :bottom, :fontsize, :font, :chars
def word_from_chars(chars)
  joined  = chars.map { |glyph| glyph[:char] }.join
  leading = chars.first
  closing = chars.last
  tops    = chars.map { |glyph| glyph[:top] }
  bottoms = chars.map { |glyph| glyph[:bottom] }

  {
    text:     joined,
    x0:       leading[:x0],
    x1:       closing[:x1],
    top:      tops.min,
    bottom:   bottoms.max,
    fontsize: leading[:fontsize],
    font:     leading[:font],
    chars:    chars
  }
end
|
|
1574
1609
|
|
|
@@ -1583,11 +1618,11 @@ module Rpdfium
|
|
|
1583
1618
|
end
|
|
1584
1619
|
end
|
|
1585
1620
|
|
|
1586
|
-
# Wrapper
|
|
1621
|
+
# Wrapper for FPDF_TEXTPAGE
|
|
1587
1622
|
class TextPage
|
|
1588
1623
|
def initialize(page)
|
|
1589
1624
|
handle = Raw.FPDFText_LoadPage(page.handle)
|
|
1590
|
-
raise PageError,
|
|
1625
|
+
raise PageError, 'Could not load text page' if handle.null?
|
|
1591
1626
|
|
|
1592
1627
|
@state = { handle: handle, closed: false }
|
|
1593
1628
|
ObjectSpace.define_finalizer(self, self.class.finalizer(@state))
|