rpdfium 0.4.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rpdfium/page.rb CHANGED
@@ -1,10 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Rpdfium
4
- # Wrapper di pagina. Lazy-load di TextPage. Tutte le coordinate restituite
5
- # sono nello spazio "top-down" della pagina: (0,0) è in alto a sinistra,
6
- # x cresce verso destra, y verso il basso. PDFium usa "bottom-up" — la
7
- # conversione avviene qui una volta sola.
4
+ # Page wrapper. Lazy-loads the TextPage. All returned coordinates are
5
+ # in the page's "top-down" space: (0,0) is at the top left, x grows
6
+ # toward the right, y toward the bottom. PDFium uses "bottom-up" — the
7
+ # conversion happens here once and for all.
8
8
  class Page
9
9
  attr_reader :document, :index
10
10
 
@@ -15,10 +15,10 @@ module Rpdfium
15
15
  raise PageError, "Could not load page #{index}" if handle.null?
16
16
 
17
17
  @text_page = nil
18
- # Stato condiviso col finalizer: idempotenza su close, sopravvive al GC
19
- # senza fare doppia chiamata FPDF_ClosePage. Tenere un riferimento a
20
- # @document garantisce che il Document non venga raccolto prima della
21
- # Page (FPDF_ClosePage richiede Document ancora vivo).
18
+ # State shared with the finalizer: idempotent on close, survives GC
19
+ # without making a double FPDF_ClosePage call. Holding a reference to
20
+ # @document guarantees that the Document is not collected before the
21
+ # Page (FPDF_ClosePage requires the Document still alive).
22
22
  @state = { handle: handle, closed: false }
23
23
  ObjectSpace.define_finalizer(self, self.class.finalizer(@state))
24
24
  end
@@ -37,12 +37,12 @@ module Rpdfium
37
37
  @state[:handle]
38
38
  end
39
39
 
40
- # ===== Geometria =====
40
+ # ===== Geometry =====
41
41
 
42
42
  def width; Raw.FPDF_GetPageWidthF(@state[:handle]); end
43
43
  def height; Raw.FPDF_GetPageHeightF(@state[:handle]); end
44
44
 
45
- # Rotazione in gradi: 0/90/180/270
45
+ # Rotation in degrees: 0/90/180/270
46
46
  def rotation
47
47
  [0, 90, 180, 270][Raw.FPDFPage_GetRotation(@state[:handle])] || 0
48
48
  end
@@ -53,10 +53,10 @@ module Rpdfium
53
53
 
54
54
  BOX_FUNCTIONS = {
55
55
  media: :FPDFPage_GetMediaBox,
56
- crop: :FPDFPage_GetCropBox,
56
+ crop: :FPDFPage_GetCropBox,
57
57
  bleed: :FPDFPage_GetBleedBox,
58
- trim: :FPDFPage_GetTrimBox,
59
- art: :FPDFPage_GetArtBox
58
+ trim: :FPDFPage_GetTrimBox,
59
+ art: :FPDFPage_GetArtBox
60
60
  }.freeze
61
61
 
62
62
  def box(kind = :crop)
@@ -71,19 +71,20 @@ module Rpdfium
71
71
  right: r.read_float, top: t.read_float }
72
72
  end
73
73
 
74
- # Accessor pdfplumber-compatibili. Restituiscono il box come tuple
75
- # [x0, top, x1, bottom] in coordinate top-down (lo stesso sistema
76
- # usato da chars, edges, table cells). Ritornano nil se il box non
77
- # è definito nel PDF (es. ArtBox o BleedBox sono spesso assenti).
74
+ # pdfplumber-compatible accessors. Return the box as the tuple
75
+ # [x0, top, x1, bottom] in top-down coordinates (the same system
76
+ # used by chars, edges, table cells). Return nil if the box is not
77
+ # defined in the PDF (e.g. ArtBox or BleedBox are often absent).
78
78
  #
79
- # Esempio d'uso:
80
- # crop = page.cropbox # → [0.0, 0.0, 595.28, 841.88] o nil
81
- # crop != [0, 0, page.width, page.height] # PDF ha un crop esplicito
79
+ # Usage example:
80
+ # crop = page.cropbox # → [0.0, 0.0, 595.28, 841.88] or nil
81
+ # crop != [0, 0, page.width, page.height] # PDF has an explicit crop
82
82
  def mediabox; box_to_topdown(box(:media)); end
83
83
 
84
- # PDF spec 14.11.2: se CropBox è assente, default è MediaBox. La cropbox è
85
- # l'area "visibile" della pagina; per PDF da gestionali coincide spesso
86
- # con la MediaBox. Pdfplumber fa il fallback automatico.
84
+ # PDF spec 14.11.2: if CropBox is absent, the default is MediaBox. The
85
+ # cropbox is the "visible" area of the page; for PDFs from business
86
+ # software it often coincides with the MediaBox. pdfplumber performs the
87
+ # fallback automatically.
87
88
  def cropbox
88
89
  box_to_topdown(box(:crop)) || mediabox
89
90
  end
@@ -92,112 +93,120 @@ module Rpdfium
92
93
  def trimbox; box_to_topdown(box(:trim)); end
93
94
  def artbox; box_to_topdown(box(:art)); end
94
95
 
95
- # ===== Testo (versione "semplice") =====
96
+ # ===== Text ("simple" version) =====
96
97
 
97
98
  def text
98
99
  tp = text_page
99
100
  n = tp.char_count
100
- return "" if n.zero?
101
+ return '' if n.zero?
101
102
 
102
103
  buf = FFI::MemoryPointer.new(:ushort, n + 1)
103
104
  Raw.FPDFText_GetText(tp.handle, 0, n, buf)
104
- buf.read_bytes((n + 1) * 2).force_encoding("UTF-16LE")
105
- .encode("UTF-8", invalid: :replace, undef: :replace)
106
- .delete("\x00")
105
+ decode_utf16le(buf.read_bytes((n + 1) * 2), replace: true)
107
106
  end
108
107
 
109
- # Estrae il testo dentro una bbox arbitraria (top-down coords).
110
- # Utile per "leggi l'intestazione di questa cella".
108
+ # Extracts the text inside an arbitrary bbox (top-down coords).
109
+ # Useful for "read the header of this cell".
111
110
  def text_in_bbox(left:, top:, right:, bottom:)
112
111
  tp = text_page
113
112
  h = height
114
- # Converti a bottom-up per PDFium
113
+ # Convert to bottom-up for PDFium
115
114
  pdf_top = h - top
116
115
  pdf_bottom = h - bottom
117
- # PDFium vuole: left, top, right, bottom dove top > bottom (PDF coords)
116
+ # PDFium wants: left, top, right, bottom where top > bottom (PDF coords)
118
117
  # Probe size:
119
118
  n = Raw.FPDFText_GetBoundedText(
120
119
  tp.handle, left, pdf_top, right, pdf_bottom, FFI::Pointer::NULL, 0
121
120
  )
122
- return "" if n <= 0
121
+ return '' if n <= 0
123
122
 
124
123
  buf = FFI::MemoryPointer.new(:ushort, n)
125
124
  Raw.FPDFText_GetBoundedText(
126
125
  tp.handle, left, pdf_top, right, pdf_bottom, buf, n
127
126
  )
128
- buf.read_bytes(n * 2).force_encoding("UTF-16LE")
129
- .encode("UTF-8", invalid: :replace, undef: :replace)
130
- .delete("\x00")
131
- end
132
-
133
- # ===== Caratteri (char-level) =====
134
-
135
- # Ritorna ogni char con metadata ricco:
136
- # :char stringa (1 codepoint)
137
- # :x0,:x1 bbox orizzontale
138
- # :top,:bottom bbox verticale (top-down: top < bottom)
139
- # :origin_x, :origin_y punto di inserimento del glifo (top-down)
140
- # :angle angolo di rotazione del glifo (radianti)
141
- # :fontsize taglia in punti
142
- # :font nome font (se disponibile)
143
- # :weight spessore (es. 400=regular, 700=bold)
144
- # :render_mode modalità rendering (fill/stroke/invisible). Letto via
145
- # il text object che contiene il char (PDFium non
146
- # espone più una API char-level dopo chromium/6611).
147
- # nil su build PDFium antichi che non supportano il
148
- # lookup char→object.
149
- # :generated true se inserito da PDFium (es. spazi sintetici)
150
- # :hyphen true se trattino di sillabazione
151
- # :unicode_error true se PDFium non ha potuto mapparlo
127
+ decode_utf16le(buf.read_bytes(n * 2), replace: true)
128
+ end
129
+
130
+ # ===== Characters (char-level) =====
131
+
132
+ # Returns every char with rich metadata:
133
+ # :char string (1 codepoint)
134
+ # :x0,:x1 horizontal bbox
135
+ # :top,:bottom vertical bbox (top-down: top < bottom)
136
+ # :origin_x, :origin_y glyph insertion point (top-down)
137
+ # :angle glyph rotation angle (radians)
138
+ # :fontsize size in points
139
+ # :font font name (if available)
140
+ # :weight weight (e.g. 400=regular, 700=bold)
141
+ # :render_mode rendering mode (fill/stroke/invisible). Read via
142
+ # the text object that contains the char (PDFium no
143
+ # longer exposes a char-level API after chromium/6611).
144
+ # nil on old PDFium builds that do not support the
145
+ # char→object lookup.
146
+ # :generated true if inserted by PDFium (e.g. synthetic spaces)
147
+ # :hyphen true if a hyphenation hyphen
148
+ # :unicode_error true if PDFium could not map it
152
149
  #
153
- # `loose: true` (DEFAULT) usa FPDFText_GetLooseCharBox: tutti i char
154
- # della stessa linea logica condividono la stessa bbox verticale (top/
155
- # bottom), proporzionale alla font size invece che al singolo glifo. È
156
- # esattamente il comportamento di pdfminer.six/pdfplumber, e l'unico
157
- # che permette al midpoint-test in Table#extract di catturare anche i
158
- # char di punteggiatura (`.`, `,`) insieme ai numeri allineati alla
159
- # baseline. Con `loose: false` si ottengono le bbox "tight" del singolo
160
- # glifo, utili per misure di layout fine ma sbagliate per il filtro
161
- # cella tabellare.
162
- def chars(loose: true, inject_spaces: true, lean: false)
163
- # Cache: chars() viene chiamato una volta da Table#extract e poi
164
- # nuovamente da WordExtractor (passando per Extractor#page_words se
165
- # vertical/horizontal_strategy è :text). Ogni chiamata costa O(n) FFI
166
- # roundtrip per char costoso su pagine con migliaia di char.
167
- cache_key = [loose, inject_spaces, lean]
150
+ # `loose: true` (DEFAULT) uses FPDFText_GetLooseCharBox: all chars on
151
+ # the same logical line share the same vertical bbox (top/bottom),
152
+ # proportional to the font size rather than to the individual glyph.
153
+ # This is exactly the behavior of pdfminer.six/pdfplumber, and the only
154
+ # one that lets the midpoint test in Table#extract also capture
155
+ # punctuation chars (`.`, `,`) along with the numbers aligned to the
156
+ # baseline. With `loose: false` you get the "tight" bbox of the single
157
+ # glyph, useful for fine layout measurements but wrong for the table
158
+ # cell filter.
159
+ # `geometry: true` is a stronger form of `lean` reserved for the
160
+ # table/word pipeline: on top of `lean` it ALSO skips the per-char
161
+ # origin (FPDFText_GetCharOrigin) and emits a compact 7-key hash
162
+ # (char, x0, x1, top, bottom, generated, text_obj_ends_with_space)
163
+ # instead of the full one. The text-object lookup is kept (cached per
164
+ # text object) to supply that last key. Those are exactly the fields the
165
+ # WordExtractor / Table pipeline reads; cutting the rest removes ~3 FFI
166
+ # roundtrips per char and a large amount of hash allocation, which on a
167
+ # page with thousands of chars is the dominant cost of extract_tables.
168
+ # Unlike `lean` (which keeps the full hash shape, just with nil/false
169
+ # metadata), `geometry` changes the hash shape, so it is NOT a drop-in
170
+ # for general char consumers — only for the geometry-only pipeline.
171
+ def chars(loose: true, inject_spaces: true, lean: false, geometry: false)
172
+ # Cache: chars() is called once by Table#extract and then again by
173
+ # WordExtractor (going through Extractor#page_words if
174
+ # vertical/horizontal_strategy is :text). Each call costs O(n) FFI
175
+ # roundtrips per char — expensive on pages with thousands of chars.
176
+ cache_key = [loose, inject_spaces, lean, geometry]
168
177
  @chars_cache ||= {}
169
178
  return @chars_cache[cache_key] if @chars_cache.key?(cache_key)
170
179
 
171
- raw = compute_chars(loose: loose, lean: lean)
180
+ raw = geometry ? compute_geometry_chars(loose: loose) : compute_chars(loose: loose, lean: lean)
172
181
  result = inject_spaces ? rebuild_word_separators(raw) : raw
173
182
  @chars_cache[cache_key] = result
174
183
  end
175
184
 
176
- # Ricostruisce gli spazi che separano le parole basandosi sulla
177
- # GEOMETRIA dei char "veri", scartando completamente gli spazi
178
- # sintetici di PDFium (che sono inaffidabili: PDFium li emette in
179
- # modo aggressivo anche tra cifre di numeri come "2.895,26").
185
+ # Rebuilds the spaces that separate words based on the GEOMETRY of the
186
+ # "real" chars, completely discarding PDFium's synthetic spaces (which
187
+ # are unreliable: PDFium emits them aggressively even between digits of
188
+ # numbers like "2.895,26").
180
189
  #
181
- # Algoritmo:
182
- # 1. Filtra via tutti i char :generated (tipicamente spazi sintetici
183
- # con bbox degenere).
184
- # 2. Cluster i char rimasti per riga (top tolerance 1pt).
185
- # 3. Dentro ogni riga, sort per x0 e per ogni coppia consecutiva
186
- # calcola gap = next.x0 - prev.x1 e char_w = (prev.w + next.w) / 2.
187
- # Se gap > 0.275 × char_w → inserisci spazio sintetico nuovo
188
- # (bbox normalizzata al top/bottom dei char).
190
+ # Algorithm:
191
+ # 1. Filter out all :generated chars (typically synthetic spaces
192
+ # with a degenerate bbox).
193
+ # 2. Cluster the remaining chars by row (top tolerance 1pt).
194
+ # 3. Within each row, sort by x0 and for each consecutive pair
195
+ # compute gap = next.x0 - prev.x1 and char_w = (prev.w + next.w) / 2.
196
+ # If gap > 0.275 × char_w → insert a new synthetic space
197
+ # (bbox normalized to the top/bottom of the chars).
189
198
  #
190
- # Soglia 0.275: tarata empiricamente su PDF TeamSystem reale.
191
- # Distribuzione misurata: gap intra-parola max ratio 0.24, gap
192
- # inter-parola min ratio 0.31. Classificazione 100% corretta sul
193
- # dataset di training (1400 intra + 663 inter casi). Pdfminer.six
194
- # usa internamente 0.1 (`word_margin`) ma con info aggiuntive
195
- # dall'advance del font, non disponibile da PDFium.
199
+ # Threshold 0.275: tuned empirically on a real TeamSystem PDF.
200
+ # Measured distribution: intra-word gap max ratio 0.24, inter-word
201
+ # gap min ratio 0.31. Classification 100% correct on the training
202
+ # dataset (1400 intra + 663 inter cases). pdfminer.six uses 0.1
203
+ # internally (`word_margin`) but with additional info from the font
204
+ # advance, not available from PDFium.
196
205
  def rebuild_word_separators(chars)
197
206
  reals = chars.reject { |c| c[:generated] }
198
207
  return chars if reals.empty?
199
208
 
200
- # Cluster per riga, mantenendo l'ordine di top
209
+ # Cluster by row, preserving the top ordering
201
210
  sorted_top = reals.sort_by { |c| c[:top] }
202
211
  rows = []
203
212
  sorted_top.each do |c|
@@ -216,19 +225,19 @@ module Rpdfium
216
225
  if prev
217
226
  gap = c[:x0] - prev[:x1]
218
227
 
219
- # Segnale dal content stream PDF: prev.text_obj_ends_with_space.
220
- # Se prev NON termina un token (false), il gap è kerning interno
221
- # → mai inserire spazio.
228
+ # Signal from the PDF content stream: prev.text_obj_ends_with_space.
229
+ # If prev does NOT end a token (false), the gap is internal
230
+ # kerning never insert a space.
222
231
  #
223
- # Se prev termina un token (true), può essere:
224
- # - vera fine parola (gap geometrico relativamente grande)
225
- # - fine token sintattico (es. tra cifre e punteggiatura di
226
- # un numero "2", "."), con gap piccolo.
232
+ # If prev ends a token (true), it may be:
233
+ # - a real word end (relatively large geometric gap)
234
+ # - a syntactic token end (e.g. between digits and punctuation
235
+ # of a number "2", "."), with a small gap.
227
236
  #
228
- # Discrimino con la soglia geometrica abbinata al "contesto"
229
- # tipografico: se la coppia (prev_char, curr_char) sembra un
230
- # contesto numerico (cifre + punteggiatura), uso soglia più
231
- # alta; altrimenti soglia normale.
237
+ # We discriminate with the geometric threshold combined with the
238
+ # typographic "context": if the pair (prev_char, curr_char) looks
239
+ # like a numeric context (digits + punctuation), we use a higher
240
+ # threshold; otherwise the normal threshold.
232
241
  obj_signal_present = prev.key?(:text_obj_ends_with_space)
233
242
  obj_says_continues = obj_signal_present && !prev[:text_obj_ends_with_space]
234
243
 
@@ -246,11 +255,11 @@ module Rpdfium
246
255
  result
247
256
  end
248
257
 
249
- # True se la coppia (prev_char, curr_char) è un contesto "numerico":
250
- # cifra-punteggiatura, punteggiatura-cifra, o cifra-cifra. In questi
251
- # casi un gap modesto è probabilmente kerning interno al numero, non
252
- # confine di parola. Soglia più alta per evitare di spezzare numeri
253
- # come "2.895,26" in "2 . 895 , 26".
258
+ # True if the pair (prev_char, curr_char) is a "numeric" context:
259
+ # digit-punctuation, punctuation-digit, or digit-digit. In these
260
+ # cases a modest gap is probably kerning internal to the number, not
261
+ # a word boundary. A higher threshold avoids splitting numbers like
262
+ # "2.895,26" into "2 . 895 , 26".
254
263
  NUMERIC_PUNCT = %w[. , ].freeze
255
264
 
256
265
  def numeric_context?(prev_char, curr_char)
@@ -261,23 +270,22 @@ module Rpdfium
261
270
  prev_num && curr_num
262
271
  end
263
272
 
264
- # Ritorna la larghezza "di riferimento" per il calcolo del ratio
265
- # gap/width. Preferisce l'advance (più stabile di bbox per char con
266
- # kerning post-applied). Se uno dei due char non ha advance, fallback
267
- # su max delle bbox-width.
273
+ # Returns the "reference" width for computing the gap/width ratio.
274
+ # Prefers the advance (more stable than the bbox for chars with
275
+ # post-applied kerning). If either char lacks an advance, falls back
276
+ # to the max of the bbox widths.
268
277
  def best_reference_width(a, b)
269
278
  a_adv = a[:advance]
270
279
  b_adv = b[:advance]
271
- if a_adv && b_adv
272
- [a_adv, b_adv].max
273
- else
274
- [(a[:x1] - a[:x0]), (b[:x1] - b[:x0])].max
275
- end
280
+
281
+ return [a_adv, b_adv].max if a_adv && b_adv
282
+
283
+ [(a[:x1] - a[:x0]), (b[:x1] - b[:x0])].max
276
284
  end
277
285
 
278
286
  def build_synthetic_space(prev, c)
279
287
  {
280
- char: " ", codepoint: 32,
288
+ char: ' ', codepoint: 32,
281
289
  x0: prev[:x1], x1: c[:x0],
282
290
  top: prev[:top], bottom: prev[:bottom],
283
291
  origin_x: prev[:x1], origin_y: prev[:origin_y],
@@ -294,21 +302,15 @@ module Rpdfium
294
302
  n = tp.char_count
295
303
  return [] if n.zero?
296
304
 
297
- # Geometria della pagina dopo l'applicazione della rotazione PDF.
298
- h = height
299
- w = width
305
+ # Page geometry after applying the PDF rotation.
300
306
  page_rotation = rotation
301
-
302
- raw_w, raw_h = case page_rotation
303
- when 90, 270 then [h, w]
304
- else [w, h]
305
- end
307
+ raw_w, raw_h = rotated_dimensions(page_rotation)
306
308
 
307
309
  result = Array.new(n)
308
310
 
309
- # Buffer FFI riusati tra tutte le iterazioni del loop.
310
- # MemoryPointer.new è non-banale (~µs ciascuna), allocarne O(n) per
311
- # char è il principale costo di compute_chars dopo le chiamate FFI.
311
+ # FFI buffers reused across all loop iterations.
312
+ # MemoryPointer.new is non-trivial (~µs each); allocating fresh ones
313
+ # per char (O(n) allocations) would dominate compute_chars after the FFI calls.
312
314
  l = FFI::MemoryPointer.new(:double)
313
315
  r = FFI::MemoryPointer.new(:double)
314
316
  b = FFI::MemoryPointer.new(:double)
@@ -332,11 +334,11 @@ module Rpdfium
332
334
  origin_x_raw = ox.read_double
333
335
  origin_y_raw = oy.read_double
334
336
 
335
- # Font name: skippato in lean (1 FFI risparmiata per char).
337
+ # Font name: skipped in lean mode (1 FFI call saved per char).
336
338
  font_name = nil
337
339
  unless lean
338
340
  n_bytes = Raw.FPDFText_GetFontInfo(tp_handle, i, font_buf, 256, flags_buf)
339
- font_name = font_buf.read_bytes(n_bytes - 1).force_encoding("UTF-8") if n_bytes > 1
341
+ font_name = font_buf.read_bytes(n_bytes - 1).force_encoding(Encoding::UTF_8.to_s) if n_bytes > 1
340
342
  end
341
343
 
342
344
  cp = Raw.FPDFText_GetUnicode(tp_handle, i)
@@ -351,9 +353,9 @@ module Rpdfium
351
353
  fetch_text_obj_info(text_obj, tp, text_obj_cache,
352
354
  fs_buf: fs_buf, text_buf: text_obj_text_buf)
353
355
 
354
- # Advance: 2 FFI per char (GetGlyphWidth + GetMatrix). In lean
355
- # mode skippiamo — best_reference_width fa fallback su bbox-width
356
- # che funziona altrettanto bene per il discriminante word-boundary.
356
+ # Advance: 2 FFI calls per char (GetGlyphWidth + GetMatrix). In lean
357
+ # mode we skip it — best_reference_width falls back to bbox-width
358
+ # which works just as well for the word-boundary discriminant.
357
359
  advance = if lean
358
360
  nil
359
361
  else
@@ -366,30 +368,30 @@ module Rpdfium
366
368
  x0, x1, y_top, y_bot,
367
369
  origin_x_raw, origin_y_raw)
368
370
 
369
- # In lean mode skippiamo 5 chiamate FFI per char:
371
+ # In lean mode we skip 5 FFI calls per char:
370
372
  # GetCharAngle, GetFontWeight, IsHyphen, HasUnicodeMapError,
371
- # (e GetFontSize fallback se font_size_for_obj è nil).
372
- # Su pagine con migliaia di char il risparmio è significativo
373
- # (decine di ms). I metadata risultano nil/false, che è il valore
374
- # neutro per il pipeline text/tables/words interno.
373
+ # (and the GetFontSize fallback if font_size_for_obj is nil).
374
+ # On pages with thousands of chars the saving is significant
375
+ # (tens of ms). The metadata come out nil/false, which is the
376
+ # neutral value for the internal text/tables/words pipeline.
375
377
  result[i] =
376
378
  if lean
377
379
  {
378
- char: safe_codepoint(cp),
380
+ char: safe_codepoint(cp),
379
381
  codepoint: cp,
380
- x0: td_x0,
381
- x1: td_x1,
382
- top: td_top,
383
- bottom: td_bottom,
382
+ x0: td_x0,
383
+ x1: td_x1,
384
+ top: td_top,
385
+ bottom: td_bottom,
384
386
  origin_x: td_ox,
385
387
  origin_y: td_oy,
386
- angle: nil,
388
+ angle: nil,
387
389
  fontsize: font_size_for_obj,
388
- font: nil,
389
- weight: nil,
390
- render_mode: rm,
391
- generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
392
- hyphen: false,
390
+ font: nil,
391
+ weight: nil,
392
+ render_mode: rm,
393
+ generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
394
+ hyphen: false,
393
395
  unicode_error: false,
394
396
  advance: advance,
395
397
  text_obj_id: text_obj && !text_obj.null? ? text_obj.address : nil,
@@ -397,21 +399,21 @@ module Rpdfium
397
399
  }
398
400
  else
399
401
  {
400
- char: safe_codepoint(cp),
402
+ char: safe_codepoint(cp),
401
403
  codepoint: cp,
402
- x0: td_x0,
403
- x1: td_x1,
404
- top: td_top,
405
- bottom: td_bottom,
404
+ x0: td_x0,
405
+ x1: td_x1,
406
+ top: td_top,
407
+ bottom: td_bottom,
406
408
  origin_x: td_ox,
407
409
  origin_y: td_oy,
408
- angle: Raw.FPDFText_GetCharAngle(tp_handle, i),
410
+ angle: Raw.FPDFText_GetCharAngle(tp_handle, i),
409
411
  fontsize: font_size_for_obj || Raw.FPDFText_GetFontSize(tp_handle, i),
410
- font: font_name,
411
- weight: Raw.FPDFText_GetFontWeight(tp_handle, i),
412
- render_mode: rm,
413
- generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
414
- hyphen: Raw.FPDFText_IsHyphen(tp_handle, i) == 1,
412
+ font: font_name,
413
+ weight: Raw.FPDFText_GetFontWeight(tp_handle, i),
414
+ render_mode: rm,
415
+ generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
416
+ hyphen: Raw.FPDFText_IsHyphen(tp_handle, i) == 1,
415
417
  unicode_error: Raw.FPDFText_HasUnicodeMapError(tp_handle, i) == 1,
416
418
  advance: advance,
417
419
  text_obj_id: text_obj && !text_obj.null? ? text_obj.address : nil,
@@ -422,67 +424,137 @@ module Rpdfium
422
424
  result
423
425
  end
424
426
 
425
- # Applica la rotazione della pagina alle coordinate di un char.
427
+ # Minimal char extraction for the table/word pipeline. See `chars`
428
+ # `geometry:` for the rationale. Compared to compute_chars(lean: true)
429
+ # this skips, per char: FPDFText_GetCharOrigin (origin is never read by
430
+ # the pipeline) and the per-char angle/font/weight/render-mode reads;
431
+ # the page rotation is applied inline (no origin, no intermediate
432
+ # 6-tuple allocation), and the result hash carries only the fields the
433
+ # WordExtractor / Table / rebuild_word_separators path reads.
426
434
  #
427
- # Input: coord PDFium raw (bottom-up, pre-rotazione) di un bbox
428
- # `[x0, x1, y_top, y_bot]` (con y_top > y_bot perché bottom-up) e
429
- # di un origin point.
435
+ # `text_obj_ends_with_space` is intentionally KEPT: rebuild_word_separators
436
+ # uses it as the content-stream "token end" signal that distinguishes a
437
+ # word boundary from internal numeric kerning (e.g. "2.895,26"). Dropping
438
+ # it would change word splitting on PDFs that rely on that signal, so the
439
+ # GetTextObject lookup stays (its info tuple is cached per text object).
440
+ def compute_geometry_chars(loose:)
441
+ tp = text_page
442
+ n = tp.char_count
443
+ return [] if n.zero?
444
+
445
+ page_rotation = rotation
446
+ raw_w, raw_h = rotated_dimensions(page_rotation)
447
+
448
+ result = Array.new(n)
449
+
450
+ # FFI buffers reused across all iterations (see compute_chars).
451
+ l = FFI::MemoryPointer.new(:double)
452
+ r = FFI::MemoryPointer.new(:double)
453
+ b = FFI::MemoryPointer.new(:double)
454
+ t = FFI::MemoryPointer.new(:double)
455
+ rect = Raw::FS_RECTF.new
456
+ fs_buf = FFI::MemoryPointer.new(:float)
457
+ text_obj_text_buf = FFI::MemoryPointer.new(:uint8, TEXT_OBJ_INITIAL_BUF_BYTES)
458
+ text_obj_cache = {}
459
+ tp_handle = tp.handle
460
+
461
+ n.times do |i|
462
+ x0, x1, y_top, y_bot = read_char_bbox(tp, i, loose, l, r, b, t, rect)
463
+
464
+ text_obj = begin
465
+ Raw.FPDFText_GetTextObject(tp_handle, i)
466
+ rescue Rpdfium::LoadError
467
+ nil
468
+ end
469
+ _, _, _, ends_with_space =
470
+ fetch_text_obj_info(text_obj, tp, text_obj_cache,
471
+ fs_buf: fs_buf, text_buf: text_obj_text_buf)
472
+
473
+ # Inline page-rotation → top-down coords (mirror of
474
+ # apply_page_rotation_to_char, dropping the origin outputs).
475
+ td_x0, td_x1, td_top, td_bottom =
476
+ case page_rotation
477
+ when 90 then [y_bot, y_top, x0, x1]
478
+ when 180 then [raw_w - x1, raw_w - x0, y_bot, y_top]
479
+ when 270 then [raw_h - y_top, raw_h - y_bot, raw_w - x1, raw_w - x0]
480
+ else # 0, nil, or non-multiple-of-90 fallback
481
+ [x0, x1, raw_h - y_top, raw_h - y_bot]
482
+ end
483
+
484
+ result[i] = {
485
+ char: safe_codepoint(Raw.FPDFText_GetUnicode(tp_handle, i)),
486
+ x0: td_x0,
487
+ x1: td_x1,
488
+ top: td_top,
489
+ bottom: td_bottom,
490
+ generated: Raw.FPDFText_IsGenerated(tp_handle, i) == 1,
491
+ text_obj_ends_with_space: ends_with_space
492
+ }
493
+ end
494
+ result
495
+ end
496
+
497
+ # Applies the page rotation to a char's coordinates.
430
498
  #
431
- # Output: coord top-down nel sistema della pagina post-rotazione,
432
- # nella convenzione standard di rpdfium: `[x0, x1, top, bottom]`
433
- # con `top < bottom`. Coerente con pdfplumber.
499
+ # Input: raw PDFium coords (bottom-up, pre-rotation) of a bbox
500
+ # `[x0, x1, y_top, y_bot]` (with y_top > y_bot because bottom-up) and
501
+ # of an origin point.
434
502
  #
435
- # Convenzione PDFium: GetRotation = N significa che la pagina visualizzata
436
- # è ruotata di N*90° in senso orario rispetto al sistema raw del content
437
- # stream. PDFium restituisce le coord nel sistema raw; applichiamo la
438
- # rotazione per allineare al rendering.
503
+ # Output: top-down coords in the post-rotation page system, in the
504
+ # standard rpdfium convention: `[x0, x1, top, bottom]` with
505
+ # `top < bottom`. Consistent with pdfplumber.
439
506
  #
440
- # Caso 0°: identità + bottom-up→top-down.
441
- # Caso 90° CW: bbox larga in x diventa alta in y. La x_min (sinistra) raw
442
- # coincide con il top (alto) del sistema post-rotazione.
443
- # Caso 180°: ribalta entrambi gli assi.
444
- # Caso 270° CW: bbox larga in x diventa alta in y, ma invertita verticalmente.
507
+ # PDFium convention: GetRotation = N means the displayed page is
508
+ # rotated by N*90° clockwise relative to the raw content stream
509
+ # system. PDFium returns the coords in the raw system; we apply the
510
+ # rotation to align with the rendering.
511
+ #
512
+ # Case 0°: identity + bottom-up→top-down.
513
+ # Case 90° CW: a bbox wide in x becomes tall in y. The raw x_min (left)
514
+ # coincides with the top of the post-rotation system.
515
+ # Case 180°: flips both axes.
516
+ # Case 270° CW: a bbox wide in x becomes tall in y, but flipped vertically.
445
517
  def apply_page_rotation_to_char(rotation, raw_w, raw_h,
446
518
  x0, x1, y_top, y_bot,
447
519
  origin_x, origin_y)
448
520
  case rotation
449
521
  when 0, nil
450
- # Nessuna rotazione. Bottom-up → top-down standard.
522
+ # No rotation. Standard bottom-up → top-down.
451
523
  # page_h_post == raw_h.
452
524
  [x0, x1, raw_h - y_top, raw_h - y_bot,
453
525
  origin_x, raw_h - origin_y]
454
526
 
455
527
  when 90
456
- # 90° CW. Dimensioni post-rotation: w=raw_h, h=raw_w.
457
- # Trasformazione: x_post = y_raw, y_post = raw_w - x_raw (bottom-up).
528
+ # 90° CW. Post-rotation dimensions: w=raw_h, h=raw_w.
529
+ # Transform: x_post = y_raw, y_post = raw_w - x_raw (bottom-up).
458
530
  # In top-down: top = x_min_raw, bottom = x_max_raw.
459
- new_x0 = y_bot # piccolo y_raw → piccolo x_post
460
- new_x1 = y_top # grande y_raw → grande x_post
461
- new_top = x0 # piccolo x_raw → top piccolo (alto)
462
- new_bottom = x1 # grande x_raw → bottom grande (basso)
531
+ new_x0 = y_bot # small y_raw → small x_post
532
+ new_x1 = y_top # large y_raw → large x_post
533
+ new_top = x0 # small x_raw → small top (high)
534
+ new_bottom = x1 # large x_raw → large bottom (low)
463
535
  new_ox = origin_y
464
536
  new_oy = origin_x # top-down origin_y = x_raw
465
537
  [new_x0, new_x1, new_top, new_bottom, new_ox, new_oy]
466
538
 
467
539
  when 180
468
- # 180°. Dimensioni post-rotation: invariate (raw_w × raw_h).
469
- # Trasformazione: x_post = raw_w - x_raw, y_post = raw_h - y_raw.
540
+ # 180°. Post-rotation dimensions: unchanged (raw_w × raw_h).
541
+ # Transform: x_post = raw_w - x_raw, y_post = raw_h - y_raw.
470
542
  # In top-down: top = y_bot_raw, bottom = y_top_raw.
471
543
  new_x0 = raw_w - x1
472
544
  new_x1 = raw_w - x0
473
- new_top = y_bot # bottom raw → top td (alto)
474
- new_bottom = y_top # top raw → bottom td (basso)
545
+ new_top = y_bot # raw bottom → td top (high)
546
+ new_bottom = y_top # raw top → td bottom (low)
475
547
  new_ox = raw_w - origin_x
476
- new_oy = y_top.zero? ? raw_h - origin_y : raw_h - origin_y
477
- # nota: origin in top-down post-180 = y_origin_raw
548
+ # Origin in top-down post-180°: the y axis is already flipped by
549
+ # the rotation, so origin_y carries over unchanged.
478
550
  new_oy = origin_y
479
551
  [new_x0, new_x1, new_top, new_bottom, new_ox, new_oy]
480
552
 
481
553
  when 270
482
- # 270° CW (= 90° CCW). Dimensioni post-rotation: w=raw_h, h=raw_w.
483
- # Trasformazione: x_post = raw_h - y_raw, y_post = x_raw (bottom-up).
554
+ # 270° CW (= 90° CCW). Post-rotation dimensions: w=raw_h, h=raw_w.
555
+ # Transform: x_post = raw_h - y_raw, y_post = x_raw (bottom-up).
484
556
  # In top-down: top = raw_w - x_max_raw, bottom = raw_w - x_min_raw.
485
- new_x0 = raw_h - y_top # grande y → piccolo x_post
557
+ new_x0 = raw_h - y_top # large y → small x_post
486
558
  new_x1 = raw_h - y_bot
487
559
  new_top = raw_w - x1
488
560
  new_bottom = raw_w - x0
@@ -491,22 +563,22 @@ module Rpdfium
491
563
  [new_x0, new_x1, new_top, new_bottom, new_ox, new_oy]
492
564
 
493
565
  else
494
- # Rotazione non standard (non multipla di 90°): fallback al
495
- # comportamento pre-rotazione. Non dovrebbe mai succedere per
496
- # PDF ben formati.
566
+ # Non-standard rotation (not a multiple of 90°): fall back to
567
+ # the pre-rotation behavior. This should never happen for
568
+ # well-formed PDFs.
497
569
  [x0, x1, raw_h - y_top, raw_h - y_bot,
498
570
  origin_x, raw_h - origin_y]
499
571
  end
500
572
  end
501
573
 
502
- # Cache lookup per text object. Restituisce tupla:
574
+ # Cache lookup for a text object. Returns a tuple:
503
575
  # [render_mode, font_handle, font_size, ends_with_space]
504
576
  #
505
- # `ends_with_space` indica se il testo dell'intero text object termina
506
- # con uno spazio (segnale "fine token" dichiarato dal PDF). È una
507
- # proprietà dell'oggetto, non del singolo char, quindi può essere
508
- # calcolata una volta sola e cachata insieme agli altri campi evita
509
- # una chiamata FPDFTextObj_GetText per ogni char che condivide l'obj.
577
+ # `ends_with_space` indicates whether the text of the entire text object
578
+ # ends with a space (a "token end" signal declared by the PDF). It is a
579
+ # property of the object, not of the single char, so it can be computed
580
+ # once and cached together with the other fields — this avoids one
581
+ # FPDFTextObj_GetText call for every char that shares the obj.
510
582
  def fetch_text_obj_info(text_obj, tp, cache, fs_buf:, text_buf:)
511
583
  return [nil, nil, nil, nil] if text_obj.nil? || text_obj.null?
512
584
 
@@ -522,18 +594,24 @@ module Rpdfium
522
594
  end
523
595
 
524
596
  obj_text = read_text_obj_text_fast(text_obj, tp, text_buf)
525
- ends_with_space = obj_text&.end_with?(" ")
597
+ ends_with_space = obj_text&.end_with?(' ')
526
598
 
527
599
  tuple = [rm, font_handle, font_size, ends_with_space]
528
600
  cache[addr] = tuple
529
601
  tuple
530
602
  end
531
603
 
532
- # Versione "fast" di read_text_obj_text_from: riusa il buffer passato
533
- # invece di allocarlo. Per il 99% dei text obj il buffer iniziale da
534
- # 256 byte basta; nel caso raro che PDFium richieda più spazio, alloca
535
- # un buffer più grande on-demand (questa è una path rara, OK
536
- # allocare).
604
+ # Reads the text of a PDF text object, reusing the caller-provided
605
+ # buffer instead of allocating one per call.
606
+ #
607
+ # C signature: `unsigned long FPDFTextObj_GetText(FPDF_PAGEOBJECT,
608
+ # FPDF_TEXTPAGE, FPDF_WCHAR* buffer, unsigned long length)` — length in
609
+ # BYTES, the return is the total number of bytes needed (including the
610
+ # null terminator), even if the buffer is too small.
611
+ #
612
+ # For 99% of text objs the initial 256-byte buffer is enough; in the
613
+ # rare case PDFium requires more space, a larger buffer is allocated on
614
+ # demand (rare path, OK to allocate).
537
615
  def read_text_obj_text_fast(text_obj, tp, buf)
538
616
  return nil if text_obj.nil? || text_obj.null?
539
617
 
@@ -542,7 +620,7 @@ module Rpdfium
542
620
  return nil if needed < 2
543
621
 
544
622
  if needed > TEXT_OBJ_INITIAL_BUF_BYTES
545
- # Path raro: text obj con > 128 char. Alloco buffer dedicato.
623
+ # Rare path: text obj with >= 128 UTF-16 units (> 256 bytes incl. terminator). Allocate a dedicated buffer.
546
624
  big_buf = FFI::MemoryPointer.new(:uint8, needed)
547
625
  needed = Raw.FPDFTextObj_GetText(text_obj, tp.handle, big_buf, needed)
548
626
  return nil if needed < 2
@@ -550,23 +628,20 @@ module Rpdfium
550
628
  payload_bytes = needed - 2
551
629
  return nil if payload_bytes <= 0
552
630
 
553
- return big_buf.read_bytes(payload_bytes)
554
- .force_encoding("UTF-16LE")
555
- .encode("UTF-8")
556
- .delete("\u0000")
631
+ return decode_utf16le(big_buf.read_bytes(payload_bytes))
557
632
  end
558
633
 
559
634
  payload_bytes = needed - 2
560
635
  return nil if payload_bytes <= 0
561
636
 
562
- buf.read_bytes(payload_bytes)
563
- .force_encoding("UTF-16LE")
564
- .encode("UTF-8")
565
- .delete("\u0000")
637
+ decode_utf16le(buf.read_bytes(payload_bytes))
566
638
  end
567
639
 
568
- # Versione "fast" di compute_glyph_advance: riusa gw_buf e matrix
569
- # invece di allocarli per char. Stesso comportamento funzionale.
640
+ # Computes the glyph advance in page coordinates for a specific char.
641
+ # Formula: glyph_width(font, codepoint, font_size) × |CTM.a|. Reuses the
642
+ # caller-provided gw_buf and matrix instead of allocating per char.
643
+ # Returns nil if the advance is not computable (font unavailable, or
644
+ # PDFium build without FPDFFont_GetGlyphWidth).
570
645
  def compute_glyph_advance_fast(font, codepoint, font_size, tp_handle,
571
646
  char_index, gw_buf, matrix)
572
647
  return nil if font.nil? || font_size.nil?
@@ -580,7 +655,7 @@ module Rpdfium
580
655
 
581
656
  glyph_w_font_units = gw_buf.read_float
582
657
 
583
- # CTM scale: riuso la matrix in-place.
658
+ # CTM scale: reuse the matrix in-place.
584
659
  scale = if Raw.FPDFText_GetMatrix(tp_handle, char_index, matrix) == 1
585
660
  matrix[:a].abs
586
661
  else
@@ -589,148 +664,101 @@ module Rpdfium
589
664
  glyph_w_font_units * scale
590
665
  end
591
666
 
592
- # Buffer size iniziale per FPDFTextObj_GetText: 256 byte = 128 char UTF-16.
593
- # Empiricamente sufficiente per ~99% dei text object reali (parole singole
594
- # o frasi brevi). Quando un text obj è più grande, ricadiamo nel probe-then-
595
- # fetch corretto.
667
+ # Initial buffer size for FPDFTextObj_GetText: 256 bytes = 127 UTF-16 units + null terminator.
668
+ # Empirically sufficient for ~99% of real text objects (single words or
669
+ # short phrases). When a text obj is larger, we fall back to the correct
670
+ # probe-then-fetch.
596
671
  TEXT_OBJ_INITIAL_BUF_BYTES = 256
597
672
 
598
- # Legge il testo di un text object PDF.
599
- #
600
- # Firma C: `unsigned long FPDFTextObj_GetText(FPDF_PAGEOBJECT, FPDF_TEXTPAGE,
601
- # FPDF_WCHAR* buffer, unsigned long length)` — length in BYTE, return è
602
- # il numero di byte totali necessari (incluso null terminator), anche se
603
- # il buffer è troppo piccolo. Pattern: prova con buffer stack-friendly,
604
- # se PDFium ne richiede di più rialloca.
605
- def read_text_obj_text_from(text_obj, tp, _char_index_unused = nil)
606
- return nil if text_obj.nil? || text_obj.null?
607
-
608
- # Prima tentativo: buffer fisso da 256 byte. Risolve il 99% dei casi.
609
- buf = FFI::MemoryPointer.new(:uint8, TEXT_OBJ_INITIAL_BUF_BYTES)
610
- needed = Raw.FPDFTextObj_GetText(text_obj, tp.handle, buf,
611
- TEXT_OBJ_INITIAL_BUF_BYTES)
612
- return nil if needed < 2
613
-
614
- # Se PDFium ne vuole più di quanto allocato, rialloca esatto.
615
- if needed > TEXT_OBJ_INITIAL_BUF_BYTES
616
- buf = FFI::MemoryPointer.new(:uint8, needed)
617
- needed = Raw.FPDFTextObj_GetText(text_obj, tp.handle, buf, needed)
618
- return nil if needed < 2
619
- end
620
-
621
- # Clamp difensivo: non leggo mai più di quanto allocato.
622
- buf_capacity = buf.size
623
- payload_bytes = [needed - 2, buf_capacity - 2].min
624
- return nil if payload_bytes <= 0
625
-
626
- buf.read_bytes(payload_bytes)
627
- .force_encoding("UTF-16LE")
628
- .encode("UTF-8")
629
- .delete("\u0000")
630
- end
631
-
632
- # Calcola l'advance del glifo in coordinate pagina, per un char
633
- # specifico identificato da (text_page, char_index).
634
- # Formula: glyph_width(font, codepoint, font_size) × |CTM.a|.
635
- # Ritorna nil se l'advance non è calcolabile (font non disponibile,
636
- # PDFium che non supporta l'API).
637
- def compute_glyph_advance(font, codepoint, font_size, tp, char_index)
638
- return nil if font.nil? || font_size.nil?
639
-
640
- gw_buf = FFI::MemoryPointer.new(:float)
641
- ok = begin
642
- Raw.FPDFFont_GetGlyphWidth(font, codepoint, font_size, gw_buf)
643
- rescue Rpdfium::LoadError
644
- return nil # FPDFFont_GetGlyphWidth non disponibile in build vecchi
645
- end
646
- return nil if ok == 0
647
-
648
- glyph_w_font_units = gw_buf.read_float
649
- scale = char_ctm_scale_x(tp, char_index) || 1.0
650
- glyph_w_font_units * scale
651
- end
652
-
653
- # Calcola la scala orizzontale del CTM per un char specifico.
654
- def char_ctm_scale_x(tp, char_index)
655
- mat = Raw::FS_MATRIX.new
656
- return nil if Raw.FPDFText_GetMatrix(tp.handle, char_index, mat) == 0
657
-
658
- mat[:a].abs
659
- end
660
-
661
673
  # ===== Form-aware extraction =====
662
674
  #
663
- # PDF di "moduli compilati" (F24, Comunicazione IVA, 770, ecc.) sono PDF
664
- # di output dove il modello prestampato e i valori inseriti coesistono
665
- # come testo grafico nessun AcroForm, nessun tag PDF/UA. Il pipeline
666
- # geometrico di estrazione tabelle vede il modulo intero e produce
667
- # rumore (etichette del template mescolate ai dati).
675
+ # "Filled form" PDFs (F24, Comunicazione IVA, 770, etc.) are output PDFs
676
+ # where the pre-printed template and the entered values coexist as
677
+ # graphical text — no AcroForm, no PDF/UA tag. The geometric table
678
+ # extraction pipeline sees the whole form and produces noise (template
679
+ # labels mixed in with the data).
668
680
  #
669
- # La strategia robusta su questi PDF è separare i char per "ruolo"
670
- # usando font/altezza, che tipicamente differiscono tra il template
671
- # (font proporzionali, dimensioni varie) e i dati inseriti dal
672
- # gestionale (un singolo font, tipicamente Courier o Helvetica,
673
- # una sola size).
681
+ # The robust strategy on these PDFs is to separate the chars by "role"
682
+ # using font/height, which typically differ between the template
683
+ # (proportional fonts, various sizes) and the data entered by the
684
+ # business software (a single font, typically Courier or Helvetica,
685
+ # a single size).
674
686
  #
675
- # Esempio classico F24:
687
+ # Classic F24 example:
676
688
  # Template: Futura-Light, Futura-Bold, Futura-Heavy, Times-Bold
677
- # Dati: Courier 10.0
689
+ # Data: Courier 10.0
678
690
  #
679
- # page.font_inventory # → vede tutti i (font, height)
691
+ # page.font_inventory # → sees all the (font, height)
680
692
  # page.chars_where(font: /Courier/i)
681
- # # → solo i char dei dati inseriti
682
- # page.lines(font: /Courier/i) # → testo dei dati riga per riga
693
+ # # → only the chars of the entered data
694
+ # page.lines(font: /Courier/i) # → data text line by line
683
695
 
684
- # Distribuzione dei char per (font, altezza visiva, weight).
696
+ # Distribution of chars by (font, visual height, weight).
685
697
  #
686
- # Ritorna un Array di Hash ordinato per count decrescente:
698
+ # Returns an Array of Hash sorted by descending count:
687
699
  # [{ font:, height:, weight:, count:, sample: }, ...]
688
700
  #
689
- # `height` è l'altezza visiva del char in punti (bottom - top), più
690
- # affidabile di `fontsize` che PDFium normalizza a 1.0 quando la
691
- # dimensione reale è nella matrice CTM (caso comune sui moduli
692
- # generati con scaling).
701
+ # `height` is the visual height of the char in points (bottom - top),
702
+ # more reliable than `fontsize`, which PDFium normalizes to 1.0 when the
703
+ # real size is in the CTM matrix (a common case on forms generated with
704
+ # scaling).
693
705
  #
694
- # `sample` sono i primi 40 char di quel gruppo, per ispezione.
706
+ # `sample` is the first 40 chars of that group, in document order, for
707
+ # inspection.
695
708
  #
696
- # Usalo per scegliere il filtro `chars_where`: tipicamente il font
697
- # con più char è il template, e i font minoritari (1 solo size,
698
- # spesso monospace) sono i dati.
699
- def font_inventory
700
- groups = chars.reject { |c| c[:generated] }.group_by do |c|
701
- h = (c[:bottom] - c[:top]).round(1)
702
- [c[:font], h, c[:weight]]
703
- end
704
- groups.map do |(font, height, weight), cs|
705
- {
706
- font: font,
707
- height: height,
708
- weight: weight,
709
- count: cs.size,
710
- sample: cs.first(40).map { |c| c[:char] }.join
711
- }
709
+ # Heights are bucketed within `height_tolerance` (single-linkage, per
710
+ # font+weight) rather than rounded to a fixed precision. A round glyph
711
+ # whose loose box overshoots the cap line by a fraction of a point
712
+ # ("O", "S", "C"...) would otherwise land in a spurious one-glyph group
713
+ # (e.g. "O" at h=6.6 split off from the rest of the line at h=6.5,
714
+ # producing garbled samples like "CDICE FISCALE" with every "O"
715
+ # missing). Clustering keeps each logical size in a single group.
716
+ #
717
+ # Use it to choose the `chars_where` filter: typically the font with the
718
+ # most chars is the template, and the minority fonts (a single size,
719
+ # often monospace) are the data.
720
+ def font_inventory(height_tolerance: 0.5)
721
+ real = chars.reject { |c| c[:generated] }
722
+ # Tag with document position so the cluster (which gets reordered by
723
+ # height) can be put back in reading order for the sample.
724
+ indexed = real.each_with_index.to_a
725
+
726
+ by_font_weight = indexed.group_by { |(c, _i)| [c[:font], c[:weight]] }
727
+
728
+ by_font_weight.flat_map do |(font, weight), pairs|
729
+ height_of = ->(p) { p[0][:bottom] - p[0][:top] }
730
+ Util::Cluster.cluster_objects(pairs, height_of, tolerance: height_tolerance).map do |cluster|
731
+ mean_h = cluster.sum { |p| height_of.call(p) } / cluster.size.to_f
732
+ ordered = cluster.sort_by { |(_c, i)| i }
733
+ {
734
+ font: font,
735
+ height: mean_h.round(1),
736
+ weight: weight,
737
+ count: cluster.size,
738
+ sample: ordered.first(40).map { |(c, _i)| c[:char] }.join
739
+ }
740
+ end
712
741
  end.sort_by { |g| -g[:count] }
713
742
  end
714
743
 
715
- # Filtro char generico. Ritorna i char che matchano TUTTI i predicati
716
- # specificati (intersezione, non unione).
744
+ # Generic char filter. Returns the chars that match ALL the specified
745
+ # predicates (intersection, not union).
717
746
  #
718
- # Argomenti supportati:
719
- # font: String esatto, Array<String>, o Regexp
720
- # height: Float (singolo valore), Range, Array<Float>
721
- # weight: Integer o Range
722
- # bbox: [left, top, right, bottom] in coord top-down della pagina
723
- # where: block che riceve l'hash char, deve ritornare truthy
747
+ # Supported arguments:
748
+ # font: exact String, Array<String>, or Regexp
749
+ # height: Float (single value), Range, Array<Float>
750
+ # weight: Integer or Range
751
+ # bbox: [left, top, right, bottom] in the page's top-down coords
752
+ # where: block that receives the char hash, must return truthy
724
753
  #
725
- # Tutti i parametri sono opzionali; quelli passati vengono combinati
726
- # in AND.
754
+ # All parameters are optional; the ones passed are combined with AND.
727
755
  #
728
- # Tipicamente combinato con WordExtractor per estrarre testo "pulito":
756
+ # Typically combined with WordExtractor to extract "clean" text:
729
757
  #
730
758
  # data_chars = page.chars_where(font: /Courier/i)
731
759
  # words = Rpdfium::Util::WordExtractor.new.extract_words(data_chars)
732
760
  #
733
- # oppure usato come building block per pipeline custom.
761
+ # or used as a building block for custom pipelines.
734
762
  def chars_where(font: nil, height: nil, weight: nil, bbox: nil, where: nil, **char_opts)
735
763
  cs = chars(**char_opts)
736
764
 
@@ -749,29 +777,29 @@ module Rpdfium
749
777
  end
750
778
  end
751
779
 
752
- # Raggruppa i char filtrati in righe logiche e ritorna un Array di
753
- # stringhe (una per riga, top-to-bottom, char dentro la riga
754
- # left-to-right). Conveniente quando il PDF è un modulo compilato
755
- # e vuoi solo i valori inseriti come righe pulite.
780
+ # Groups the filtered chars into logical rows and returns an Array of
781
+ # strings (one per row, top-to-bottom, chars within the row
782
+ # left-to-right). Convenient when the PDF is a filled form and you
783
+ # want only the entered values as clean rows.
756
784
  #
757
- # Esempio F24:
785
+ # F24 example:
758
786
  #
759
787
  # page.lines(font: /Courier/i)
760
- # # => ["Soggetto: MANAGEMENT CONSULTING S.R.L. ( 02098120682 )",
761
- # # "0 2 0 9 8 1 2 0 6 8 2",
762
- # # "MANAGEMENT CONSULTING S.R.L.",
788
+ # # => ["Soggetto: Azienda S.R.L. ( 01234567890 )",
789
+ # # "0 1 2 3 4 5 6 7 8 9 0",
790
+ # # "Azienda S.R.L.",
763
791
  # # "1001 11 2021 499,81 0,00",
764
792
  # # "1712 12 2021 32,46 0,00",
765
793
  # # "1701 11 2021 0,00 295,89",
766
794
  # # "532,27 295,89 236,38",
767
795
  # # ...]
768
796
  #
769
- # I parametri di filtro sono gli stessi di `chars_where`. I parametri
770
- # `x_tolerance` e `y_tolerance` controllano il WordExtractor.
797
+ # The filter parameters are the same as `chars_where`. The
798
+ # `x_tolerance` and `y_tolerance` parameters control the WordExtractor.
771
799
  #
772
- # Il separatore inter-word è due spazi (per leggibilità su moduli con
773
- # campi spaziati); cambialo con `separator:`.
774
- def lines(x_tolerance: 3.0, y_tolerance: 3.0, separator: " ",
800
+ # The inter-word separator is two spaces (for readability on forms with
801
+ # spaced fields); change it with `separator:`.
802
+ def lines(x_tolerance: 3.0, y_tolerance: 3.0, separator: ' ',
775
803
  font: nil, height: nil, weight: nil, bbox: nil, where: nil,
776
804
  **char_opts)
777
805
  cs = chars_where(font: font, height: height, weight: weight,
@@ -779,44 +807,44 @@ module Rpdfium
779
807
  return [] if cs.empty?
780
808
 
781
809
  we = Util::WordExtractor.new(x_tolerance: x_tolerance,
782
- y_tolerance: y_tolerance)
810
+ y_tolerance: y_tolerance)
783
811
  words = we.extract_words(cs)
784
812
  return [] if words.empty?
785
813
 
786
- # Cluster per top (con tolleranza), poi ordina per x0 dentro la riga
814
+ # Cluster by top (with tolerance), then sort by x0 within the row
787
815
  rows = Util::Cluster.cluster_objects(words, :top, tolerance: y_tolerance)
788
816
  rows.map do |row_words|
789
817
  row_words.sort_by { |w| w[:x0] }.map { |w| w[:text] }.join(separator)
790
818
  end
791
819
  end
792
820
 
793
- # Associa label semantiche del template ai valori inseriti sulla pagina.
794
- # Per moduli compilati (F24, Comunicazione IVA, 770, ecc.) dove il
795
- # template e i dati sono entrambi testo statico ma in font diversi.
821
+ # Associates the template's semantic labels with the values entered on
822
+ # the page. For filled forms (F24, Comunicazione IVA, 770, etc.) where
823
+ # the template and the data are both static text but in different fonts.
796
824
  #
797
- # @param data_font [String, Regexp, Array] font del layer "dati" inseriti.
798
- # Tipicamente Courier (F24, 770) o Helvetica (Comunicazione IVA).
799
- # Vedi `Page#font_inventory` per identificarlo.
800
- # Associa label semantiche del template ai valori inseriti sulla pagina.
801
- # Primitiva per estrazione strutturata da moduli compilati dove
802
- # template e dati coesistono come testo grafico in font diversi.
825
+ # @param data_font [String, Regexp, Array] font of the entered "data"
826
+ # layer. Typically Courier (F24, 770) or Helvetica (Comunicazione IVA).
827
+ # See `Page#font_inventory` to identify it.
828
+ # Associates the template's semantic labels with the values entered on
829
+ # the page. A primitive for structured extraction from filled forms
830
+ # where template and data coexist as graphical text in different fonts.
803
831
  #
804
- # **Per casi avanzati** (tabelle ripetitive, merge di word multi-cella,
805
- # output strutturato) componi con `Util::WordMerger`,
806
- # `Util::ColumnInference`, e configura il `Util::LabelMatcher`
807
- # opportunamentevedi gli esempi nella docs.
832
+ # **For advanced cases** (repetitive tables, merging of multi-cell
833
+ # words, structured output) compose with `Util::WordMerger`,
834
+ # `Util::ColumnInference`, and configure the `Util::LabelMatcher`
835
+ # appropriately — see the examples in the docs.
808
836
  #
809
- # @param data_font [String, Regexp, Array] font del layer "dati".
810
- # @param template_font [String, Regexp, Array, nil] font del layer
811
- # "template". Se nil, usa tutti i char che NON sono in `data_font`.
812
- # @param data_filter [Proc, nil] filtro opzionale sul testo dei valori.
813
- # @param matcher [LabelMatcher, nil] istanza preconfigurata. Se nil,
814
- # ne crea una con i default.
815
- # @param x_tolerance, y_tolerance [Float] tolleranze per WordExtractor.
816
- # @param char_opts [Hash] kwargs passati a `#chars` (es. `inject_spaces:
817
- # false` per moduli a caselline).
837
+ # @param data_font [String, Regexp, Array] font of the "data" layer.
838
+ # @param template_font [String, Regexp, Array, nil] font of the
839
+ # "template" layer. If nil, uses all chars that are NOT in `data_font`.
840
+ # @param data_filter [Proc, nil] optional filter on the value text.
841
+ # @param matcher [LabelMatcher, nil] preconfigured instance. If nil,
842
+ # creates one with the defaults.
843
+ # @param x_tolerance, y_tolerance [Float] tolerances for WordExtractor.
844
+ # @param char_opts [Hash] kwargs passed to `#chars` (e.g. `inject_spaces:
845
+ # false` for box-based forms).
818
846
  #
819
- # @return [Array<Hash>] uno per valore:
847
+ # @return [Array<Hash>] one per value:
820
848
  # { value:, labels: { col:, row: }, geometry: {...} }
821
849
  def label_value_pairs(data_font:, template_font: nil,
822
850
  data_filter: nil, matcher: nil,
@@ -848,14 +876,14 @@ module Rpdfium
848
876
  cs = chars(**char_opts)
849
877
  return [] if cs.empty?
850
878
 
851
- # Raggruppa in righe per y
879
+ # Group into rows by y
852
880
  rows = group_consecutive(cs.sort_by { |c| [c[:top], c[:x0]] }) do |a, b|
853
881
  (a[:top] - b[:top]).abs <= y_tolerance
854
882
  end
855
883
 
856
884
  rows.flat_map do |row|
857
885
  sorted = row.sort_by { |c| c[:x0] }
858
- # Spezza su gap > x_tolerance o spazio esplicito
886
+ # Split on gap > x_tolerance or explicit space
859
887
  word_groups = []
860
888
  buf = []
861
889
  sorted.each do |c|
@@ -875,44 +903,41 @@ module Rpdfium
875
903
  end
876
904
  end
877
905
 
878
- # ===== Linee vettoriali (path segments REALI) =====
906
+ # ===== Vector lines (REAL path segments) =====
879
907
 
880
- # Estrae tutti i segmenti di linea (LINETO) dei path objects.
881
- # Ritorna Array<Hash>:
882
- # :x0,:y0,:x1,:y1 estremi (top-down)
883
- # :stroke_width spessore tratto
884
- # :horizontal/:vertical derivati per comodità
908
+ # Extracts all the line segments (LINETO) of the path objects.
909
+ # Returns Array<Hash>:
910
+ # :x0,:y0,:x1,:y1 endpoints (top-down)
911
+ # :stroke_width stroke width
912
+ # :horizontal/:vertical derived for convenience
885
913
  #
886
- # Per le tabelle interessano principalmente i segmenti orizzontali e
887
- # verticali "puri". Beziers e segmenti obliqui vengono ignorati di default
888
- # (passa `include_curves: true` per averli come bbox dei loro punti).
914
+ # For tables, mainly the "pure" horizontal and vertical segments are of
915
+ # interest. Beziers and oblique segments are ignored by default
916
+ # (pass `include_curves: true` to get them as the bbox of their points).
889
917
  #
890
- # Discende ricorsivamente nei Form XObjects applicando la loro matrice
891
- # di trasformazione. Molti PDF (TeamSystem, Zucchetti, template Excel)
892
- # incapsulano l'intera pagina in un Form XObject — senza discesa, qui
893
- # vedremmo zero linee anche se visivamente la pagina è piena di
894
- # bordi/separatori. Comportamento allineato a pdfminer.six (e quindi a
918
+ # Descends recursively into Form XObjects applying their transformation
919
+ # matrix. Many PDFs (TeamSystem, Zucchetti, Excel templates) encapsulate
920
+ # the entire page in a Form XObject — without the descent, we would see
921
+ # zero lines here even though the page is visually full of
922
+ # borders/separators. Behavior aligned with pdfminer.six (and therefore
895
923
  # pdfplumber).
896
- # `include_curves` true: include i Bezier come segmenti (con flag :curve).
897
- # `include_dashed` true: include le linee tratteggiate (con flag :dashed).
898
- # Default: false. Le tratteggiate spesso sono "guide" non-visive nei
899
- # template di stampa e confondono la detection cellule tabella. Chi
900
- # le vuole esplicitamente (es. drawing extraction completo) passa true.
924
+ # `include_curves` true: includes Beziers as segments (with the :curve flag).
925
+ # `include_dashed` true: includes dashed lines (with the :dashed flag).
926
+ # Default: false. Dashed lines are often non-visual "guides" in print
927
+ # templates and confuse table cell detection. Those who want them
928
+ # explicitly (e.g. full drawing extraction) pass true.
901
929
  def line_segments(include_curves: false, include_dashed: false)
902
- # Cache per parametri: line_segments viene tipicamente chiamato 2 volte
903
- # per pagina (da horizontal_lines E da vertical_lines), e itera tutti
904
- # i path objects della pagina via FFI — costoso su PDF con grafica
905
- # ricca (es. CR Banca d'Italia: ~500-1000 path obj per pagina).
930
+ # Cache by parameters: line_segments is typically called twice per
931
+ # page (by horizontal_lines AND by vertical_lines), and iterates all
932
+ # the path objects of the page via FFI — expensive on PDFs with rich
933
+ # graphics (e.g. CR Banca d'Italia: ~500-1000 path objs per page).
906
934
  cache_key = [include_curves, include_dashed]
907
935
  @line_segments_cache ||= {}
908
936
  return @line_segments_cache[cache_key] if @line_segments_cache.key?(cache_key)
909
937
 
910
938
  out = []
911
939
  page_rotation = rotation
912
- raw_w, raw_h = case page_rotation
913
- when 90, 270 then [height, width]
914
- else [width, height]
915
- end
940
+ raw_w, raw_h = rotated_dimensions(page_rotation)
916
941
  ctx = { rotation: page_rotation, raw_w: raw_w, raw_h: raw_h }
917
942
  collect_line_segments(@state[:handle], identity_matrix, ctx,
918
943
  include_curves, out, page_object: false)
@@ -935,15 +960,15 @@ module Rpdfium
935
960
  end
936
961
  end
937
962
 
938
- # Matrice identità nello spazio PDF: [1, 0, 0, 1, 0, 0]
963
+ # Identity matrix in PDF space: [1, 0, 0, 1, 0, 0]
939
964
  # (a, b, c, d, e, f) → (x', y') = (a*x + c*y + e, b*x + d*y + f)
940
965
  def identity_matrix
941
966
  { a: 1.0, b: 0.0, c: 0.0, d: 1.0, e: 0.0, f: 0.0 }
942
967
  end
943
968
 
944
- # Compone due trasformazioni affini PDF: applica `child` PRIMA di `parent`
945
- # nello spazio PDF (notazione pdfminer.six "apply_matrix_norm").
946
- # Equivale a: result = parent * child (col-major).
969
+ # Composes two PDF affine transforms: applies `child` BEFORE `parent`
970
+ # in PDF space (pdfminer.six "apply_matrix_norm" notation).
971
+ # Equivalent to: result = parent * child (col-major).
947
972
  def compose_matrix(parent, child)
948
973
  {
949
974
  a: parent[:a] * child[:a] + parent[:c] * child[:b],
@@ -968,10 +993,10 @@ module Rpdfium
968
993
  e: mat[:e], f: mat[:f] }
969
994
  end
970
995
 
971
- # Itera oggetti di una page o di un Form XObject, applicando ricorsivamente
972
- # la matrice di trasformazione. `parent` = handle (FPDF_PAGE alla radice o
973
- # FPDF_PAGEOBJECT per i form xobjects). `page_object: true` se parent è un
974
- # form xobject.
996
+ # Iterates the objects of a page or of a Form XObject, recursively
997
+ # applying the transformation matrix. `parent` = handle (FPDF_PAGE at the
998
+ # root or FPDF_PAGEOBJECT for form xobjects). `page_object: true` if
999
+ # parent is a form xobject.
975
1000
  def collect_line_segments(parent, ctm, rotation_ctx, include_curves, out, page_object:)
976
1001
  n = if page_object
977
1002
  Raw.FPDFFormObj_CountObjects(parent)
@@ -992,7 +1017,7 @@ module Rpdfium
992
1017
  when Raw::PAGEOBJ_PATH
993
1018
  extract_path_segments(obj, ctm, rotation_ctx, include_curves, out)
994
1019
  when Raw::PAGEOBJ_FORM
995
- # Discendi nel form xobject componendo la sua matrice col CTM
1020
+ # Descend into the form xobject composing its matrix with the CTM
996
1021
  child_ctm = compose_matrix(ctm, read_object_matrix(obj))
997
1022
  collect_line_segments(obj, child_ctm, rotation_ctx, include_curves, out,
998
1023
  page_object: true)
@@ -1052,11 +1077,11 @@ module Rpdfium
1052
1077
  end
1053
1078
  end
1054
1079
 
1055
- # FPDFPageObj_GetIsActive: ritorna true se il page object è marcato
1056
- # attivo (visibile). Su PDF senza Optional Content, è always-true; su
1057
- # PDF con layer disabilitati, alcuni obj possono essere inactive.
1058
- # Fallback: se la binding non c'è o fallisce, consideriamo attivo
1059
- # (comportamento equivalente alla versione pre-0.3.6).
1080
+ # FPDFPageObj_GetIsActive: returns true if the page object is marked
1081
+ # active (visible). On PDFs without Optional Content it is always-true;
1082
+ # on PDFs with disabled layers, some objs may be inactive.
1083
+ # Fallback: if the binding is missing or fails, we consider it active
1084
+ # (behavior equivalent to the pre-0.3.6 version).
1060
1085
  def object_active?(obj)
1061
1086
  active_buf = FFI::MemoryPointer.new(:int)
1062
1087
  return true if Raw.FPDFPageObj_GetIsActive(obj, active_buf) == 0
@@ -1066,9 +1091,8 @@ module Rpdfium
1066
1091
  true
1067
1092
  end
1068
1093
 
1069
- # FPDFPageObj_GetDashCount: numero di elementi del dash array. 0 =
1070
- # linea continua, > 0 = linea tratteggiata (con N elementi
1071
- # alternati on/off).
1094
+ # FPDFPageObj_GetDashCount: number of elements in the dash array. 0 =
1095
+ # solid line, > 0 = dashed line (with N elements alternating on/off).
1072
1096
  def read_dash_count(obj)
1073
1097
  Raw.FPDFPageObj_GetDashCount(obj)
1074
1098
  rescue Rpdfium::LoadError
@@ -1077,7 +1101,7 @@ module Rpdfium
1077
1101
 
1078
1102
  public
1079
1103
 
1080
- # Linee orizzontali: dy ~ 0 entro tolleranza
1104
+ # Horizontal lines: dy ~ 0 within tolerance
1081
1105
  def horizontal_lines(tolerance: 0.5)
1082
1106
  line_segments.select { |s| (s[:y0] - s[:y1]).abs <= tolerance }
1083
1107
  .map { |s| { y: (s[:y0] + s[:y1]) / 2.0,
@@ -1086,7 +1110,7 @@ module Rpdfium
1086
1110
  stroke_width: s[:stroke_width] } }
1087
1111
  end
1088
1112
 
1089
- # Linee verticali: dx ~ 0 entro tolleranza
1113
+ # Vertical lines: dx ~ 0 within tolerance
1090
1114
  def vertical_lines(tolerance: 0.5)
1091
1115
  line_segments.select { |s| (s[:x0] - s[:x1]).abs <= tolerance }
1092
1116
  .map { |s| { x: (s[:x0] + s[:x1]) / 2.0,
@@ -1095,8 +1119,8 @@ module Rpdfium
1095
1119
  stroke_width: s[:stroke_width] } }
1096
1120
  end
1097
1121
 
1098
- # Compat con la prima versione: bbox dei path objects (utile per
1099
- # rectangles disegnati come bordi sottili).
1122
+ # Compat with the first version: bbox of the path objects (useful for
1123
+ # rectangles drawn as thin borders).
1100
1124
  def vector_rects
1101
1125
  n = Raw.FPDFPage_CountObjects(@state[:handle])
1102
1126
  h = height
@@ -1121,20 +1145,20 @@ module Rpdfium
1121
1145
 
1122
1146
  # ===== Marked Content (PDF tagged) =====
1123
1147
 
1124
- # Itera tutti i marked content del page (operatori BDC/BMC del content
1125
- # stream PDF) raggruppando i page object per il loro mcid (Marked
1126
- # Content ID). Utile per PDF "tagged" (PDF/UA, esport da Word/InDesign):
1127
- # un mcid ≥ 0 identifica un'unità semantica (paragrafo, span, figura),
1128
- # e tutti gli oggetti con lo stesso mcid appartengono allo stesso
1129
- # tag struttura.
1148
+ # Iterates all the marked content of the page (BDC/BMC operators of the
1149
+ # PDF content stream) grouping the page objects by their mcid (Marked
1150
+ # Content ID). Useful for "tagged" PDFs (PDF/UA, exports from
1151
+ # Word/InDesign): an mcid ≥ 0 identifies a semantic unit (paragraph,
1152
+ # span, figure), and all the objects with the same mcid belong to the
1153
+ # same structure tag.
1130
1154
  #
1131
- # Ritorna un Hash { mcid (Integer) => Array<page_object_handle> }.
1132
- # mcid -1 (i page object senza marked content) viene OMESSO.
1155
+ # Returns a Hash { mcid (Integer) => Array<page_object_handle> }.
1156
+ # mcid -1 (the page objects without marked content) is OMITTED.
1133
1157
  #
1134
- # Su PDF non tagged (es. la maggior parte dei PDF da gestionali
1135
- # italiani) l'Hash è vuoto. Su PDF tagged è la fonte di verità per
1136
- # raggruppare semanticamente char/parolepiù affidabile di qualsiasi
1137
- # euristica geometrica.
1158
+ # On non-tagged PDFs (e.g. most PDFs from Italian business software)
1159
+ # the Hash is empty. On tagged PDFs it is the source of truth for
1160
+ # semantically grouping chars/words — more reliable than any geometric
1161
+ # heuristic.
1138
1162
  def marked_content_regions
1139
1163
  out = Hash.new { |h, k| h[k] = [] }
1140
1164
  walk_page_objects do |obj, _ctm|
@@ -1144,9 +1168,9 @@ module Rpdfium
1144
1168
  out
1145
1169
  end
1146
1170
 
1147
- # Itera tutti i marks (BMC/BDC operators) con i loro nomi e parametri.
1148
- # Ritorna Array<Hash> con { obj_handle, mark_name, params }.
1149
- # Per PDF tagged, i mark_name comuni sono: "P" (paragraph),
1171
+ # Iterates all the marks (BMC/BDC operators) with their names and
1172
+ # parameters. Returns Array<Hash> with { obj_handle, mark_name, params }.
1173
+ # For tagged PDFs, the common mark_names are: "P" (paragraph),
1150
1174
  # "Span", "Artifact", "Figure", "TR" (table row), "TD" (table cell).
1151
1175
  def marked_content_inventory
1152
1176
  out = []
@@ -1168,15 +1192,15 @@ module Rpdfium
1168
1192
 
1169
1193
  # ===== Links (annotation links + positional hit-test) =====
1170
1194
 
1171
- # Hit-test: ritorna il link annotation che contiene il punto (x, y)
1172
- # in coordinate top-down della pagina. Restituisce un'istanza di
1173
- # Annotation o nil.
1195
+ # Hit-test: returns the link annotation that contains the point (x, y)
1196
+ # in the page's top-down coordinates. Returns an Annotation instance
1197
+ # or nil.
1174
1198
  #
1175
- # Più efficiente di iterare `links` quando si parte da una coordinata
1176
- # (es. mapping click sul rendering → URL del link). Pdfplumber non
1177
- # ha equivalente diretto.
1199
+ # More efficient than iterating `links` when starting from a coordinate
1200
+ # (e.g. mapping a click on the rendering → the link URL). pdfplumber has
1201
+ # no direct equivalent.
1178
1202
  def link_at(x, y)
1179
- # PDFium usa coord bottom-up; converto
1203
+ # PDFium uses bottom-up coords; convert
1180
1204
  pdf_y = height - y
1181
1205
  link_handle = Raw.FPDFLink_GetLinkAtPoint(@state[:handle],
1182
1206
  x.to_f, pdf_y.to_f)
@@ -1185,9 +1209,9 @@ module Rpdfium
1185
1209
  annot_handle = Raw.FPDFLink_GetAnnot(@state[:handle], link_handle)
1186
1210
  return nil if annot_handle.null?
1187
1211
 
1188
- # Annotation richiede un index nel page; non lo abbiamo direttamente
1189
- # qui. Iteriamo le annotation della pagina e troviamo quella col
1190
- # rect più vicino. Per la maggior parte dei PDF è O(piccolo).
1212
+ # Annotation requires an index in the page; we do not have it directly
1213
+ # here. We iterate the page's annotations and return the first link
1214
+ # annotation whose rect contains (x, y). For most PDFs this is O(small).
1191
1215
  annotations.find { |a| a.subtype == :link && annotation_contains?(a, x, y) }
1192
1216
  end
1193
1217
 
@@ -1206,19 +1230,19 @@ module Rpdfium
1206
1230
  out
1207
1231
  end
1208
1232
 
1209
- # ===== Annotazioni =====
1233
+ # ===== Annotations =====
1210
1234
 
1211
1235
  def annotations
1212
1236
  n = Raw.FPDFPage_GetAnnotCount(@state[:handle])
1213
1237
  Array.new(n) { |i| Annotation.new(self, i) }
1214
1238
  end
1215
1239
 
1216
- # Solo annotazioni link (cliccabili, esterne o interne)
1240
+ # Link annotations only (clickable, external or internal)
1217
1241
  def links
1218
1242
  annotations.select { |a| a.subtype == :link }
1219
1243
  end
1220
1244
 
1221
- # Solo widget di form
1245
+ # Form widgets only
1222
1246
  def form_fields
1223
1247
  return [] unless @document.has_forms?
1224
1248
 
@@ -1228,25 +1252,25 @@ module Rpdfium
1228
1252
 
1229
1253
  # ===== Struct Tree (PDF tagged) =====
1230
1254
 
1231
- # Struct tree della pagina (PDF/UA / Tagged PDF). Ritorna nil se la
1232
- # pagina non è tagged. Per PDF da Word/LibreOffice/InDesign export
1233
- # con accessibility tags attivati, espone la struttura logica
1234
- # (Document → P, H1, Table, TR, TH, TD, Figure, ecc.).
1255
+ # Struct tree of the page (PDF/UA / Tagged PDF). Returns nil if the
1256
+ # page is not tagged. For PDFs from Word/LibreOffice/InDesign exports
1257
+ # with accessibility tags enabled, it exposes the logical structure
1258
+ # (Document → P, H1, Table, TR, TH, TD, Figure, etc.).
1235
1259
  #
1236
- # Modalità d'uso:
1260
+ # Usage modes:
1237
1261
  #
1238
- # # Lifecycle automatico (RAII via finalizer):
1262
+ # # Automatic lifecycle (RAII via finalizer):
1239
1263
  # tree = page.struct_tree
1240
1264
  # tree&.walk { |el| puts el.type }
1241
1265
  #
1242
- # # Lifecycle deterministico (close al fine blocco):
1266
+ # # Deterministic lifecycle (close at end of block):
1243
1267
  # page.struct_tree do |tree|
1244
1268
  # tree.tables.each { |t| ... }
1245
1269
  # end
1246
1270
  #
1247
- # Su PDF non tagged ritorna nil. Su PDF "tagged ma vuoto" (es. CR
1248
- # Banca d'Italia, StructTreeRoot presente ma con element placeholder),
1249
- # ritorna un Tree con `Tree#empty? == true`.
1271
+ # On non-tagged PDFs it returns nil. On "tagged but empty" PDFs (e.g. CR
1272
+ # Banca d'Italia, StructTreeRoot present but with placeholder elements),
1273
+ # it returns a Tree with `Tree#empty? == true`.
1250
1274
  def struct_tree
1251
1275
  tree = Structure::Tree.for_page(self)
1252
1276
  if block_given?
@@ -1262,9 +1286,10 @@ module Rpdfium
1262
1286
 
1263
1287
  # ===== Rendering =====
1264
1288
 
1265
- # Render a bitmap. `output` può essere :rgba (default), :bgra, :gray.
1266
- # Ritorna [w, h, bytes] dove bytes è una stringa binaria.
1267
- # Se include_forms è true e il documento ha forms, sovrappone i widget.
1289
+ # Render to a bitmap. `output` can be :rgba (default), :bgra, :gray.
1290
+ # Returns [w, h, bytes, stride] where bytes is a binary string of stride * h bytes.
1291
+ # If include_forms is true and the document has forms, it overlays the
1292
+ # widgets.
1268
1293
  def render(scale: 2.0, rotate: 0, output: :rgba,
1269
1294
  include_annotations: false, include_forms: false,
1270
1295
  background: 0xFFFFFFFF)
@@ -1276,7 +1301,7 @@ module Rpdfium
1276
1301
  format = output == :gray ? Raw::FPDFBitmap_Gray : Raw::FPDFBitmap_BGRA
1277
1302
 
1278
1303
  bitmap = Raw.FPDFBitmap_CreateEx(w, h, format, FFI::Pointer::NULL, 0)
1279
- raise Error, "Bitmap allocation failed" if bitmap.null?
1304
+ raise Error, 'Bitmap allocation failed' if bitmap.null?
1280
1305
 
1281
1306
  begin
1282
1307
  Raw.FPDFBitmap_FillRect(bitmap, 0, 0, w, h, background)
@@ -1288,8 +1313,8 @@ module Rpdfium
1288
1313
  end
1289
1314
  stride = Raw.FPDFBitmap_GetStride(bitmap)
1290
1315
  buf = Raw.FPDFBitmap_GetBuffer(bitmap)
1291
- # Lo stride può eccedere w*bpp per padding di allineamento.
1292
- # In BGRA è quasi sempre w*4, ma rispettiamolo per sicurezza.
1316
+ # The stride may exceed w*bpp due to alignment padding.
1317
+ # In BGRA it is almost always w*4, but we respect it for safety.
1293
1318
  bytes = buf.read_bytes(stride * h)
1294
1319
  [w, h, bytes, stride]
1295
1320
  ensure
@@ -1297,7 +1322,7 @@ module Rpdfium
1297
1322
  end
1298
1323
  end
1299
1324
 
1300
- # Rendering diretto a PNG file. Usa Rpdfium::IO::PNG (puro Ruby, zero dep).
1325
+ # Direct rendering to a PNG file. Uses Rpdfium::IO::PNG (pure Ruby, zero deps).
1301
1326
  def render_to_png(path, **opts)
1302
1327
  w, h, bytes, stride = render(output: :rgba, **opts)
1303
1328
  Rpdfium::IO::PNG.write(path, w, h, bytes, stride: stride)
@@ -1328,7 +1353,7 @@ module Rpdfium
1328
1353
 
1329
1354
  private
1330
1355
 
1331
- # Match helper per il parametro `font:` di chars_where/lines.
1356
+ # Match helper for the `font:` parameter of chars_where/lines.
1332
1357
  def font_matches?(actual_font, pattern)
1333
1358
  return false if actual_font.nil?
1334
1359
 
@@ -1340,9 +1365,9 @@ module Rpdfium
1340
1365
  end
1341
1366
  end
1342
1367
 
1343
- # Match helper per parametri numerici (`height:`, `weight:`).
1344
- # Accetta singolo valore, Range, o Array<Numeric>. Per singolo valore
1345
- # numeric usa tolleranza 0.05 (utile per height in punti).
1368
+ # Match helper for numeric parameters (`height:`, `weight:`).
1369
+ # Accepts a single value, Range, or Array<Numeric>. For a single
1370
+ # numeric value it uses a 0.05 tolerance (useful for height in points).
1346
1371
  def range_matches?(actual, spec)
1347
1372
  return false if actual.nil?
1348
1373
 
@@ -1354,13 +1379,13 @@ module Rpdfium
1354
1379
  end
1355
1380
  end
1356
1381
 
1357
- # Converte un box PDFium {left, bottom, right, top} in coord bottom-up
1358
- # alla tuple top-down [x0, top, x1, bottom] usata dal resto della
1359
- # libreria. Ritorna nil se il box è nil (box assente sul PDF).
1360
- # Itera tutti i page object della pagina ricorsivamente (discendendo
1361
- # nei Form XObjects), passando al block ogni (obj, ctm_corrente).
1362
- # Stessa logica di walk di collect_line_segments ma astratta — utile
1363
- # per altre operazioni a livello di obj (marked content, etc).
1382
+ # Converts a PDFium box {left, bottom, right, top} in bottom-up coords
1383
+ # to the top-down tuple [x0, top, x1, bottom] used by the rest of the
1384
+ # library. Returns nil if the box is nil (box absent on the PDF).
1385
+ # Iterates all the page objects of the page recursively (descending
1386
+ # into Form XObjects), passing each (obj, current_ctm) to the block.
1387
+ # Same walk logic as collect_line_segments but abstracted — useful for
1388
+ # other obj-level operations (marked content, etc).
1364
1389
  def walk_page_objects(handle = @state[:handle], ctm = identity_matrix,
1365
1390
  is_form: false, &block)
1366
1391
  n = is_form ? Raw.FPDFFormObj_CountObjects(handle) : Raw.FPDFPage_CountObjects(handle)
@@ -1399,16 +1424,13 @@ module Rpdfium
1399
1424
  needed = out_len.read_ulong
1400
1425
  return nil if needed < 2
1401
1426
 
1402
- # Clamp: se needed eccede il buffer, leggo solo quanto allocato (e
1403
- # mi pace che la stringa sia troncata: il caso è patologico). Senza
1404
- # clamp → IndexError su mark name eccezionalmente lunghi.
1427
+ # Clamp: if needed exceeds the buffer, read only what was allocated
1428
+ # (and accept that the string is truncated: the case is pathological).
1429
+ # Without the clamp → IndexError on exceptionally long mark names.
1405
1430
  payload_bytes = [needed - 2, buf_bytes - 2].min
1406
1431
  return nil if payload_bytes <= 0
1407
1432
 
1408
- name_buf.read_bytes(payload_bytes)
1409
- .force_encoding("UTF-16LE")
1410
- .encode("UTF-8")
1411
- .delete("\u0000")
1433
+ decode_utf16le(name_buf.read_bytes(payload_bytes))
1412
1434
  end
1413
1435
 
1414
1436
  def read_mark_params(mark)
@@ -1418,7 +1440,7 @@ module Rpdfium
1418
1440
  key = read_mark_param_key(mark, pi)
1419
1441
  next if key.nil? || key.empty?
1420
1442
 
1421
- # Tipo del valore: 0=Null, 1=Int, 2=String, 3=Blob, 4=Dict (ignorato)
1443
+ # Value type: 0=Null, 1=Int, 2=String, 3=Blob, 4=Dict (ignored)
1422
1444
  type = Raw.FPDFPageObjMark_GetParamValueType(mark, key)
1423
1445
  params[key] = case type
1424
1446
  when 1 then read_mark_param_int(mark, key)
@@ -1442,10 +1464,7 @@ module Rpdfium
1442
1464
  payload_bytes = [needed - 2, buf_bytes - 2].min
1443
1465
  return nil if payload_bytes <= 0
1444
1466
 
1445
- key_buf.read_bytes(payload_bytes)
1446
- .force_encoding("UTF-16LE")
1447
- .encode("UTF-8")
1448
- .delete("\u0000")
1467
+ decode_utf16le(key_buf.read_bytes(payload_bytes))
1449
1468
  end
1450
1469
 
1451
1470
  def read_mark_param_int(mark, key)
@@ -1469,10 +1488,7 @@ module Rpdfium
1469
1488
  payload_bytes = [needed - 2, buf_bytes - 2].min
1470
1489
  return nil if payload_bytes <= 0
1471
1490
 
1472
- val_buf.read_bytes(payload_bytes)
1473
- .force_encoding("UTF-16LE")
1474
- .encode("UTF-8")
1475
- .delete("\u0000")
1491
+ decode_utf16le(val_buf.read_bytes(payload_bytes))
1476
1492
  end
1477
1493
 
1478
1494
  def annotation_contains?(annot, x, y)
@@ -1490,13 +1506,37 @@ module Rpdfium
1490
1506
  box[:right], page_h - box[:bottom]]
1491
1507
  end
1492
1508
 
1509
+ # Page dimensions after applying the PDF rotation: width and height are
1510
+ # swapped for 90°/270°. Shared by the char and line-segment pipelines.
1511
+ def rotated_dimensions(rot = rotation)
1512
+ case rot
1513
+ when 90, 270 then [height, width]
1514
+ else [width, height]
1515
+ end
1516
+ end
1517
+
1518
+ CODEPOINT_CHAR = 'U'.freeze
1519
+
1493
1520
  def safe_codepoint(cp)
1494
- return "" if cp.zero?
1495
- return "" if cp > 0x10FFFF || (0xD800..0xDFFF).cover?(cp)
1521
+ return '' if cp.zero?
1522
+ return '' if cp > 0x10FFFF || (0xD800..0xDFFF).cover?(cp)
1496
1523
 
1497
- [cp].pack("U")
1524
+ [cp].pack(CODEPOINT_CHAR)
1498
1525
  rescue RangeError, ArgumentError
1499
- ""
1526
+ ''
1527
+ end
1528
+
1529
+ # PDFium returns text as UTF-16LE byte buffers padded with NUL. This
1530
+ # is the single decode path used by every text getter (page text,
1531
+ # bounded text, text objects, marked-content names/params).
1532
+ # `replace: true` swaps invalid/undefined codepoints for U+FFFD
1533
+ # instead of raising — used for whole-page text where a single bad
1534
+ # glyph must not abort the extraction.
1535
+ def decode_utf16le(bytes, replace: false)
1536
+ opts = replace ? { invalid: :replace, undef: :replace } : {}
1537
+ bytes.force_encoding(Encoding::UTF_16LE.to_s)
1538
+ .encode(Encoding::UTF_8.to_s, **opts)
1539
+ .delete("\u0000")
1500
1540
  end
1501
1541
 
1502
1542
  def read_stroke_width(obj)
@@ -1506,10 +1546,10 @@ module Rpdfium
1506
1546
  buf.read_float
1507
1547
  end
1508
1548
 
1509
- # Costruisce un segmento dalla coppia di endpoint nello spazio raw
1510
- # PDFium (bottom-up, pre-rotazione). Applica la rotazione della pagina
1511
- # per restituire coord top-down nel sistema post-rotation, coerente
1512
- # con il sistema usato da `chars`.
1549
+ # Builds a segment from the pair of endpoints in the raw PDFium space
1550
+ # (bottom-up, pre-rotation). Applies the page rotation to return
1551
+ # top-down coords in the post-rotation system, consistent with the
1552
+ # system used by `chars`.
1513
1553
  def build_segment(x0, y0, x1, y1, rotation_ctx, stroke_width, dashed: false)
1514
1554
  r = rotation_ctx[:rotation]
1515
1555
  raw_w = rotation_ctx[:raw_w]
@@ -1526,24 +1566,19 @@ module Rpdfium
1526
1566
  }
1527
1567
  end
1528
1568
 
1529
- # Trasforma un singolo punto (x, y) dal sistema raw PDFium (bottom-up)
1530
- # al sistema top-down post-rotation della pagina.
1569
+ # Transforms a single point (x, y) from the raw PDFium system (bottom-up)
1570
+ # to the page's top-down post-rotation system.
1531
1571
  def apply_page_rotation_to_point(rotation, raw_w, raw_h, x, y)
1532
1572
  case rotation
1533
- when 0, nil
1534
- [x, raw_h - y] # bottom-up → top-down
1535
- when 90
1536
- [y, x] # 90° CW
1537
- when 180
1538
- [raw_w - x, y]
1539
- when 270
1540
- [raw_h - y, raw_w - x]
1541
- else
1542
- [x, raw_h - y]
1573
+ when 0, nil then [x, raw_h - y] # bottom-up → top-down
1574
+ when 90 then [y, x] # 90° CW
1575
+ when 180 then [raw_w - x, y]
1576
+ when 270 then [raw_h - y, raw_w - x]
1577
+ else [x, raw_h - y]
1543
1578
  end
1544
1579
  end
1545
1580
 
1546
- # Raggruppa elementi consecutivi se un blocco li considera equivalenti.
1581
+ # Groups consecutive elements if a block considers them equivalent.
1547
1582
  def group_consecutive(arr)
1548
1583
  groups = []
1549
1584
  current = []
@@ -1561,14 +1596,14 @@ module Rpdfium
1561
1596
 
1562
1597
  def word_from_chars(chars)
1563
1598
  {
1564
- text: chars.map { |c| c[:char] }.join,
1565
- x0: chars.first[:x0],
1566
- x1: chars.last[:x1],
1567
- top: chars.map { |c| c[:top] }.min,
1599
+ text: chars.map { |c| c[:char] }.join,
1600
+ x0: chars.first[:x0],
1601
+ x1: chars.last[:x1],
1602
+ top: chars.map { |c| c[:top] }.min,
1568
1603
  bottom: chars.map { |c| c[:bottom] }.max,
1569
1604
  fontsize: chars.first[:fontsize],
1570
- font: chars.first[:font],
1571
- chars: chars
1605
+ font: chars.first[:font],
1606
+ chars: chars
1572
1607
  }
1573
1608
  end
1574
1609
 
@@ -1583,11 +1618,11 @@ module Rpdfium
1583
1618
  end
1584
1619
  end
1585
1620
 
1586
- # Wrapper per FPDF_TEXTPAGE
1621
+ # Wrapper for FPDF_TEXTPAGE
1587
1622
  class TextPage
1588
1623
  def initialize(page)
1589
1624
  handle = Raw.FPDFText_LoadPage(page.handle)
1590
- raise PageError, "Could not load text page" if handle.null?
1625
+ raise PageError, 'Could not load text page' if handle.null?
1591
1626
 
1592
1627
  @state = { handle: handle, closed: false }
1593
1628
  ObjectSpace.define_finalizer(self, self.class.finalizer(@state))