RubyGems - rpdfium - Versions diffs - 0.4.1 → 0.4.3 - Mend

rpdfium 0.4.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

checksums.yaml +4 -4
data/CHANGELOG.md +615 -1317
data/README.md +73 -78
data/lib/rpdfium/annotation/annotation.rb +10 -8
data/lib/rpdfium/document.rb +49 -22
data/lib/rpdfium/errors.rb +2 -2
data/lib/rpdfium/form/form.rb +9 -9
data/lib/rpdfium/image/embedded.rb +17 -16
data/lib/rpdfium/io/png.rb +9 -9
data/lib/rpdfium/page.rb +561 -526
data/lib/rpdfium/raw.rb +216 -203
data/lib/rpdfium/search/search.rb +5 -5
data/lib/rpdfium/structure/attachment.rb +6 -6
data/lib/rpdfium/structure/element.rb +74 -74
data/lib/rpdfium/structure/outline.rb +2 -2
data/lib/rpdfium/structure/tree.rb +56 -55
data/lib/rpdfium/table/cells.rb +36 -33
data/lib/rpdfium/table/debugger.rb +12 -12
data/lib/rpdfium/table/edges.rb +51 -49
data/lib/rpdfium/table/extractor.rb +35 -34
data/lib/rpdfium/table/table.rb +65 -62
data/lib/rpdfium/util/cluster.rb +35 -33
data/lib/rpdfium/util/column_inference.rb +34 -32
data/lib/rpdfium/util/label_matcher.rb +30 -30
data/lib/rpdfium/util/text_extraction.rb +15 -15
data/lib/rpdfium/util/word_extractor.rb +49 -48
data/lib/rpdfium/util/word_merger.rb +25 -24
data/lib/rpdfium/version.rb +1 -1
data/lib/rpdfium.rb +17 -15
metadata +1 -1

data/lib/rpdfium/search/search.rb CHANGED

@@ -1,10 +1,10 @@
 # frozen_string_literal: true
 module Rpdfium
-  # Ricerca testuale interna alla pagina, basata su FPDFText_Find*.
-  # Mantiene lo stato (cursor) e supporta forward/backward.
+  # Text search within the page, based on FPDFText_Find*.
+  # Keeps the state (cursor) and supports forward/backward.
   #
-  # Esempio:
+  # Example:
   #   page.search("totale").each_match { |m| p m[:bbox], m[:text] }
   class Search
     include Enumerable
@@ -43,8 +43,8 @@ module Rpdfium
       @state[:handle]
     end
-    # Itera tutte le occorrenze in avanti. Ritorna hash con :char_index, :length,
-    # :text, :rects (array di bbox top-down: una per riga di testo).
+    # Iterates over all forward occurrences. Returns a hash with :char_index, :length,
+    # :text, :rects (an array of top-down bboxes: one per text line).
     def each_match
       return enum_for(:each_match) unless block_given?

data/lib/rpdfium/structure/attachment.rb CHANGED

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Rpdfium
-  # File embedded nel PDF (allegati). PDFium li espone via FPDFDoc_GetAttachment.
+  # Files embedded in the PDF (attachments). PDFium exposes them via FPDFDoc_GetAttachment.
   class Attachment
     attr_reader :document, :index, :handle
@@ -16,7 +16,7 @@ module Rpdfium
       Raw.read_utf16_string(:FPDFAttachment_GetName, @handle)
     end
-    # Ritorna i bytes del file allegato. Pattern probe-then-fetch.
+    # Returns the bytes of the attached file. Probe-then-fetch pattern.
     def bytes
       out_size = FFI::MemoryPointer.new(:ulong)
       Raw.FPDFAttachment_GetFile(@handle, FFI::Pointer::NULL, 0, out_size)
@@ -25,10 +25,10 @@ module Rpdfium
       buf = FFI::MemoryPointer.new(:uchar, n)
       Raw.FPDFAttachment_GetFile(@handle, buf, n, out_size)
-      # Leggo n byte (la dimensione del MIO buffer), non out_size.read_ulong:
-      # PDFium può aggiornare out_size con un valore diverso da n (es. dim
-      # totale necessaria) che leggerebbe oltre il buffer → IndexError.
-      # Se la write effettiva è < n, riempie il resto con NUL.
+      # Read n bytes (the size of OUR buffer), not out_size.read_ulong:
+      # PDFium may update out_size with a value different from n (e.g. the
+      # total size required), which would read past the buffer → IndexError.
+      # If the actual write is < n, the remainder is filled with NUL.
       buf.read_bytes(n)
     end

data/lib/rpdfium/structure/element.rb CHANGED

@@ -2,20 +2,20 @@
 module Rpdfium
   module Structure
-    # Element di un PDF tagged StructTree.
+    # Element of a tagged PDF StructTree.
     #
-    # Un Element rappresenta un nodo della struttura logica del documento:
-    # `Document`, `P` (paragrafo), `H1`..`H6` (headings), `Table`, `TR`,
-    # `TH`, `TD`, `Figure`, `Span`, `Lbl`, `LI`, `Caption`, ecc. Vedi
-    # PDF spec §14.8 per la tassonomia completa.
+    # An Element represents a node of the document's logical structure:
+    # `Document`, `P` (paragraph), `H1`..`H6` (headings), `Table`, `TR`,
+    # `TH`, `TD`, `Figure`, `Span`, `Lbl`, `LI`, `Caption`, etc. See
+    # PDF spec §14.8 for the complete taxonomy.
     #
-    # Gli element non hanno una vita autonoma: appartengono al Tree che li
-    # ha generati. Quando il Tree viene chiuso, gli element diventano
-    # invalidi. Non chiamare metodi su un element dopo `tree.close`.
+    # Elements have no independent lifetime: they belong to the Tree that
+    # produced them. When the Tree is closed, the elements become
+    # invalid. Do not call methods on an element after `tree.close`.
     #
-    # Tutti i metodi sono read-only: PDFium non espone API per modificare
-    # il StructTree (è una struttura "di sola lettura" anche nel suo C API
-    # pubblico).
+    # All methods are read-only: PDFium exposes no API to modify the
+    # StructTree (it is a "read-only" structure even in its public C
+    # API).
     class Element
       attr_reader :handle, :tree
@@ -24,75 +24,75 @@ module Rpdfium
         @handle = handle
       end
-      # Tipo strutturale dell'element (es. "P", "H1", "Table", "TR", "TD").
-      # Nil se PDFium non riesce a leggerlo (element placeholder).
+      # Structural type of the element (e.g. "P", "H1", "Table", "TR", "TD").
+      # Nil if PDFium cannot read it (placeholder element).
       def type
         read_utf16_string(:FPDF_StructElement_GetType)
       end
-      # Tipo dell'oggetto PDF sottostante: di solito "StructElem", ma può
-      # essere "MCR" (Marked Content Reference) o "OBJR" (Object Reference)
-      # per nodi specializzati. La maggior parte degli utenti usa `type`.
+      # Type of the underlying PDF object: usually "StructElem", but may
+      # be "MCR" (Marked Content Reference) or "OBJR" (Object Reference)
+      # for specialized nodes. Most users use `type`.
       def obj_type
         read_utf16_string(:FPDF_StructElement_GetObjType)
       end
-      # Title attribute (raro, usato in alcuni documenti per dare un nome
-      # parlante all'element, es. "Capitolo 1").
+      # Title attribute (rare, used in some documents to give the element
+      # a descriptive name, e.g. "Capitolo 1").
       def title
         read_utf16_string(:FPDF_StructElement_GetTitle)
       end
-      # ID univoco dell'element (se dichiarato nel /ID dictionary del
-      # StructTreeRoot). Permette riferimenti cross-element (es. Headers
-      # attribute di una cella TD che punta a un TH per id).
+      # Unique ID of the element (if declared in the /ID dictionary of
+      # the StructTreeRoot). Enables cross-element references (e.g. the
+      # Headers attribute of a TD cell pointing to a TH by id).
       def id
         read_utf16_string(:FPDF_StructElement_GetID)
       end
-      # Lingua dichiarata sull'element (es. "it-IT", "en-US"). Ereditata
-      # dal parent se non sovrascritta. Utile per pipeline language-aware.
+      # Language declared on the element (e.g. "it-IT", "en-US"). Inherited
+      # from the parent if not overridden. Useful for language-aware pipelines.
       def lang
         read_utf16_string(:FPDF_StructElement_GetLang)
       end
-      # ActualText: override del testo "logico" per l'element. Risolve
-      # legature (PDF mostra `ﬁ` ma actual_text dice "fi"), simboli math
-      # ("∫" → "integral"), abbreviazioni. Se presente, ha priorità sul
-      # testo grafico per accessibility e ricerca.
+      # ActualText: override of the "logical" text for the element. Resolves
+      # ligatures (the PDF shows `ﬁ` but actual_text says "fi"), math symbols
+      # ("∫" → "integral"), abbreviations. When present, it takes precedence
+      # over the graphical text for accessibility and search.
       def actual_text
         read_utf16_string(:FPDF_StructElement_GetActualText)
       end
-      # AltText: testo alternativo per Figure / Formula / immagini. PDF/UA
-      # richiede che ogni Figure abbia un alt_text non vuoto.
+      # AltText: alternative text for Figure / Formula / images. PDF/UA
+      # requires every Figure to have a non-empty alt_text.
       def alt_text
         read_utf16_string(:FPDF_StructElement_GetAltText)
       end
-      # Expansion text per abbreviazioni (es. element type "Span" con
-      # contenuto "Dr." e expansion "Doctor"). Usato per text-to-speech.
+      # Expansion text for abbreviations (e.g. an element of type "Span"
+      # with content "Dr." and expansion "Doctor"). Used for text-to-speech.
       def expansion
         read_utf16_string(:FPDF_StructElement_GetExpansion)
       end
-      # Marked Content IDs collegati a questo element. Un element ha tipicamente
-      # 1 MCID (es. una `<P>` ha tutto il testo del paragrafo dentro un BDC con
-      # mcid=N) oppure 0 (element strutturale puro: `<Document>`, `<Table>`,
-      # `<TR>` — i loro MCID stanno nei figli foglia).
+      # Marked Content IDs linked to this element. An element typically has
+      # 1 MCID (e.g. a `<P>` holds all the paragraph text inside a BDC with
+      # mcid=N) or 0 (a pure structural element: `<Document>`, `<Table>`,
+      # `<TR>` — their MCIDs reside in the leaf children).
       #
-      # Per collegare un MCID al testo della pagina: leggi i page object e
-      # raggruppa per `FPDFPageObj_GetMarkedContentID`. Vedi `Element#text`.
+      # To link an MCID to the page text: read the page objects and group
+      # by `FPDFPageObj_GetMarkedContentID`. See `Element#text`.
       def marked_content_ids
         first = Raw.FPDF_StructElement_GetMarkedContentID(@handle)
         count = Raw.FPDF_StructElement_GetMarkedContentIdCount(@handle)
-        # Casi: GetMarkedContentIdCount ritorna -1 quando non ci sono MCID
-        # diretti (element strutturale). GetMarkedContentID ritorna -1
-        # nello stesso caso.
+        # Cases: GetMarkedContentIdCount returns -1 when there are no direct
+        # MCIDs (structural element). GetMarkedContentID returns -1 in the
+        # same case.
         return [] if count <= 0 && first < 0
-        # Quando esiste un solo MCID, GetMarkedContentIdCount può ritornare
-        # 0 o -1 mentre GetMarkedContentID dà il valore. Coalescenza:
+        # When a single MCID exists, GetMarkedContentIdCount may return
+        # 0 or -1 while GetMarkedContentID provides the value. Coalesce:
         if count <= 0
           first >= 0 ? [first] : []
         else
@@ -103,8 +103,8 @@ module Rpdfium
         end
       end
-      # Figli diretti dell'element. Ordinati come dichiarati nel PDF
-      # (top-to-bottom, left-to-right per reading order).
+      # Direct children of the element. Ordered as declared in the PDF
+      # (top-to-bottom, left-to-right for reading order).
       def children
         n = Raw.FPDF_StructElement_CountChildren(@handle)
         return [] if n <= 0
@@ -115,7 +115,7 @@ module Rpdfium
         end
       end
-      # Parent. Nil per gli element root (figli diretti del StructTree).
+      # Parent. Nil for root elements (direct children of the StructTree).
       def parent
         h = Raw.FPDF_StructElement_GetParent(@handle)
         return nil if h.null?
@@ -123,9 +123,9 @@ module Rpdfium
         Element.new(@tree, h)
       end
-      # Walk depth-first dell'intero sub-tree a partire da questo element.
-      # Visita prima self, poi ricorsivamente i figli.
-      # Senza block ritorna un Enumerator.
+      # Depth-first walk of the entire sub-tree starting from this element.
+      # Visits self first, then recursively the children.
+      # Without a block returns an Enumerator.
       def walk(&block)
         return enum_for(:walk) unless block
@@ -133,26 +133,26 @@ module Rpdfium
         children.each { |c| c.walk(&block) }
       end
-      # Foglie del sub-tree (element senza figli). Sono i nodi che
-      # tipicamente hanno il MCID diretto.
+      # Leaves of the sub-tree (elements without children). These are the
+      # nodes that typically hold the direct MCID.
       def leaves
         return [self] if children.empty?
         children.flat_map(&:leaves)
       end
-      # Testo dell'element, ricostruito dalla pagina via MCID. Risoluzione:
-      # 1. Se `actual_text` è presente, lo usa (gestisce legature/abbreviazioni).
-      # 2. Altrimenti raccoglie tutti gli MCID del sub-tree (questo element
-      #    + ricorsivamente i figli) e concatena il testo dei page objects
-      #    con quei MCID, in document order.
+      # Text of the element, reconstructed from the page via MCID. Resolution:
+      # 1. If `actual_text` is present, use it (handles ligatures/abbreviations).
+      # 2. Otherwise collect all MCIDs of the sub-tree (this element
+      #    + recursively the children) and concatenate the text of the page
+      #    objects with those MCIDs, in document order.
       #
-      # Per element strutturali puri (`Table`, `TR`) il testo è la
-      # concatenazione di tutti i discendenti — utile come "summary".
+      # For pure structural elements (`Table`, `TR`) the text is the
+      # concatenation of all descendants — useful as a "summary".
       def text
         return actual_text if actual_text && !actual_text.empty?
-        # Raccoglie MCID di tutto il sub-tree depth-first
+        # Collect MCIDs of the entire sub-tree depth-first
         all_mcids = []
         walk { |el| all_mcids.concat(el.marked_content_ids) }
         return "" if all_mcids.empty?
@@ -161,11 +161,11 @@ module Rpdfium
         all_mcids.filter_map { |id| mcid_map[id] }.join
       end
-      # Attributi PDF strutturali. Ritorna un Hash { name => value } con
-      # tutti gli attributi dichiarati su questo element (RowSpan, ColSpan,
-      # Scope, Headers, BBox, ecc.). I valori sono Ruby-native: Integer,
-      # Float, String, true/false, o Array per attributi "Headers" che
-      # contengono liste di ID.
+      # Structural PDF attributes. Returns a Hash { name => value } with
+      # all attributes declared on this element (RowSpan, ColSpan,
+      # Scope, Headers, BBox, etc.). Values are Ruby-native: Integer,
+      # Float, String, true/false, or Array for "Headers" attributes that
+      # contain lists of IDs.
       def attributes
         result = {}
         attr_count = Raw.FPDF_StructElement_GetAttributeCount(@handle)
@@ -204,9 +204,9 @@ module Rpdfium
       private
-      # Helper UTF-16 string read con probe-then-fetch corretto. PDFium
-      # restituisce il numero di byte necessari (incluso null terminator),
-      # anche se il buffer è troppo piccolo.
+      # UTF-16 string read helper with proper probe-then-fetch. PDFium
+      # returns the number of bytes required (including the null
+      # terminator), even when the buffer is too small.
       def read_utf16_string(fn_name)
         needed = Raw.send(fn_name, @handle, FFI::Pointer::NULL, 0)
         return nil if needed < 2
@@ -215,7 +215,7 @@ module Rpdfium
         written = Raw.send(fn_name, @handle, buf, needed)
         return nil if written < 2
-        # Clamp: leggi al massimo il buffer allocato meno il null terminator.
+        # Clamp: read at most the allocated buffer minus the null terminator.
         payload = [written - 2, needed - 2].min
         return nil if payload <= 0
@@ -235,7 +235,7 @@ module Rpdfium
         n = len_buf.read_ulong
         return nil if n.zero?
-        # GetName ritorna ASCII (latin-1), non UTF-16
+        # GetName returns ASCII (latin-1), not UTF-16
         name_buf.read_bytes(n).force_encoding("UTF-8").delete("\u0000")
       end
@@ -244,7 +244,7 @@ module Rpdfium
         return nil if val_handle.null?
         type = Raw.FPDF_StructElement_Attr_GetType(val_handle)
-        # Type codes da fpdf_structtree.h:
+        # Type codes from fpdf_structtree.h:
         #   1 = Boolean, 2 = Number, 3 = String, 4 = Blob,
         #   5 = Name, 6 = Array, 7 = Dictionary
         case type
@@ -258,15 +258,15 @@ module Rpdfium
           read_attr_string_value(val_handle)
         when 4 # Blob (raw bytes)
           read_attr_blob_value(val_handle)
-        when 6 # Array → ricorsivamente raccolgo i figli
+        when 6 # Array → recursively collect the children
           n = Raw.FPDF_StructElement_Attr_CountChildren(val_handle)
           (0...n).filter_map do |i|
             child = Raw.FPDF_StructElement_Attr_GetChildAtIndex(val_handle, i)
             next nil if child.null?
-            # Per ogni child applico la stessa lettura via type. Ma non ho
-            # un "name" per accedere a Attr_GetValue su un child; il child
-            # È già una FPDF_STRUCTELEMENT_ATTR_VALUE. Leggi direttamente.
+            # For each child apply the same read via type. But there is no
+            # "name" to access Attr_GetValue on a child; the child is
+            # already an FPDF_STRUCTELEMENT_ATTR_VALUE. Read it directly.
             read_attr_value_handle(child)
           end
         else
@@ -294,7 +294,7 @@ module Rpdfium
       def read_attr_string_value(val_handle)
         len_buf = FFI::MemoryPointer.new(:ulong)
-        # Probe size
+        # Probe the size
         Raw.FPDF_StructElement_Attr_GetStringValue(val_handle,
                                                     FFI::Pointer::NULL, 0, len_buf)
         n = len_buf.read_ulong

data/lib/rpdfium/structure/outline.rb CHANGED

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Rpdfium
-  # Albero di bookmark (outline) del documento. Costruito ricorsivamente.
+  # Document bookmark (outline) tree. Built recursively.
   class Outline
     attr_reader :title, :page_index, :children
@@ -32,7 +32,7 @@ module Rpdfium
       result
     end
-    # Iteratore flat preorder: utile per generare un sommario lineare.
+    # Flat preorder iterator: useful for generating a linear table of contents.
     def self.flatten(outline_tree, depth = 0, &block)
       outline_tree.each do |item|
         block.call(item, depth)

data/lib/rpdfium/structure/tree.rb CHANGED

@@ -2,39 +2,40 @@
 module Rpdfium
   module Structure
-    # StructTree di una pagina PDF tagged.
+    # StructTree of a tagged PDF page.
     #
-    # Per PDF tagged (PDF/UA, esport accessibility-friendly da
-    # Word/LibreOffice/InDesign), espone la struttura logica del documento:
-    # Document → P, H1, Table, TR, TH, TD, Figure, ecc.
+    # For tagged PDFs (PDF/UA, accessibility-friendly exports from
+    # Word/LibreOffice/InDesign), it exposes the logical structure of the
+    # document: Document → P, H1, Table, TR, TH, TD, Figure, etc.
     #
-    # Per PDF NON tagged, `Page#struct_tree` ritorna nil. Per PDF "tagged
-    # ma vuoti" (es. CR Banca d'Italia, StructTreeRoot presente ma con
-    # element placeholder senza type/MCID), `Tree#empty?` ritorna true.
+    # For NON-tagged PDFs, `Page#struct_tree` returns nil. For "tagged but
+    # empty" PDFs (e.g. CR Banca d'Italia, StructTreeRoot present but with
+    # placeholder elements without type/MCID), `Tree#empty?` returns true.
     #
-    # Lifecycle: il Tree mantiene un handle PDFium che è "owning" — chiamare
-    # `FPDF_StructTree_Close` lo dealloca. PDFium dealloca automaticamente
-    # lo struct tree alla chiusura del documento, quindi in pratica:
+    # Lifecycle: the Tree holds a PDFium handle that is "owning" — calling
+    # `FPDF_StructTree_Close` deallocates it. PDFium automatically
+    # deallocates the struct tree when the document is closed, so in
+    # practice:
     #
-    #   - se non chiudi mai il tree esplicitamente, PDFium lo libera con
-    #     `FPDF_CloseDocument` (zero perdita persistente, ma il tree resta
-    #     in memoria fino alla chiusura del doc — può essere ~MB)
-    #   - per controllo deterministico (rilascia subito), usa il blocco:
+    #   - if you never close the tree explicitly, PDFium frees it with
+    #     `FPDF_CloseDocument` (zero persistent leak, but the tree stays
+    #     in memory until the doc is closed — it may be ~MB)
+    #   - for deterministic control (release immediately), use the block:
     #
     #       page.struct_tree do |tree|
     #         tree.walk { |el| ... }
     #       end
-    #     all'uscita dal blocco il tree viene chiuso, anche su eccezione.
+    #     on exit from the block the tree is closed, even on exception.
     #
-    # Per scelta progettuale NON usiamo `ObjectSpace.define_finalizer`: se
-    # il GC chiamasse `FPDF_StructTree_Close` dopo che il documento è già
-    # stato chiuso, si avrebbe un use-after-free → segfault. La chiusura
-    # via Document è sempre sicura; la chiusura via Tree.close (esplicita
-    # o tramite blocco) richiede che il documento sia ancora vivo.
+    # As a design choice we do NOT use `ObjectSpace.define_finalizer`: if
+    # the GC were to call `FPDF_StructTree_Close` after the document had
+    # already been closed, this would cause a use-after-free → segfault.
+    # Closing via Document is always safe; closing via Tree.close (explicit
+    # or through a block) requires the document to still be alive.
     class Tree
       attr_reader :handle, :page
-      # Ritorna nil se la pagina non è tagged. Altrimenti un Tree.
+      # Returns nil if the page is not tagged. Otherwise a Tree.
       def self.for_page(page)
         h = Raw.FPDF_StructTree_GetForPage(page.handle)
         return nil if h.null?
@@ -48,23 +49,23 @@ module Rpdfium
         @closed = false
         @mcid_text_cache = nil
-        # NOTA: niente finalizer. FPDF_StructTree_Close è "owning": chiama
-        # ~CPDF_StructTree() che libera l'oggetto. Se il documento PDF
-        # viene chiuso prima del tree, il finalizer GC chiamerebbe Close
-        # su memoria già liberata → segfault. Lifetime sicuro:
-        #   - close esplicito via `tree.close` o via blocco
+        # NOTE: no finalizer. FPDF_StructTree_Close is "owning": it calls
+        # ~CPDF_StructTree() which frees the object. If the PDF document
+        # is closed before the tree, the GC finalizer would call Close on
+        # already-freed memory → segfault. Safe lifetime:
+        #   - explicit close via `tree.close` or via the block
         #     `page.struct_tree { |tree| ... }`
-        #   - se nessuno chiude esplicitamente, PDFium libera il tree
-        #     insieme al documento al `FPDF_CloseDocument` (no leak
-        #     persistent, solo riserva memoria fino a chiusura doc)
+        #   - if nobody closes it explicitly, PDFium frees the tree
+        #     together with the document at `FPDF_CloseDocument` (no
+        #     persistent leak, only memory held until the doc is closed)
       end
       def closed?
         @closed
       end
-      # Chiusura esplicita (idempotente). Dopo close, non chiamare metodi
-      # su questo Tree né sugli Element che ha generato.
+      # Explicit close (idempotent). After close, do not call methods on
+      # this Tree nor on the Elements it generated.
       def close
         return if @closed
@@ -73,15 +74,15 @@ module Rpdfium
         @mcid_text_cache = nil
       end
-      # Numero di element root (figli diretti del StructTreeRoot per
-      # questa pagina). Tipicamente 1 (`<Document>`), ma può essere
-      # arbitrariamente alto su PDF strani (es. cu.pdf: 717 placeholder).
+      # Number of root elements (direct children of the StructTreeRoot for
+      # this page). Typically 1 (`<Document>`), but it can be arbitrarily
+      # high on odd PDFs (e.g. cu.pdf: 717 placeholders).
       def root_count
         n = Raw.FPDF_StructTree_CountChildren(@handle)
         [n, 0].max
       end
-      # Element root (figli diretti del StructTreeRoot). Tipicamente 1
+      # Root elements (direct children of the StructTreeRoot). Typically 1
       # (`<Document>`).
       def roots
         (0...root_count).filter_map do |i|
@@ -90,42 +91,42 @@ module Rpdfium
         end
       end
-      # True se il tree è strutturalmente vuoto (nessun element con type
-      # leggibile dai root). Caso comune per PDF "fintamente tagged" come
-      # CR Banca d'Italia: il StructTreeRoot esiste ma gli element sono
-      # placeholder vuoti.
+      # True if the tree is structurally empty (no element with a readable
+      # type among the roots). A common case for "fake-tagged" PDFs such as
+      # CR Banca d'Italia: the StructTreeRoot exists but the elements are
+      # empty placeholders.
       def empty?
         return true if root_count.zero?
         roots.none? { |r| r.type || r.children.any? }
       end
-      # Walk depth-first di TUTTI gli element del tree. Equivalente a
-      # `roots.flat_map(&:walk)`. Senza block ritorna Enumerator.
+      # Depth-first walk of ALL the elements of the tree. Equivalent to
+      # `roots.flat_map(&:walk)`. Without a block it returns an Enumerator.
       def walk(&block)
         return enum_for(:walk) unless block
         roots.each { |r| r.walk(&block) }
       end
-      # Trova tutti gli element del tipo specificato (es. "Table", "P",
-      # "Figure"). Confronto case-sensitive (i tipi PDF sono "Table",
-      # "P", "H1", ecc.).
+      # Finds all the elements of the specified type (e.g. "Table", "P",
+      # "Figure"). Case-sensitive comparison (PDF types are "Table",
+      # "P", "H1", etc.).
       def find_all(type:)
         walk.select { |el| el.type == type }
       end
-      # Restituisce tutti gli element di tipo "Table". Conveniente per
-      # estrazione tabelle semantica.
+      # Returns all the elements of type "Table". Convenient for semantic
+      # table extraction.
       def tables
         find_all(type: "Table")
       end
-      # Page objects raggruppati per Marked Content ID, per consentire a
-      # Element#text di risolvere il testo dei suoi MCID. La mappa è
-      # costruita una sola volta per Tree e cached.
+      # Page objects grouped by Marked Content ID, to allow Element#text
+      # to resolve the text of its MCIDs. The map is built only once per
+      # Tree and cached.
       #
-      # Pubblico ma destinato a uso interno; non parte dell'API stabile.
+      # Public but intended for internal use; not part of the stable API.
       def mcid_text_map
         @mcid_text_cache ||= build_mcid_text_map
       end
@@ -137,9 +138,9 @@ module Rpdfium
       private
-      # Itera tutti i page objects (incl. Form XObject) e raggruppa il loro
-      # testo per MCID. Il pattern probe-then-fetch su FPDFTextObj_GetText
-      # è già rodato (vedi Page#read_text_obj_text_fast).
+      # Iterates all the page objects (incl. Form XObject) and groups their
+      # text by MCID. The probe-then-fetch pattern on FPDFTextObj_GetText
+      # is well-established (see Page#read_text_obj_text_fast).
       def build_mcid_text_map
         map = Hash.new { |h, k| h[k] = +"" }
         tp = @page.text_page
@@ -170,8 +171,8 @@ module Rpdfium
       end
       def read_text_obj_text(obj, tp, buf)
-        # Probe con buffer 1024 byte (sufficiente per il 99% dei marked
-        # content runs, che tipicamente sono parole singole o frasi brevi).
+        # Probe with a 1024-byte buffer (sufficient for 99% of marked
+        # content runs, which are typically single words or short phrases).
         needed = Raw.FPDFTextObj_GetText(obj, tp.handle, buf, 1024)
         return nil if needed < 2