RubyGems - rpdfium - Versions diffs - 0.4.1 → 0.4.3 - Mend

rpdfium 0.4.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

checksums.yaml +4 -4
data/CHANGELOG.md +615 -1317
data/README.md +73 -78
data/lib/rpdfium/annotation/annotation.rb +10 -8
data/lib/rpdfium/document.rb +49 -22
data/lib/rpdfium/errors.rb +2 -2
data/lib/rpdfium/form/form.rb +9 -9
data/lib/rpdfium/image/embedded.rb +17 -16
data/lib/rpdfium/io/png.rb +9 -9
data/lib/rpdfium/page.rb +561 -526
data/lib/rpdfium/raw.rb +216 -203
data/lib/rpdfium/search/search.rb +5 -5
data/lib/rpdfium/structure/attachment.rb +6 -6
data/lib/rpdfium/structure/element.rb +74 -74
data/lib/rpdfium/structure/outline.rb +2 -2
data/lib/rpdfium/structure/tree.rb +56 -55
data/lib/rpdfium/table/cells.rb +36 -33
data/lib/rpdfium/table/debugger.rb +12 -12
data/lib/rpdfium/table/edges.rb +51 -49
data/lib/rpdfium/table/extractor.rb +35 -34
data/lib/rpdfium/table/table.rb +65 -62
data/lib/rpdfium/util/cluster.rb +35 -33
data/lib/rpdfium/util/column_inference.rb +34 -32
data/lib/rpdfium/util/label_matcher.rb +30 -30
data/lib/rpdfium/util/text_extraction.rb +15 -15
data/lib/rpdfium/util/word_extractor.rb +49 -48
data/lib/rpdfium/util/word_merger.rb +25 -24
data/lib/rpdfium/version.rb +1 -1
data/lib/rpdfium.rb +17 -15
metadata +1 -1

data/lib/rpdfium/util/word_extractor.rb CHANGED Viewed

@@ -2,28 +2,29 @@
 module Rpdfium
   module Util
-    # Estrae "words" da una lista di char, fedelmente a pdfplumber.WordExtractor.
+    # Extracts "words" from a list of chars, faithfully to
+    # pdfplumber.WordExtractor.
     #
-    # Algoritmo:
-    #   1. Ordina i char per (top, x0): righe top-to-bottom, char left-to-right
-    #      dentro ogni riga.
-    #   2. Cluster per top con `y_tolerance` → "righe logiche" di char.
-    #   3. Dentro ogni riga, cluster per gap orizzontale: due char sono nella
-    #      stessa word se `next.x0 - prev.x1 <= x_tolerance`. Anche un char
-    #      whitespace separa la word (a meno che `keep_blank_chars`).
-    #   4. Per ogni cluster di char emette una word: text concatenato, bbox.
+    # Algorithm:
+    #   1. Sort the chars by (top, x0): rows top-to-bottom, chars
+    #      left-to-right within each row.
+    #   2. Cluster by top with `y_tolerance` → "logical rows" of chars.
+    #   3. Within each row, cluster by horizontal gap: two chars belong to
+    #      the same word if `next.x0 - prev.x1 <= x_tolerance`. A whitespace
+    #      char also separates the word (unless `keep_blank_chars`).
+    #   4. For each cluster of chars, emit a word: concatenated text, bbox.
     #
-    # Differenze da pdfplumber (semplificazioni accettabili per il nostro uso):
-    #   - Non gestiamo `line_dir`/`char_dir` rotated (testo ruotato non
-    #     orizzontale ltr): non rilevante per i casi d'uso correnti.
-    #   - Non gestiamo `use_text_flow` (ordering basato sul content stream):
-    #     i nostri char arrivano già da PDFium nell'ordine geometrico via
-    #     `chars` (top, x0).
-    #   - Non gestiamo `expand_ligatures`: PDFium di solito espande i
-    #     codepoint correttamente già a livello char.
+    # Differences from pdfplumber (simplifications acceptable for our use):
+    #   - We do not handle rotated `line_dir`/`char_dir` (text rotated away
+    #     from horizontal ltr): not relevant for current use cases.
+    #   - We do not handle `use_text_flow` (ordering based on the content
+    #     stream): our chars already arrive from PDFium in geometric order
+    #     via `chars` (top, x0).
+    #   - We do not handle `expand_ligatures`: PDFium usually expands the
+    #     codepoints correctly already at the char level.
     #
-    # Queste differenze sono documentate; se mai necessarie si aggiungono
-    # come feature toggles senza cambiare il path di default.
+    # These differences are documented; if ever needed they can be added
+    # as feature toggles without changing the default path.
     class WordExtractor
       DEFAULT_X_TOLERANCE = 3.0
       DEFAULT_Y_TOLERANCE = 3.0
@@ -40,13 +41,13 @@ module Rpdfium
         @extra_attrs = extra_attrs || []
       end
-      # Restituisce un Array di Hash: { text:, x0:, x1:, top:, bottom:, chars: }.
-      # Se `extra_attrs` è non vuoto, ogni word splitta anche al cambio di
-      # questi attributi (es. fontname/size diversi → word diverse).
+      # Returns an Array of Hash: { text:, x0:, x1:, top:, bottom:, chars: }.
+      # If `extra_attrs` is non-empty, each word also splits when these
+      # attributes change (e.g. different fontname/size → different words).
       def extract_words(chars)
         return [] if chars.empty?
-        # Fast path: 1 solo char → 1 word triviale (se non whitespace).
+        # Fast path: a single char → 1 trivial word (if not whitespace).
         if chars.size == 1
           c = chars.first
           return [] if blank?(c) && !@keep_blank_chars
@@ -54,35 +55,35 @@ module Rpdfium
           return [build_word([c])]
         end
-        # 1. Ordina per (top, x0). Top-down, left-to-right.
+        # 1. Sort by (top, x0). Top-down, left-to-right.
         sorted = chars.sort_by { |c| [c[:top], c[:x0]] }
-        # 2. Cluster in righe per `top`.
-        # `presorted: true`: sorted è già ordinato per [top, x0], quindi
-        # implicitamente anche per top — cluster_objects salta il proprio
-        # sort interno.
+        # 2. Cluster into rows by `top`.
+        # `presorted: true`: sorted is already ordered by [top, x0], hence
+        # implicitly also by top — cluster_objects skips its own internal
+        # sort.
         rows = Cluster.cluster_objects(sorted, :top,
                                         tolerance: @y_tolerance,
                                         presorted: true)
         words = []
         rows.each do |row|
-          # Re-sort per x0 dentro ogni riga clusterizzata.
+          # Re-sort by x0 within each clustered row.
           #
-          # NOTA: in linea di principio l'input `sorted` è già ordinato per
-          # [top, x0], quindi i cluster di top dovrebbero essere già in
-          # ordine x0. MA il sort globale `[top, x0]` rispetta strettamente
-          # l'ordine per top — se due char della stessa riga visiva hanno
-          # top diversi entro tolerance (es. la "i" minuscola spesso ha
-          # top più alto di 0.008pt rispetto alle altre lettere a causa di
-          # come PDFium calcola la bbox), il sort globale li interfoglia.
-          # Il cluster_objects per :top non riordina internamente i char,
-          # quindi un char con top leggermente minore finisce DAVANTI a
-          # tutte le altre lettere della parola.
+          # NOTE: in principle the input `sorted` is already ordered by
+          # [top, x0], so the top clusters should already be in x0 order.
+          # BUT the global sort `[top, x0]` strictly respects the order by
+          # top — if two chars of the same visual row have different tops
+          # within tolerance (e.g. the lowercase "i" often has a top higher
+          # by 0.008pt than the other letters because of how PDFium computes
+          # the bbox), the global sort interleaves them. cluster_objects by
+          # :top does not internally reorder the chars, so a char with a
+          # slightly lower top ends up AHEAD of all the other letters of the
+          # word.
           #
-          # Esempio reale: "Categoria" dove "i" ha top=414.9789 e le altre
-          # 414.9869 → output `iCategora` invece di `Categoria`.
-          # Il fix è semplicemente ri-sortare per x0 dentro la riga.
+          # Real example: "Categoria" where "i" has top=414.9789 and the
+          # others 414.9869 → output `iCategora` instead of `Categoria`.
+          # The fix is simply to re-sort by x0 within the row.
           row_sorted = row.sort_by { |c| c[:x0] }
           word_chars = []
@@ -91,8 +92,8 @@ module Rpdfium
               words << build_word(word_chars) unless word_chars.empty?
               word_chars = []
             end
-            # Whitespace: per default lo usiamo come separatore (lo scartiamo).
-            # Con keep_blank_chars=true lo includiamo nella word corrente.
+            # Whitespace: by default we use it as a separator (we discard it).
+            # With keep_blank_chars=true we include it in the current word.
             if blank?(c) && !@keep_blank_chars
               words << build_word(word_chars) unless word_chars.empty?
               word_chars = []
@@ -111,15 +112,15 @@ module Rpdfium
       def char_begins_new_word?(prev, curr)
         return false if prev.nil?
-        # Gap orizzontale (PDF font hinting può dare overlap leggero, max 0)
+        # Horizontal gap (PDF font hinting may give a slight overlap, max 0)
         gap = curr[:x0] - prev[:x1]
         return true if gap > @x_tolerance
-        # Cambio di riga (può succedere se y_tolerance è grande ma due
-        # char sono comunque su righe diverse)
+        # Row change (can happen if y_tolerance is large but two chars are
+        # nonetheless on different rows)
         return true if (curr[:top] - prev[:top]).abs > @y_tolerance
-        # Cambio di un extra_attr richiesto
+        # Change of a required extra_attr
         @extra_attrs.any? { |attr| prev[attr] != curr[attr] }
       end

data/lib/rpdfium/util/word_merger.rb CHANGED Viewed

@@ -2,30 +2,30 @@
 module Rpdfium
   module Util
-    # Fonde word adiacenti sulla stessa riga in un'unica word con bbox
-    # aggregata e text concatenato.
+    # Merges adjacent words on the same row into a single word with an
+    # aggregated bbox and concatenated text.
     #
-    # Tre strategie disponibili come metodi separati:
+    # Three strategies are available as separate methods:
     #
-    # - `merge_by_proximity` — fonde tutte le word adiacenti che soddisfano
-    #   il criterio di vicinanza. Strategia base.
+    # - `merge_by_proximity` — merges all adjacent words that satisfy the
+    #   proximity criterion. Base strategy.
     #
-    # - `merge_by_label` — fonde solo word che condividono la stessa "label"
-    #   (chiave esterna calcolata dal chiamante). Utile per preservare la
-    #   semantica quando label diverse cadono sulla stessa riga (es. flag
-    #   in colonne adiacenti).
+    # - `merge_by_label` — merges only words that share the same "label"
+    #   (external key computed by the caller). Useful for preserving
+    #   semantics when different labels fall on the same row (e.g. flags
+    #   in adjacent columns).
     #
-    # - `merge_unlabeled` — fonde solo word "orfane" (label nil) lasciando
-    #   intatte quelle con label. Inverso di merge_by_label.
+    # - `merge_unlabeled` — merges only "orphan" words (label nil), leaving
+    #   labeled ones intact. Inverse of merge_by_label.
     #
-    # Tutte ritornano una nuova lista di word, con quelle fuse rappresentate
-    # come hash `{ text:, x0:, x1:, top:, bottom: }`.
+    # All return a new list of words, with merged ones represented as the
+    # hash `{ text:, x0:, x1:, top:, bottom: }`.
     #
-    # @example merge per proximity
+    # @example merge by proximity
     #   merger = Rpdfium::Util::WordMerger.new(x_gap: 20.0, y_tol: 3.0)
     #   merged = merger.merge_by_proximity(words)
     #
-    # @example merge per label, con label fornita dal chiamante
+    # @example merge by label, with the label provided by the caller
     #   labels_by_word = words.each_with_object({}) { |w, h| h[w] = compute_label(w) }
     #   merged = merger.merge_by_label(words, labels_by_word)
     class WordMerger
@@ -37,21 +37,22 @@ module Rpdfium
         @y_tol = y_tol
       end
-      # Fonde tutte le word adiacenti (stessa riga + gap orizzontale ≤ x_gap).
+      # Merges all adjacent words (same row + horizontal gap ≤ x_gap).
       def merge_by_proximity(words)
         merge_groups(words) { |a, b| true }
       end
-      # Fonde solo word con la stessa label.
-      # @param labels_by_word [Hash] mapping word → label (qualunque tipo).
-      #   Word con stessa label vengono fuse, word con label diverse no.
+      # Merges only words with the same label.
+      # @param labels_by_word [Hash] mapping word → label (any type).
+      #   Words with the same label are merged; words with different
+      #   labels are not.
       def merge_by_label(words, labels_by_word)
         merge_groups(words) do |a, b|
           labels_by_word[a] == labels_by_word[b]
         end
       end
-      # Fonde solo word con label nil (orfane).
+      # Merges only words with a nil label (orphans).
       def merge_unlabeled(words, labels_by_word)
         merge_groups(words) do |a, b|
           labels_by_word[a].nil? && labels_by_word[b].nil?
@@ -60,10 +61,10 @@ module Rpdfium
       private
-      # Algoritmo generico di merging: scorre i word ordinati per (top, x0)
-      # e li raggruppa quando soddisfano sia il criterio geometrico
-      # (stessa riga e gap orizzontale stretto) che il predicato `yield`
-      # fornito dal chiamante.
+      # Generic merging algorithm: iterates over the words sorted by
+      # (top, x0) and groups them when they satisfy both the geometric
+      # criterion (same row and narrow horizontal gap) and the `yield`
+      # predicate provided by the caller.
       def merge_groups(words)
         return [] if words.empty?

data/lib/rpdfium/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Rpdfium
-  VERSION = "0.4.1"
+  VERSION = "0.4.3"
 end

data/lib/rpdfium.rb CHANGED Viewed

@@ -3,9 +3,9 @@
 require_relative "rpdfium/version"
 require_relative "rpdfium/errors"
-# Carica la gemma companion rpdfium-binary se presente: deve avvenire PRIMA
-# di raw.rb, che chiama ffi_lib al momento del require e interroga
-# Rpdfium::Binary.library_path per trovare il path assoluto al .so/.dylib.
+# Loads the companion gem rpdfium-binary if present: this must happen BEFORE
+# raw.rb, which calls ffi_lib at require time and queries
+# Rpdfium::Binary.library_path to find the absolute path to the .so/.dylib.
 begin
   require "rpdfium/binary"
 rescue LoadError
@@ -54,22 +54,24 @@ module Rpdfium
     Document.open(input, password: password, &block)
   end
-  # Estrai tutto il testo di tutte le pagine, una stringa per pagina.
+  # Extract all the text of all pages, one string per page.
   def self.extract_text(input, password: nil)
-    open(input, password: password) { |doc| doc.map(&:text) }
+    open(input, password: password) do |doc|
+      doc.each_page_streaming.map(&:text)
+    end
   end
-  # Estrai tutte le tabelle di tutte le pagine.
-  # Ritorna Array<{ page: Integer, rows: Array<Array<String>> }>.
+  # Extract all the tables of all pages.
+  # Returns Array<{ page: Integer, rows: Array<Array<String>> }>.
   #
-  # `keep_blank_rows: false` (default) elimina le righe completamente vuote
-  # che la strategia `:text` di words_to_edges_h genera per costruzione (ogni
-  # riga visiva produce due edges, top + bottom, e tra coppie di edges
-  # adiacenti si formano "righe spurie" di altezza pari al gap interlinea).
-  # Con `keep_blank_rows: true` ottieni l'output grezzo di Table#extract.
+  # `keep_blank_rows: false` (default) removes the completely empty rows
+  # that the `:text` strategy of words_to_edges_h generates by construction (each
+  # visual row produces two edges, top + bottom, and between pairs of adjacent
+  # edges "spurious rows" form, with a height equal to the line gap).
+  # With `keep_blank_rows: true` you get the raw output of Table#extract.
   def self.extract_tables(input, password: nil, keep_blank_rows: false, **opts)
     open(input, password: password) do |doc|
-      doc.flat_map do |page|
+      doc.each_page_streaming.flat_map do |page|
         Table::Extractor.new(page, **opts).extract.map do |rows|
           rows = rows.reject { |r| r.all? { |c| c.nil? || c.empty? } } unless keep_blank_rows
           { page: page.index, rows: rows }
@@ -78,11 +80,11 @@ module Rpdfium
     end
   end
-  # Renderizza ogni pagina in un PNG dentro output_dir.
+  # Render each page to a PNG inside output_dir.
   def self.render_to_pngs(input, output_dir:, scale: 2.0, password: nil)
     Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
     open(input, password: password) do |doc|
-      doc.map do |page|
+      doc.each_page_streaming.map do |page|
         path = File.join(output_dir, format("page_%04d.png", page.index + 1))
         page.render_to_png(path, scale: scale)
         path

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rpdfium
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.4.3
 platform: ruby
 authors:
 - Roberto Scinocca