rpdfium 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +601 -1317
- data/README.md +73 -78
- data/lib/rpdfium/annotation/annotation.rb +10 -8
- data/lib/rpdfium/document.rb +49 -22
- data/lib/rpdfium/errors.rb +2 -2
- data/lib/rpdfium/form/form.rb +9 -9
- data/lib/rpdfium/image/embedded.rb +17 -16
- data/lib/rpdfium/io/png.rb +9 -9
- data/lib/rpdfium/page.rb +562 -527
- data/lib/rpdfium/raw.rb +216 -203
- data/lib/rpdfium/search/search.rb +5 -5
- data/lib/rpdfium/structure/attachment.rb +6 -6
- data/lib/rpdfium/structure/element.rb +74 -74
- data/lib/rpdfium/structure/outline.rb +2 -2
- data/lib/rpdfium/structure/tree.rb +56 -55
- data/lib/rpdfium/table/cells.rb +36 -33
- data/lib/rpdfium/table/debugger.rb +12 -12
- data/lib/rpdfium/table/edges.rb +51 -49
- data/lib/rpdfium/table/extractor.rb +35 -34
- data/lib/rpdfium/table/table.rb +65 -62
- data/lib/rpdfium/util/cluster.rb +35 -33
- data/lib/rpdfium/util/column_inference.rb +34 -32
- data/lib/rpdfium/util/label_matcher.rb +30 -30
- data/lib/rpdfium/util/text_extraction.rb +15 -15
- data/lib/rpdfium/util/word_extractor.rb +49 -48
- data/lib/rpdfium/util/word_merger.rb +25 -24
- data/lib/rpdfium/version.rb +1 -1
- data/lib/rpdfium.rb +17 -15
- metadata +1 -1
data/lib/rpdfium/raw.rb
CHANGED
|
@@ -4,27 +4,27 @@ require "ffi"
|
|
|
4
4
|
require "rbconfig"
|
|
5
5
|
|
|
6
6
|
module Rpdfium
|
|
7
|
-
# Layer 1:
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
# in
|
|
7
|
+
# Layer 1: raw FFI bindings to the PDFium C API.
|
|
8
|
+
# 1:1 mapping with the original names. Use the wrapper classes for
|
|
9
|
+
# application code. PDFium "Experimental" APIs are marked in the comments:
|
|
10
|
+
# in theory they could change, in practice they have been stable for years.
|
|
11
11
|
module Raw
|
|
12
12
|
extend FFI::Library
|
|
13
13
|
|
|
14
|
-
#
|
|
14
|
+
# Builds the list of candidates that `ffi_lib` will try in order.
|
|
15
15
|
#
|
|
16
|
-
#
|
|
17
|
-
# (.dylib
|
|
18
|
-
#
|
|
19
|
-
# `libpdfium.so`
|
|
20
|
-
#
|
|
21
|
-
# OS
|
|
16
|
+
# WARNING: FFI auto-appends the platform's "natural" extension
|
|
17
|
+
# (.dylib on macOS, .so on Linux, .dll on Windows) when the supplied path
|
|
18
|
+
# does not already end with a known extension. Therefore, if we pass
|
|
19
|
+
# `libpdfium.so` on macOS, FFI looks for `libpdfium.so.dylib` — absurd but
|
|
20
|
+
# documented. To avoid this, we filter the system_library_names by
|
|
21
|
+
# host OS.
|
|
22
22
|
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
# (
|
|
23
|
+
# Additionally: ENV["PDFIUM_LIBRARY_PATH"] and Rpdfium::Binary.library_path
|
|
24
|
+
# are ABSOLUTE/EXPLICIT paths: if they are not found, we do NOT fall back
|
|
25
|
+
# to system names. We immediately return an array of a single path: in
|
|
26
|
+
# that case ffi_lib either succeeds right away, or raises a clear LoadError
|
|
27
|
+
# (which is what the user wants — they provided an explicit path).
|
|
28
28
|
def self.candidate_paths
|
|
29
29
|
explicit = ENV["PDFIUM_LIBRARY_PATH"]
|
|
30
30
|
return [explicit] if explicit && !explicit.empty?
|
|
@@ -37,10 +37,10 @@ module Rpdfium
|
|
|
37
37
|
system_library_names
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
#
|
|
41
|
-
# `libpdfium` (
|
|
42
|
-
#
|
|
43
|
-
#
|
|
40
|
+
# "System" names filtered by host OS. We keep `pdfium` /
|
|
41
|
+
# `libpdfium` (without extension) first: FFI auto-appends the right ext.
|
|
42
|
+
# Names with an extension are included ONLY if they match the host OS, so
|
|
43
|
+
# we avoid the double-extension bug.
|
|
44
44
|
def self.system_library_names
|
|
45
45
|
base = %w[pdfium libpdfium]
|
|
46
46
|
host = host_os
|
|
@@ -69,21 +69,21 @@ module Rpdfium
|
|
|
69
69
|
|
|
70
70
|
begin
|
|
71
71
|
ffi_lib(*candidate_paths)
|
|
72
|
-
ffi_convention :default # cdecl
|
|
72
|
+
ffi_convention :default # cdecl everywhere, even on Win64 (bblanchon build)
|
|
73
73
|
@native_loaded = true
|
|
74
74
|
rescue ::LoadError, ::RuntimeError => e
|
|
75
|
-
#
|
|
76
|
-
#
|
|
77
|
-
#
|
|
78
|
-
#
|
|
75
|
+
# We fall back to "stub" mode: the attach_function calls generate stubs
|
|
76
|
+
# that raise Rpdfium::LoadError on first invocation. This allows the gem
|
|
77
|
+
# to be loaded in order to use the pure-Ruby modules (Edges, Cells, PNG)
|
|
78
|
+
# without having PDFium installed.
|
|
79
79
|
@load_error = e
|
|
80
|
-
ffi_lib_flags :now # no-op
|
|
80
|
+
ffi_lib_flags :now # no-op without ffi_lib, but documents intent
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
-
#
|
|
84
|
-
#
|
|
85
|
-
#
|
|
86
|
-
#
|
|
83
|
+
# Tolerant attach_function wrapper: if the binding fails (library
|
|
84
|
+
# not loaded, symbol not present in this version of PDFium),
|
|
85
|
+
# it still generates a method that raises a clear error at the call site,
|
|
86
|
+
# instead of blowing up the `require`.
|
|
87
87
|
def self.attach_function(name, *args)
|
|
88
88
|
super
|
|
89
89
|
rescue FFI::NotFoundError, RuntimeError => e
|
|
@@ -94,8 +94,8 @@ module Rpdfium
|
|
|
94
94
|
end
|
|
95
95
|
|
|
96
96
|
unless @native_loaded
|
|
97
|
-
# Override
|
|
98
|
-
#
|
|
97
|
+
# Override of attach_function when the library failed to load:
|
|
98
|
+
# do not call super (which would blow up), generate the stub directly.
|
|
99
99
|
def self.attach_function(name, *_args)
|
|
100
100
|
err = @load_error
|
|
101
101
|
define_singleton_method(name) do |*_a|
|
|
@@ -110,7 +110,7 @@ module Rpdfium
|
|
|
110
110
|
end
|
|
111
111
|
|
|
112
112
|
# =========================================================================
|
|
113
|
-
#
|
|
113
|
+
# Opaque types
|
|
114
114
|
# =========================================================================
|
|
115
115
|
typedef :pointer, :FPDF_DOCUMENT
|
|
116
116
|
typedef :pointer, :FPDF_PAGE
|
|
@@ -135,7 +135,7 @@ module Rpdfium
|
|
|
135
135
|
typedef :ushort, :FPDF_WCHAR
|
|
136
136
|
|
|
137
137
|
# =========================================================================
|
|
138
|
-
#
|
|
138
|
+
# C structures
|
|
139
139
|
# =========================================================================
|
|
140
140
|
class FS_RECTF < FFI::Struct
|
|
141
141
|
layout :left, :float,
|
|
@@ -145,7 +145,7 @@ module Rpdfium
|
|
|
145
145
|
end
|
|
146
146
|
|
|
147
147
|
class FS_MATRIX < FFI::Struct
|
|
148
|
-
# PDF matrix: [a b 0; c d 0; e f 1] (row-major in PDF; FFI
|
|
148
|
+
# PDF matrix: [a b 0; c d 0; e f 1] (row-major in PDF; FFI follows field order)
|
|
149
149
|
layout :a, :float, :b, :float,
|
|
150
150
|
:c, :float, :d, :float,
|
|
151
151
|
:e, :float, :f, :float
|
|
@@ -177,7 +177,7 @@ module Rpdfium
|
|
|
177
177
|
end
|
|
178
178
|
|
|
179
179
|
# =========================================================================
|
|
180
|
-
#
|
|
180
|
+
# Constants
|
|
181
181
|
# =========================================================================
|
|
182
182
|
# Bitmap formats
|
|
183
183
|
FPDFBitmap_Unknown = 0
|
|
@@ -191,7 +191,7 @@ module Rpdfium
|
|
|
191
191
|
FPDF_LCD_TEXT = 0x02
|
|
192
192
|
FPDF_NO_NATIVETEXT = 0x04
|
|
193
193
|
FPDF_GRAYSCALE = 0x08
|
|
194
|
-
FPDF_REVERSE_BYTE_ORDER = 0x10 # → RGBA
|
|
194
|
+
FPDF_REVERSE_BYTE_ORDER = 0x10 # → RGBA instead of BGRA
|
|
195
195
|
FPDF_NO_GDIPLUS = 0x40
|
|
196
196
|
FPDF_PRINTING = 0x800
|
|
197
197
|
FPDF_RENDER_NO_SMOOTHTEXT = 0x1000
|
|
@@ -254,7 +254,7 @@ module Rpdfium
|
|
|
254
254
|
FPDF_ANNOT_WIDGET => "Widget", FPDF_ANNOT_REDACT => "Redact"
|
|
255
255
|
}.freeze
|
|
256
256
|
|
|
257
|
-
# Form field types (
|
|
257
|
+
# Form field types (for widget annotations)
|
|
258
258
|
FPDF_FORMFIELD_UNKNOWN = 0
|
|
259
259
|
FPDF_FORMFIELD_PUSHBUTTON = 1
|
|
260
260
|
FPDF_FORMFIELD_CHECKBOX = 2
|
|
@@ -335,18 +335,18 @@ module Rpdfium
|
|
|
335
335
|
attach_function :FPDFText_GetFontWeight, %i[FPDF_TEXTPAGE int], :int
|
|
336
336
|
attach_function :FPDFText_GetFontInfo,
|
|
337
337
|
%i[FPDF_TEXTPAGE int pointer ulong pointer], :ulong
|
|
338
|
-
# NOTE: FPDFText_GetTextRenderMode(text_page, char_index)
|
|
339
|
-
#
|
|
338
|
+
# NOTE: FPDFText_GetTextRenderMode(text_page, char_index) was REMOVED
|
|
339
|
+
# from PDFium in chromium/6611 (July 2024). The replacement is two steps:
|
|
340
340
|
# 1. FPDFText_GetTextObject(text_page, char_index) → FPDF_PAGEOBJECT
|
|
341
341
|
# 2. FPDFTextObj_GetTextRenderMode(page_object) → int
|
|
342
|
-
#
|
|
343
|
-
#
|
|
342
|
+
# High-level wrapper: see Page#chars (the :render_mode field).
|
|
343
|
+
# Reference: pypdfium2 issue #335, pdfium-render issue #151.
|
|
344
344
|
attach_function :FPDFText_GetTextObject,
|
|
345
345
|
%i[FPDF_TEXTPAGE int], :FPDF_PAGEOBJECT
|
|
346
346
|
attach_function :FPDFText_GetCharBox,
|
|
347
347
|
%i[FPDF_TEXTPAGE int pointer pointer pointer pointer],
|
|
348
348
|
:FPDF_BOOL
|
|
349
|
-
# "Loose" char box: bbox
|
|
349
|
+
# "Loose" char box: bbox proportional to the font size, more stable for layout
|
|
350
350
|
attach_function :FPDFText_GetLooseCharBox,
|
|
351
351
|
%i[FPDF_TEXTPAGE int pointer], :FPDF_BOOL
|
|
352
352
|
attach_function :FPDFText_GetMatrix,
|
|
@@ -398,7 +398,7 @@ module Rpdfium
|
|
|
398
398
|
attach_function :FPDF_RenderPageBitmap,
|
|
399
399
|
%i[FPDF_BITMAP FPDF_PAGE int int int int int int],
|
|
400
400
|
:void
|
|
401
|
-
# Rendering
|
|
401
|
+
# Rendering with a 2x3 matrix + clipping (for arbitrary scaling/rotation)
|
|
402
402
|
attach_function :FPDF_RenderPageBitmapWithMatrix,
|
|
403
403
|
%i[FPDF_BITMAP FPDF_PAGE pointer pointer int],
|
|
404
404
|
:void
|
|
@@ -426,22 +426,22 @@ module Rpdfium
|
|
|
426
426
|
attach_function :FPDFPageObj_GetLineJoin, %i[FPDF_PAGEOBJECT], :int
|
|
427
427
|
|
|
428
428
|
# =========================================================================
|
|
429
|
-
# Form XObjects:
|
|
430
|
-
#
|
|
431
|
-
# (TeamSystem, Zucchetti, ...)
|
|
432
|
-
#
|
|
433
|
-
#
|
|
429
|
+
# Form XObjects: containers that encapsulate graphics (lines, rects, text)
|
|
430
|
+
# as a reusable "graphics subroutine". In PDFs generated by management
|
|
431
|
+
# software (TeamSystem, Zucchetti, ...) and by many Word/Excel templates,
|
|
432
|
+
# the ENTIRE page is a single Form XObject. Without descending into it, no
|
|
433
|
+
# lines/rects/chars are visible. Cf. PDF Spec 1.7 §8.10.
|
|
434
434
|
#
|
|
435
|
-
#
|
|
436
|
-
#
|
|
437
|
-
#
|
|
435
|
+
# After FPDFFormObj_GetObject(form, i) one obtains a child FPDF_PAGEOBJECT
|
|
436
|
+
# whose coordinates are in the form's system. The transformation to the
|
|
437
|
+
# page system is obtained from FPDFPageObj_GetMatrix(form_obj, &matrix).
|
|
438
438
|
# =========================================================================
|
|
439
439
|
attach_function :FPDFFormObj_CountObjects, %i[FPDF_PAGEOBJECT], :int
|
|
440
440
|
attach_function :FPDFFormObj_GetObject,
|
|
441
441
|
%i[FPDF_PAGEOBJECT ulong], :FPDF_PAGEOBJECT
|
|
442
442
|
|
|
443
443
|
# =========================================================================
|
|
444
|
-
# Path segments —
|
|
444
|
+
# Path segments — fundamental for table line detection
|
|
445
445
|
# =========================================================================
|
|
446
446
|
attach_function :FPDFPath_CountSegments, %i[FPDF_PAGEOBJECT], :int
|
|
447
447
|
attach_function :FPDFPath_GetPathSegment,
|
|
@@ -474,33 +474,33 @@ module Rpdfium
|
|
|
474
474
|
%i[FPDF_PAGEOBJECT int pointer ulong], :ulong
|
|
475
475
|
|
|
476
476
|
# =========================================================================
|
|
477
|
-
# Text page-objects (font name
|
|
477
|
+
# Text page-objects (font name of a text object, glyphs)
|
|
478
478
|
# =========================================================================
|
|
479
479
|
attach_function :FPDFTextObj_GetFontSize,
|
|
480
480
|
%i[FPDF_PAGEOBJECT pointer], :FPDF_BOOL
|
|
481
481
|
attach_function :FPDFTextObj_GetText,
|
|
482
482
|
%i[FPDF_PAGEOBJECT FPDF_TEXTPAGE pointer ulong], :ulong
|
|
483
483
|
attach_function :FPDFTextObj_GetFont, %i[FPDF_PAGEOBJECT], :FPDF_FONT
|
|
484
|
-
# FPDFTextObj_GetTextRenderMode
|
|
485
|
-
# FPDFText_GetTextRenderMode (
|
|
486
|
-
#
|
|
484
|
+
# FPDFTextObj_GetTextRenderMode is the replacement for the former
|
|
485
|
+
# FPDFText_GetTextRenderMode (removed upstream in chromium/6611).
|
|
486
|
+
# It takes a text PAGEOBJECT, not (textpage, char_index).
|
|
487
487
|
attach_function :FPDFTextObj_GetTextRenderMode, %i[FPDF_PAGEOBJECT], :int
|
|
488
|
-
# NOTE: FPDFFont_GetFontName
|
|
489
|
-
#
|
|
490
|
-
# - FPDFFont_GetBaseFontName → BaseFont entry
|
|
491
|
-
#
|
|
488
|
+
# NOTE: FPDFFont_GetFontName is marked as legacy in recent PDFium.
|
|
489
|
+
# The new model provides two distinct APIs:
|
|
490
|
+
# - FPDFFont_GetBaseFontName → BaseFont entry of the PDF dict (may
|
|
491
|
+
# include subset prefixes such as
|
|
492
492
|
# "ABCDEF+Helvetica")
|
|
493
|
-
# - FPDFFont_GetFamilyName →
|
|
494
|
-
#
|
|
495
|
-
# `c_ulong`.
|
|
496
|
-
# in
|
|
497
|
-
#
|
|
493
|
+
# - FPDFFont_GetFamilyName → "clean" family name (e.g. "Helvetica")
|
|
494
|
+
# These APIs use `c_size_t` for length/return type instead of
|
|
495
|
+
# `c_ulong`. On PDFium builds <= chromium/6533 they are not present:
|
|
496
|
+
# in that case the `attach_function` stub (in raw.rb) ensures that the
|
|
497
|
+
# call fails with a clear LoadError at the call site, not at require.
|
|
498
498
|
attach_function :FPDFFont_GetBaseFontName,
|
|
499
499
|
%i[FPDF_FONT pointer size_t], :size_t
|
|
500
500
|
attach_function :FPDFFont_GetFamilyName,
|
|
501
501
|
%i[FPDF_FONT pointer size_t], :size_t
|
|
502
|
-
#
|
|
503
|
-
#
|
|
502
|
+
# Kept for compatibility with older PDFium builds. On newer builds
|
|
503
|
+
# it may not be present: same stub mechanism.
|
|
504
504
|
attach_function :FPDFFont_GetFontName,
|
|
505
505
|
%i[FPDF_FONT pointer ulong], :ulong
|
|
506
506
|
attach_function :FPDFFont_GetFlags, %i[FPDF_FONT pointer], :FPDF_BOOL
|
|
@@ -509,30 +509,30 @@ module Rpdfium
|
|
|
509
509
|
attach_function :FPDFFont_GetItalicAngle,
|
|
510
510
|
%i[FPDF_FONT pointer], :FPDF_BOOL
|
|
511
511
|
|
|
512
|
-
#
|
|
513
|
-
#
|
|
514
|
-
# font_size
|
|
515
|
-
# baseline detection
|
|
512
|
+
# Font ascent/descent metrics in font-program units.
|
|
513
|
+
# To obtain the value in page coordinates, multiply by the text object's
|
|
514
|
+
# font_size and then by the CTM scale. Useful for
|
|
515
|
+
# baseline detection and line leading.
|
|
516
516
|
attach_function :FPDFFont_GetAscent, %i[FPDF_FONT int pointer], :FPDF_BOOL
|
|
517
517
|
attach_function :FPDFFont_GetDescent, %i[FPDF_FONT int pointer], :FPDF_BOOL
|
|
518
518
|
|
|
519
|
-
#
|
|
520
|
-
#
|
|
521
|
-
#
|
|
522
|
-
# (
|
|
523
|
-
#
|
|
524
|
-
#
|
|
519
|
+
# Nominal width of a glyph in the font program ("advance width").
|
|
520
|
+
# It is the width the PDF declares for that glyph before the kerning
|
|
521
|
+
# applied by the `TJ` operators. In combination with FPDFText_GetMatrix
|
|
522
|
+
# (for the CTM scale), it allows the real advance in page coordinates to
|
|
523
|
+
# be computed. Conceptually equivalent to the advance that pdfminer.six
|
|
524
|
+
# reads directly from the font program.
|
|
525
525
|
#
|
|
526
|
-
#
|
|
527
|
-
#
|
|
528
|
-
#
|
|
529
|
-
# (
|
|
526
|
+
# WARNING: the returned value is in "font_size-scaled" units,
|
|
527
|
+
# with font_size passed as a parameter. For most PDFs
|
|
528
|
+
# generated by management software, the font_size is 1.0 and the CTM
|
|
529
|
+
# does the scaling (typically 5×–10× for the final rendering).
|
|
530
530
|
attach_function :FPDFFont_GetGlyphWidth,
|
|
531
531
|
%i[FPDF_FONT uint float pointer], :FPDF_BOOL
|
|
532
532
|
|
|
533
|
-
#
|
|
534
|
-
# In
|
|
535
|
-
#
|
|
533
|
+
# NOTE: FPDFText_GetMatrix is already attached above (text page section).
|
|
534
|
+
# In combination with FPDFFont_GetGlyphWidth, it allows the glyph advance
|
|
535
|
+
# in page coordinates to be computed as
|
|
536
536
|
# `glyph_width × |FPDFText_GetMatrix.a|`.
|
|
537
537
|
|
|
538
538
|
# =========================================================================
|
|
@@ -563,16 +563,16 @@ module Rpdfium
|
|
|
563
563
|
# =========================================================================
|
|
564
564
|
# Forms
|
|
565
565
|
# =========================================================================
|
|
566
|
-
# FPDF_FORMFILLINFO
|
|
567
|
-
#
|
|
568
|
-
#
|
|
569
|
-
# in
|
|
566
|
+
# FPDF_FORMFILLINFO is a rich struct (~70 fields in the latest builds).
|
|
567
|
+
# For EXTRACTION alone it is enough to pass a minimal struct with version=2
|
|
568
|
+
# and all callbacks null — PDFium tolerates NULL on those not called
|
|
569
|
+
# in read-only mode (no JavaScript, no XFA).
|
|
570
570
|
class FPDF_FORMFILLINFO < FFI::Struct
|
|
571
|
-
#
|
|
572
|
-
# `version` —
|
|
573
|
-
#
|
|
574
|
-
#
|
|
575
|
-
#
|
|
571
|
+
# Keep aligned with the public header fpdf_formfill.h. The critical field
|
|
572
|
+
# is `version` — if it is wrong, init fails silently. For read-only use
|
|
573
|
+
# version=2 + all others zero/NULL is enough. We allocate a very
|
|
574
|
+
# generous buffer (256 pointers) to be robust against future extensions
|
|
575
|
+
# of the header.
|
|
576
576
|
layout :version, :int,
|
|
577
577
|
:_callbacks, [:pointer, 256]
|
|
578
578
|
end
|
|
@@ -625,23 +625,23 @@ module Rpdfium
|
|
|
625
625
|
%i[FPDF_ATTACHMENT pointer ulong pointer], :FPDF_BOOL
|
|
626
626
|
|
|
627
627
|
# =========================================================================
|
|
628
|
-
# Structure tree (
|
|
628
|
+
# Structure tree (for tagged PDF → robust semantic extraction)
|
|
629
629
|
# =========================================================================
|
|
630
630
|
#
|
|
631
|
-
#
|
|
632
|
-
# `StructTreeRoot`
|
|
633
|
-
# → P, H1, Table, TR, TH, TD, Figure...)
|
|
634
|
-
#
|
|
635
|
-
# `MarkedContentID`:
|
|
636
|
-
#
|
|
631
|
+
# For "tagged" PDFs (PDF/UA, exports from Word/LibreOffice/InDesign), the
|
|
632
|
+
# `StructTreeRoot` exposes a logical structure of the document (Document
|
|
633
|
+
# → P, H1, Table, TR, TH, TD, Figure...) independent of the graphical
|
|
634
|
+
# layout. Each element can be linked to the page text via
|
|
635
|
+
# `MarkedContentID`: page objects with the same MCID belong
|
|
636
|
+
# semantically to that element.
|
|
637
637
|
#
|
|
638
|
-
#
|
|
639
|
-
# FPDF_StructTree_GetForPage
|
|
638
|
+
# On NON-tagged PDFs (most Italian management-software output):
|
|
639
|
+
# FPDF_StructTree_GetForPage returns NULL.
|
|
640
640
|
#
|
|
641
|
-
#
|
|
642
|
-
# StructTreeRoot
|
|
643
|
-
#
|
|
644
|
-
# output
|
|
641
|
+
# On "tagged but empty" PDFs (e.g. a Banca d'Italia CR, where the
|
|
642
|
+
# StructTreeRoot exists with 700+ entries but all elements are
|
|
643
|
+
# placeholders without type/MCID): the tree is present but the walk
|
|
644
|
+
# produces empty output. See `Rpdfium::Structure::Tree#empty?`.
|
|
645
645
|
typedef :pointer, :FPDF_STRUCTELEMENT_ATTR
|
|
646
646
|
typedef :pointer, :FPDF_STRUCTELEMENT_ATTR_VALUE
|
|
647
647
|
|
|
@@ -653,7 +653,7 @@ module Rpdfium
|
|
|
653
653
|
attach_function :FPDF_StructTree_GetChildAtIndex,
|
|
654
654
|
%i[FPDF_STRUCTTREE int], :FPDF_STRUCTELEMENT
|
|
655
655
|
|
|
656
|
-
#
|
|
656
|
+
# Tree navigation
|
|
657
657
|
attach_function :FPDF_StructElement_CountChildren,
|
|
658
658
|
%i[FPDF_STRUCTELEMENT], :int
|
|
659
659
|
attach_function :FPDF_StructElement_GetChildAtIndex,
|
|
@@ -661,7 +661,7 @@ module Rpdfium
|
|
|
661
661
|
attach_function :FPDF_StructElement_GetParent,
|
|
662
662
|
%i[FPDF_STRUCTELEMENT], :FPDF_STRUCTELEMENT
|
|
663
663
|
|
|
664
|
-
#
|
|
664
|
+
# Element identification
|
|
665
665
|
attach_function :FPDF_StructElement_GetType,
|
|
666
666
|
%i[FPDF_STRUCTELEMENT pointer ulong], :ulong
|
|
667
667
|
attach_function :FPDF_StructElement_GetObjType,
|
|
@@ -673,7 +673,7 @@ module Rpdfium
|
|
|
673
673
|
attach_function :FPDF_StructElement_GetLang,
|
|
674
674
|
%i[FPDF_STRUCTELEMENT pointer ulong], :ulong
|
|
675
675
|
|
|
676
|
-
#
|
|
676
|
+
# "Logical" text overrides (accessibility, ligature resolution)
|
|
677
677
|
attach_function :FPDF_StructElement_GetActualText,
|
|
678
678
|
%i[FPDF_STRUCTELEMENT pointer ulong], :ulong
|
|
679
679
|
attach_function :FPDF_StructElement_GetAltText,
|
|
@@ -681,10 +681,10 @@ module Rpdfium
|
|
|
681
681
|
attach_function :FPDF_StructElement_GetExpansion,
|
|
682
682
|
%i[FPDF_STRUCTELEMENT pointer ulong], :ulong
|
|
683
683
|
|
|
684
|
-
# Marked content IDs (
|
|
685
|
-
# GetMarkedContentID
|
|
686
|
-
# GetMarkedContentIdCount + IdAtIndex
|
|
687
|
-
# GetChildMarkedContentID: MCID
|
|
684
|
+
# Marked content IDs (link elements → page objects with the same MCID)
|
|
685
|
+
# GetMarkedContentID returns the first MCID (for back-compat).
|
|
686
|
+
# GetMarkedContentIdCount + IdAtIndex for elements with multiple MCIDs.
|
|
687
|
+
# GetChildMarkedContentID: MCID of the child if it is a direct MCR.
|
|
688
688
|
attach_function :FPDF_StructElement_GetMarkedContentID,
|
|
689
689
|
%i[FPDF_STRUCTELEMENT], :int
|
|
690
690
|
attach_function :FPDF_StructElement_GetMarkedContentIdCount,
|
|
@@ -694,9 +694,9 @@ module Rpdfium
|
|
|
694
694
|
attach_function :FPDF_StructElement_GetChildMarkedContentID,
|
|
695
695
|
%i[FPDF_STRUCTELEMENT int], :int
|
|
696
696
|
|
|
697
|
-
#
|
|
698
|
-
#
|
|
699
|
-
#
|
|
697
|
+
# Structural PDF attributes (RowSpan, ColSpan, Scope, Headers, etc.)
|
|
698
|
+
# They live in a sub-API: each element has 0+ attribute objects, each
|
|
699
|
+
# with 0+ key/value pairs.
|
|
700
700
|
attach_function :FPDF_StructElement_GetAttributeCount,
|
|
701
701
|
%i[FPDF_STRUCTELEMENT], :int
|
|
702
702
|
attach_function :FPDF_StructElement_GetAttributeAtIndex,
|
|
@@ -704,7 +704,7 @@ module Rpdfium
|
|
|
704
704
|
attach_function :FPDF_StructElement_GetStringAttribute,
|
|
705
705
|
%i[FPDF_STRUCTELEMENT string pointer ulong], :ulong
|
|
706
706
|
|
|
707
|
-
# Attribute getters:
|
|
707
|
+
# Attribute getters: key/value enumeration
|
|
708
708
|
attach_function :FPDF_StructElement_Attr_GetCount,
|
|
709
709
|
%i[FPDF_STRUCTELEMENT_ATTR], :int
|
|
710
710
|
attach_function :FPDF_StructElement_Attr_GetName,
|
|
@@ -725,7 +725,7 @@ module Rpdfium
|
|
|
725
725
|
attach_function :FPDF_StructElement_Attr_GetBlobValue,
|
|
726
726
|
%i[FPDF_STRUCTELEMENT_ATTR_VALUE pointer ulong pointer],
|
|
727
727
|
:FPDF_BOOL
|
|
728
|
-
# Attribute
|
|
728
|
+
# Attribute whose value is another array (e.g. Headers, an array of IDs)
|
|
729
729
|
attach_function :FPDF_StructElement_Attr_CountChildren,
|
|
730
730
|
%i[FPDF_STRUCTELEMENT_ATTR_VALUE], :int
|
|
731
731
|
attach_function :FPDF_StructElement_Attr_GetChildAtIndex,
|
|
@@ -735,17 +735,17 @@ module Rpdfium
|
|
|
735
735
|
# =========================================================================
|
|
736
736
|
# Page box geometry — media/crop/bleed/trim/art box
|
|
737
737
|
# =========================================================================
|
|
738
|
-
#
|
|
739
|
-
# - media:
|
|
740
|
-
# - crop:
|
|
741
|
-
# - bleed: area
|
|
742
|
-
# - trim:
|
|
743
|
-
# - art: area
|
|
738
|
+
# Each PDF page has up to 5 rectangular boxes, in bottom-up coordinates:
|
|
739
|
+
# - media: the complete physical area of the page (always present)
|
|
740
|
+
# - crop: the visible sub-area (default = media if not specified)
|
|
741
|
+
# - bleed: usable area for printing with bleed margins (rare)
|
|
742
|
+
# - trim: final cut area (rare, for pre-press)
|
|
743
|
+
# - art: area of significant content (rare)
|
|
744
744
|
#
|
|
745
|
-
# In pdfplumber
|
|
746
|
-
#
|
|
747
|
-
#
|
|
748
|
-
#
|
|
745
|
+
# In pdfplumber these are exposed as `page.mediabox`, `page.cropbox`, etc.
|
|
746
|
+
# Without access to the cropbox, a PDF extraction library cannot know
|
|
747
|
+
# which is the "visible" area of the page vs the "physical" one.
|
|
748
|
+
# They all return FPDF_BOOL: 0 if the box is not defined.
|
|
749
749
|
attach_function :FPDFPage_GetMediaBox,
|
|
750
750
|
%i[FPDF_PAGE pointer pointer pointer pointer], :FPDF_BOOL
|
|
751
751
|
attach_function :FPDFPage_GetCropBox,
|
|
@@ -758,25 +758,25 @@ module Rpdfium
|
|
|
758
758
|
%i[FPDF_PAGE pointer pointer pointer pointer], :FPDF_BOOL
|
|
759
759
|
|
|
760
760
|
# =========================================================================
|
|
761
|
-
# Page object:
|
|
761
|
+
# Page object: state, rotated bounds, dash pattern, marked content
|
|
762
762
|
# =========================================================================
|
|
763
|
-
# `FPDFPageObj_GetIsActive`:
|
|
764
|
-
# (
|
|
765
|
-
#
|
|
766
|
-
#
|
|
763
|
+
# `FPDFPageObj_GetIsActive`: some page objects may be "inactive"
|
|
764
|
+
# (e.g. hidden by Optional Content / disabled layers). Without
|
|
765
|
+
# this check, extraction would include non-visible content.
|
|
766
|
+
# Returns 0/1 in *out_active.
|
|
767
767
|
attach_function :FPDFPageObj_GetIsActive,
|
|
768
768
|
%i[FPDF_PAGEOBJECT pointer], :FPDF_BOOL
|
|
769
769
|
|
|
770
|
-
# `FPDFPageObj_GetRotatedBounds`: bbox
|
|
771
|
-
#
|
|
772
|
-
# Bounding Box),
|
|
773
|
-
#
|
|
770
|
+
# `FPDFPageObj_GetRotatedBounds`: bbox as 4 points (FS_QUADPOINTSF) for
|
|
771
|
+
# rotated objects. The standard GetBounds returns the AABB (Axis-Aligned
|
|
772
|
+
# Bounding Box), useless for objects at 45°/90°. For vertical or
|
|
773
|
+
# rotated text, this is the "true" bbox.
|
|
774
774
|
attach_function :FPDFPageObj_GetRotatedBounds,
|
|
775
775
|
%i[FPDF_PAGEOBJECT pointer], :FPDF_BOOL
|
|
776
776
|
|
|
777
|
-
# Dash pattern:
|
|
778
|
-
#
|
|
779
|
-
#
|
|
777
|
+
# Dash pattern: useful in `line_segments` to filter out dashed
|
|
778
|
+
# guide lines (often used as "non-printing" hints in templates).
|
|
779
|
+
# Dashed lines can confuse table cell detection.
|
|
780
780
|
attach_function :FPDFPageObj_GetDashCount,
|
|
781
781
|
%i[FPDF_PAGEOBJECT], :int
|
|
782
782
|
attach_function :FPDFPageObj_GetDashArray,
|
|
@@ -784,12 +784,12 @@ module Rpdfium
|
|
|
784
784
|
attach_function :FPDFPageObj_GetDashPhase,
|
|
785
785
|
%i[FPDF_PAGEOBJECT pointer], :FPDF_BOOL
|
|
786
786
|
|
|
787
|
-
# Marked content (Tagged PDF) —
|
|
788
|
-
# In
|
|
789
|
-
# `/Span BMC ... EMC`
|
|
790
|
-
#
|
|
791
|
-
#
|
|
792
|
-
#
|
|
787
|
+
# Marked content (Tagged PDF) — BMC/BDC operators of the content stream.
|
|
788
|
+
# In structured PDFs (PDF/UA, Word→PDF, InDesign export), the operators
|
|
789
|
+
# `/Span BMC ... EMC` or `/Span <</MCID 12>> BDC ... EMC` group
|
|
790
|
+
# chars semantically. For PDFs generated by Italian management software
|
|
791
|
+
# these tags are NOT present; for "tagged" PDFs they are the most reliable
|
|
792
|
+
# way to group tokens.
|
|
793
793
|
attach_function :FPDFPageObj_CountMarks,
|
|
794
794
|
%i[FPDF_PAGEOBJECT], :int
|
|
795
795
|
attach_function :FPDFPageObj_GetMark,
|
|
@@ -814,23 +814,23 @@ module Rpdfium
|
|
|
814
814
|
# =========================================================================
|
|
815
815
|
# Catalog / Document metadata
|
|
816
816
|
# =========================================================================
|
|
817
|
-
# FPDFCatalog_GetLanguage:
|
|
818
|
-
#
|
|
819
|
-
#
|
|
817
|
+
# FPDFCatalog_GetLanguage: language declared by the document (e.g. "it-IT").
|
|
818
|
+
# Useful for extraction pipelines that want to switch language-specific
|
|
819
|
+
# rules (e.g. word tokenizer, hyphen lookup).
|
|
820
820
|
attach_function :FPDFCatalog_GetLanguage,
|
|
821
821
|
%i[FPDF_DOCUMENT pointer ulong], :ulong
|
|
822
822
|
|
|
823
|
-
# FPDFDoc_GetPageMode:
|
|
824
|
-
# PageMode.FullScreen). Numeric.
|
|
823
|
+
# FPDFDoc_GetPageMode: PDF open state (e.g. PageMode.UseOutlines,
|
|
824
|
+
# PageMode.FullScreen). Numeric. Useful for PDF editor/viewer building.
|
|
825
825
|
attach_function :FPDFDoc_GetPageMode, %i[FPDF_DOCUMENT], :int
|
|
826
826
|
|
|
827
827
|
# =========================================================================
|
|
828
|
-
# Links (annotation
|
|
828
|
+
# Links (Link annotation and LinkAtPoint for coordinate-based lookup)
|
|
829
829
|
# =========================================================================
|
|
830
|
-
# `FPDFLink_GetLinkAtPoint`:
|
|
831
|
-
#
|
|
832
|
-
# in
|
|
833
|
-
# `page.hyperlinks`.
|
|
830
|
+
# `FPDFLink_GetLinkAtPoint`: given (x, y) in page coordinates, returns
|
|
831
|
+
# the link annotation that contains it. The core of "click handling"
|
|
832
|
+
# in viewers / OCR-style "extract links". Pdfplumber exposes something
|
|
833
|
+
# similar via `page.hyperlinks`.
|
|
834
834
|
attach_function :FPDFLink_GetLinkAtPoint,
|
|
835
835
|
%i[FPDF_PAGE double double], :FPDF_LINK
|
|
836
836
|
attach_function :FPDFLink_GetLinkZOrderAtPoint,
|
|
@@ -839,37 +839,37 @@ module Rpdfium
|
|
|
839
839
|
%i[FPDF_PAGE FPDF_LINK], :FPDF_ANNOTATION
|
|
840
840
|
attach_function :FPDFLink_GetAnnotRect,
|
|
841
841
|
%i[FPDF_LINK pointer], :FPDF_BOOL
|
|
842
|
-
# FPDFLink_GetTextRange: range
|
|
843
|
-
#
|
|
842
|
+
# FPDFLink_GetTextRange: range of char_index in the text page corresponding
|
|
843
|
+
# to the link. Allows mapping hyperlink → page text.
|
|
844
844
|
attach_function :FPDFLink_GetTextRange,
|
|
845
845
|
%i[FPDF_LINK pointer pointer], :FPDF_BOOL
|
|
846
|
-
# Rect
|
|
847
|
-
#
|
|
846
|
+
# Rect and QuadPoints: link geometry (rectangle or quadrilateral for
|
|
847
|
+
# links that span multiple lines).
|
|
848
848
|
attach_function :FPDFLink_GetRect,
|
|
849
849
|
%i[FPDF_LINK int pointer], :FPDF_BOOL
|
|
850
850
|
attach_function :FPDFLink_GetQuadPoints,
|
|
851
851
|
%i[FPDF_LINK int pointer], :FPDF_BOOL
|
|
852
852
|
|
|
853
853
|
# =========================================================================
|
|
854
|
-
# Action / Destination (
|
|
854
|
+
# Action / Destination (outline + link extensions)
|
|
855
855
|
# =========================================================================
|
|
856
|
-
# FPDFAction_GetDest:
|
|
857
|
-
# FPDFAction_GetFilePath:
|
|
858
|
-
#
|
|
856
|
+
# FPDFAction_GetDest: for "GoTo"-type actions, returns the FPDF_DEST.
|
|
857
|
+
# FPDFAction_GetFilePath: for "Launch" or "RemoteGoTo" actions, the path of
|
|
858
|
+
# the target external file.
|
|
859
859
|
attach_function :FPDFAction_GetDest,
|
|
860
860
|
%i[FPDF_DOCUMENT FPDF_ACTION], :FPDF_DEST
|
|
861
861
|
attach_function :FPDFAction_GetFilePath,
|
|
862
862
|
%i[FPDF_ACTION pointer ulong], :ulong
|
|
863
|
-
# FPDFBookmark_GetAction: action
|
|
864
|
-
# GetDest
|
|
863
|
+
# FPDFBookmark_GetAction: action associated with a bookmark (alternative to
|
|
864
|
+
# GetDest if the bookmark is an action instead of a destination).
|
|
865
865
|
attach_function :FPDFBookmark_GetAction,
|
|
866
866
|
%i[FPDF_BOOKMARK], :FPDF_ACTION
|
|
867
|
-
# FPDFBookmark_GetCount:
|
|
868
|
-
#
|
|
867
|
+
# FPDFBookmark_GetCount: number of sub-bookmarks (positive = expanded,
|
|
868
|
+
# negative = collapsed, 0 = leaf).
|
|
869
869
|
attach_function :FPDFBookmark_GetCount,
|
|
870
870
|
%i[FPDF_BOOKMARK], :int
|
|
871
|
-
# FPDFDest_GetView:
|
|
872
|
-
# FPDFDest_GetLocationInPage: x/y/zoom
|
|
871
|
+
# FPDFDest_GetView: view type (Fit, FitH, XYZ, etc.) + parameters.
|
|
872
|
+
# FPDFDest_GetLocationInPage: x/y/zoom extracted from the dest.
|
|
873
873
|
attach_function :FPDFDest_GetView,
|
|
874
874
|
%i[FPDF_DEST pointer pointer], :ulong
|
|
875
875
|
attach_function :FPDFDest_GetLocationInPage,
|
|
@@ -879,17 +879,17 @@ module Rpdfium
|
|
|
879
879
|
# =========================================================================
|
|
880
880
|
# Font extras: GetFontData, GetAscent, GetDescent
|
|
881
881
|
# =========================================================================
|
|
882
|
-
#
|
|
883
|
-
#
|
|
884
|
-
#
|
|
885
|
-
#
|
|
886
|
-
# GetFontData
|
|
882
|
+
# Already attached above: FPDFFont_GetGlyphWidth.
|
|
883
|
+
# We add: FontData (raw font program bytes — useful for inspection,
|
|
884
|
+
# embedding debugging, font substitution) and GetGlyphPath (vector path of
|
|
885
|
+
# a glyph, an alternative to GlyphWidth for exotic fonts).
|
|
886
|
+
# GetFontData follows the bool convention: it returns a boolean and writes the required size to `out_buflen` (also when buf is NULL).
|
|
887
887
|
attach_function :FPDFFont_GetFontData,
|
|
888
888
|
%i[FPDF_FONT pointer size_t pointer], :FPDF_BOOL
|
|
889
889
|
attach_function :FPDFFont_GetGlyphPath,
|
|
890
890
|
%i[FPDF_FONT uint float], :FPDF_GLYPHPATH
|
|
891
|
-
# FPDF_GLYPHPATH: handle a
|
|
892
|
-
#
|
|
891
|
+
# FPDF_GLYPHPATH: handle to a path. Added as a typedef.
|
|
892
|
+
# Its GlyphPath_* APIs are niche, but we expose them for symmetry.
|
|
893
893
|
attach_function :FPDFGlyphPath_CountGlyphSegments,
|
|
894
894
|
%i[FPDF_GLYPHPATH], :int
|
|
895
895
|
attach_function :FPDFGlyphPath_GetGlyphPathSegment,
|
|
@@ -898,14 +898,14 @@ module Rpdfium
|
|
|
898
898
|
# =========================================================================
|
|
899
899
|
# Text page: char index at position
|
|
900
900
|
# =========================================================================
|
|
901
|
-
# FPDFText_GetCharIndexAtPos:
|
|
902
|
-
#
|
|
903
|
-
# "hit test" in
|
|
901
|
+
# FPDFText_GetCharIndexAtPos: given a point (x, y) in page coordinates,
|
|
902
|
+
# returns the index of the nearest char (within tolerance). Useful for
|
|
903
|
+
# "hit test" in viewers and for mapping coord → text index during search.
|
|
904
904
|
attach_function :FPDFText_GetCharIndexAtPos,
|
|
905
905
|
%i[FPDF_TEXTPAGE double double double double], :int
|
|
906
906
|
# FPDFText_GetTextIndexFromCharIndex / GetCharIndexFromTextIndex:
|
|
907
|
-
#
|
|
908
|
-
#
|
|
907
|
+
# map the "char" index (per glyph) to the "text" index (per logical
|
|
908
|
+
# codepoint). The two indices differ due to ligatures/substitutions.
|
|
909
909
|
attach_function :FPDFText_GetTextIndexFromCharIndex,
|
|
910
910
|
%i[FPDF_TEXTPAGE int], :int
|
|
911
911
|
attach_function :FPDFText_GetCharIndexFromTextIndex,
|
|
@@ -914,27 +914,27 @@ module Rpdfium
|
|
|
914
914
|
# =========================================================================
|
|
915
915
|
# Annotation extras: GetFlags, GetColor, GetBorder, AP, attachment points
|
|
916
916
|
# =========================================================================
|
|
917
|
-
# FPDFAnnot_GetFlags: bitmask
|
|
918
|
-
#
|
|
919
|
-
#
|
|
917
|
+
# FPDFAnnot_GetFlags: bitmask of Flags (Hidden, Print, NoZoom, etc.).
|
|
918
|
+
# Without this, we cannot distinguish a visible annotation from one
|
|
919
|
+
# with the Hidden flag.
|
|
920
920
|
attach_function :FPDFAnnot_GetFlags, %i[FPDF_ANNOTATION], :int
|
|
921
|
-
#
|
|
921
|
+
# Color: stroke (BORDER_COLOR) and fill (INTERIOR_COLOR).
|
|
922
922
|
attach_function :FPDFAnnot_GetColor,
|
|
923
923
|
%i[FPDF_ANNOTATION int pointer pointer pointer pointer],
|
|
924
924
|
:FPDF_BOOL
|
|
925
|
-
# Border:
|
|
925
|
+
# Border: horizontal/vertical corner radius and border width (three float out-params).
|
|
926
926
|
attach_function :FPDFAnnot_GetBorder,
|
|
927
927
|
%i[FPDF_ANNOTATION pointer pointer pointer], :FPDF_BOOL
|
|
928
|
-
# AP (Appearance Stream):
|
|
929
|
-
#
|
|
928
|
+
# AP (Appearance Stream): rendered form of the annotation in various
|
|
929
|
+
# modes (Normal/Rollover/Down).
|
|
930
930
|
attach_function :FPDFAnnot_GetAP,
|
|
931
931
|
%i[FPDF_ANNOTATION int pointer ulong], :ulong
|
|
932
|
-
# FileAttachment:
|
|
933
|
-
#
|
|
932
|
+
# FileAttachment: for annotations of subtype FileAttachment, obtains
|
|
933
|
+
# the FPDF_ATTACHMENT.
|
|
934
934
|
attach_function :FPDFAnnot_GetFileAttachment,
|
|
935
935
|
%i[FPDF_ANNOTATION], :FPDF_ATTACHMENT
|
|
936
|
-
# AttachmentPoints:
|
|
937
|
-
#
|
|
936
|
+
# AttachmentPoints: for highlight/markup spanning multiple lines,
|
|
937
|
+
# the 4 points of each quadrilateral.
|
|
938
938
|
attach_function :FPDFAnnot_CountAttachmentPoints,
|
|
939
939
|
%i[FPDF_ANNOTATION], :size_t
|
|
940
940
|
attach_function :FPDFAnnot_GetAttachmentPoints,
|
|
@@ -943,11 +943,11 @@ module Rpdfium
|
|
|
943
943
|
# =========================================================================
|
|
944
944
|
# Attachment extras
|
|
945
945
|
# =========================================================================
|
|
946
|
-
# FPDFAttachment_GetSubtype: MIME-like subtype
|
|
946
|
+
# FPDFAttachment_GetSubtype: MIME-like subtype of the attached file.
|
|
947
947
|
attach_function :FPDFAttachment_GetSubtype,
|
|
948
948
|
%i[FPDF_ATTACHMENT pointer ulong], :ulong
|
|
949
|
-
# FPDFAttachment_GetStringValue/HasKey:
|
|
950
|
-
#
|
|
949
|
+
# FPDFAttachment_GetStringValue/HasKey: to read the custom metadata
|
|
950
|
+
# of the file attachment (Description, CreationDate, etc.).
|
|
951
951
|
attach_function :FPDFAttachment_HasKey,
|
|
952
952
|
%i[FPDF_ATTACHMENT string], :FPDF_BOOL
|
|
953
953
|
attach_function :FPDFAttachment_GetValueType,
|
|
@@ -956,15 +956,15 @@ module Rpdfium
|
|
|
956
956
|
%i[FPDF_ATTACHMENT string pointer ulong], :ulong
|
|
957
957
|
|
|
958
958
|
# =========================================================================
|
|
959
|
-
# Helper:
|
|
959
|
+
# Helper: reading UTF-16LE strings that PDFium returns as bytes
|
|
960
960
|
# =========================================================================
|
|
961
|
-
#
|
|
962
|
-
# `unsigned long` (
|
|
963
|
-
#
|
|
961
|
+
# PDFium convention: most Get*Text/Get*Name calls return
|
|
962
|
+
# `unsigned long` (number of BYTES, terminator included). It is called
|
|
963
|
+
# first with a NULL/0 buffer to obtain the size, then with an allocated buffer.
|
|
964
964
|
def self.read_utf16_string(method_name, *args)
|
|
965
965
|
args_probe = args + [FFI::Pointer::NULL, 0]
|
|
966
966
|
n_bytes = send(method_name, *args_probe)
|
|
967
|
-
return "" if n_bytes <= 2 #
|
|
967
|
+
return "" if n_bytes <= 2 # only the null terminator or an error
|
|
968
968
|
|
|
969
969
|
buf = FFI::MemoryPointer.new(:uchar, n_bytes)
|
|
970
970
|
args_real = args + [buf, n_bytes]
|
|
@@ -972,7 +972,20 @@ module Rpdfium
|
|
|
972
972
|
utf16_bytes_to_utf8(buf.read_bytes(n_bytes))
|
|
973
973
|
end
|
|
974
974
|
|
|
975
|
-
#
|
|
975
|
+
# Same two-call convention, but for the few APIs that return 7-bit
|
|
976
|
+
# ASCII bytes instead of UTF-16LE (e.g. FPDFAction_GetURIPath).
|
|
977
|
+
def self.read_ascii_string(method_name, *args)
|
|
978
|
+
args_probe = args + [FFI::Pointer::NULL, 0]
|
|
979
|
+
n_bytes = send(method_name, *args_probe)
|
|
980
|
+
return "" if n_bytes <= 1 # only the null terminator or an error
|
|
981
|
+
|
|
982
|
+
buf = FFI::MemoryPointer.new(:uchar, n_bytes)
|
|
983
|
+
args_real = args + [buf, n_bytes]
|
|
984
|
+
send(method_name, *args_real)
|
|
985
|
+
buf.read_bytes(n_bytes).delete("\x00").force_encoding("UTF-8")
|
|
986
|
+
end
|
|
987
|
+
|
|
988
|
+
# PDFium returns UTF-16LE (little-endian) bytes with a null terminator.
|
|
976
989
|
def self.utf16_bytes_to_utf8(bytes)
|
|
977
990
|
bytes.force_encoding("UTF-16LE")
|
|
978
991
|
.encode("UTF-8", invalid: :replace, undef: :replace)
|