pdf_oxide 0.3.55-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/LICENSE +176 -0
- data/LICENSE-APACHE +176 -0
- data/LICENSE-MIT +25 -0
- data/README.md +122 -0
- data/ext/pdf_oxide/libpdf_oxide.so +0 -0
- data/lib/pdf_oxide/auto_extractor.rb +157 -0
- data/lib/pdf_oxide/document_editor.rb +235 -0
- data/lib/pdf_oxide/errors.rb +58 -0
- data/lib/pdf_oxide/ffi/bindings.rb +1694 -0
- data/lib/pdf_oxide/ffi/library.rb +98 -0
- data/lib/pdf_oxide/ffi/string_marshaller.rb +45 -0
- data/lib/pdf_oxide/markdown_converter.rb +52 -0
- data/lib/pdf_oxide/pdf.rb +218 -0
- data/lib/pdf_oxide/pdf_document.rb +411 -0
- data/lib/pdf_oxide/pdf_page.rb +71 -0
- data/lib/pdf_oxide/pdf_policy.rb +64 -0
- data/lib/pdf_oxide/pdf_signer.rb +155 -0
- data/lib/pdf_oxide/pdf_validator.rb +97 -0
- data/lib/pdf_oxide/version.rb +5 -0
- data/lib/pdf_oxide.rb +60 -0
- metadata +198 -0
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PdfOxide
|
|
4
|
+
# The primary read-only entry point to a PDF.
|
|
5
|
+
#
|
|
6
|
+
# Mirrors `fyi.oxide.pdf.PdfDocument`. Lifecycle: a PdfDocument owns
|
|
7
|
+
# native memory and **must be closed** when no longer in use. The
|
|
8
|
+
# idiomatic Ruby pattern is the block form `PdfDocument.open(path) do |doc| ... end`
|
|
9
|
+
# which closes automatically; for parity with the Java `AutoCloseable`
|
|
10
|
+
# contract, an explicit `#close` is also supported and is idempotent
|
|
11
|
+
# (a second call is a no-op, not a crash).
|
|
12
|
+
#
|
|
13
|
+
# A `Finalizer` backstop frees leaked handles on GC; callers must
|
|
14
|
+
# not rely on it for timely cleanup.
|
|
15
|
+
#
|
|
16
|
+
# @example block form (recommended)
|
|
17
|
+
# PdfOxide::PdfDocument.open('invoice.pdf') do |doc|
|
|
18
|
+
# puts doc.extract_text(0)
|
|
19
|
+
# end
|
|
20
|
+
#
|
|
21
|
+
# @example explicit close
|
|
22
|
+
# doc = PdfOxide::PdfDocument.open('invoice.pdf')
|
|
23
|
+
# begin
|
|
24
|
+
# puts doc.extract_text(0)
|
|
25
|
+
# ensure
|
|
26
|
+
# doc.close
|
|
27
|
+
# end
|
|
28
|
+
class PdfDocument
|
|
29
|
+
# @return [String] absolute path the document was opened from
|
|
30
|
+
# (or a synthetic `<in-memory>` token for byte-opened docs).
|
|
31
|
+
attr_reader :path
|
|
32
|
+
|
|
33
|
+
# Open a PDF from disk or in-memory bytes.
|
|
34
|
+
#
|
|
35
|
+
# @param source [String] either a filesystem path or raw PDF bytes
|
|
36
|
+
# (auto-detected via `%PDF-` magic on BINARY-encoded input).
|
|
37
|
+
# @param password [String, nil] optional password for encrypted PDFs.
|
|
38
|
+
# @yield [PdfDocument] block form auto-closes on return.
|
|
39
|
+
# @return [PdfDocument, Object] the document, or the block's return value.
|
|
40
|
+
# @raise [FileNotFoundError] path doesn't exist.
|
|
41
|
+
# @raise [ParseError] malformed PDF.
|
|
42
|
+
# @raise [EncryptedError] wrong password / authentication failed.
|
|
43
|
+
def self.open(source, password: nil, &block)
  document = new(source, password: password)

  if block_given?
    # Block form: hand the document to the caller and always close it,
    # even when the block raises. The block's value is returned.
    begin
      yield(document)
    ensure
      document.close
    end
  else
    # Non-block form: the caller owns the document and must close it.
    document
  end
end
|
|
53
|
+
|
|
54
|
+
# One-shot: open + extract page text + close.
|
|
55
|
+
# @param source [String] path or bytes (see #open).
|
|
56
|
+
# @param page [Integer] 0-based page index (default 0).
|
|
57
|
+
# @return [String] extracted text.
|
|
58
|
+
def self.extract_text(source, page: 0)
  # rubocop:disable Security/Open — PdfDocument.open opens a PDF, not a process.
  open(source) do |document|
    # The block form of .open closes the document once the text is read.
    document.extract_text(page)
  end
  # rubocop:enable Security/Open
end
|
|
63
|
+
|
|
64
|
+
# Open a PDF. See {.open} for the block-form factory.
|
|
65
|
+
def initialize(source, password: nil)
  if source.nil?
    raise ::PdfOxide::ArgumentError, 'source cannot be nil'
  end

  opened_path, native = open_native(source)
  @path = opened_path
  @handle = native
  @closed = false

  # The finalizer only sees this one-element array, never `self`; #close
  # clears slot 0 so a later GC pass finds nothing left to free.
  @tracker = [native]
  ObjectSpace.define_finalizer(self, self.class.finalizer(@tracker))

  # NOTE(review): the boolean returned by #authenticate is discarded here,
  # so a failed authentication does not raise from the constructor — confirm
  # whether that is intended.
  password && authenticate(password)
end
|
|
77
|
+
|
|
78
|
+
# @return [FFI::Pointer] raw handle for sibling classes
|
|
79
|
+
# (MarkdownConverter, AutoExtractor, PdfValidator, PdfSigner)
|
|
80
|
+
# that need to pass the pointer to their own FFI calls.
|
|
81
|
+
# @raise [InvalidStateError] document has been closed.
|
|
82
|
+
def handle
  # Only hand out the pointer while the document is open and still holds one.
  usable = !@closed && !@handle.nil?
  return @handle if usable

  raise InvalidStateError, 'PdfDocument has been closed'
end
|
|
87
|
+
|
|
88
|
+
# Authenticate against this document's encryption.
|
|
89
|
+
# @param password [String]
|
|
90
|
+
# @return [Boolean] true on success / unencrypted; false on wrong password.
|
|
91
|
+
def authenticate(password)
  if password.nil?
    raise ::PdfOxide::ArgumentError, 'password cannot be nil'
  end

  # No native unlock call is made in this version: an unencrypted document
  # reports success, an encrypted one always reports failure (fail closed).
  !encrypted?
end
|
|
103
|
+
|
|
104
|
+
# @return [Integer] number of pages.
|
|
105
|
+
def page_count
  status = ::FFI::MemoryPointer.new(:int32)
  # The count is returned only after the native status code has been checked.
  Bindings.pdf_document_get_page_count(handle, status).tap do
    raise_for_code(status.read_int32, 'page_count')
  end
end
|
|
111
|
+
|
|
112
|
+
# @return [String] PDF version string (e.g. "1.7").
|
|
113
|
+
def pdf_version
  begin
    # Two one-byte out-parameters receive the major and minor version.
    major_out = ::FFI::MemoryPointer.new(:uint8)
    minor_out = ::FFI::MemoryPointer.new(:uint8)
    Bindings.pdf_document_get_version(handle, major_out, minor_out)
    format('%d.%d', major_out.read_uint8, minor_out.read_uint8)
  rescue ::FFI::NotFoundError
    'unknown'
  end
end
|
|
121
|
+
|
|
122
|
+
# @return [Boolean] whether this PDF carries an encryption dictionary.
|
|
123
|
+
def encrypted?
  # Native signature: bool pdf_document_is_encrypted(const PdfDocument *handle).
  # It takes the handle only — no error out-parameter is passed.
  native = handle
  Bindings.pdf_document_is_encrypted(native)
end
|
|
129
|
+
|
|
130
|
+
# Extract plain text from a single page.
|
|
131
|
+
# @param page_index [Integer] 0-based page index.
|
|
132
|
+
# @return [String] extracted text (empty for pages with no text layer).
|
|
133
|
+
def extract_text(page_index)
  validate_page_index(page_index)

  status = ::FFI::MemoryPointer.new(:int32)
  c_string = Bindings.pdf_document_extract_text(handle, page_index, status)
  raise_for_code(status.read_int32, 'extract_text')

  # A missing string from the marshaller is reported as empty text.
  text = StringMarshaller.from_c_string(c_string)
  return text if text

  ''
end
|
|
140
|
+
|
|
141
|
+
# Auto-routed extraction for a single page (v0.3.51 #517).
|
|
142
|
+
# Returns native text where present, OCR'd text for scanned regions
|
|
143
|
+
# when the `ocr` feature is available, and gracefully falls back to
|
|
144
|
+
# native + empty/partial text when OCR is not available — never
|
|
145
|
+
# raises an "OCR unavailable" error on this path.
|
|
146
|
+
# @param page_index [Integer] 0-based.
|
|
147
|
+
# @return [String] extracted text.
|
|
148
|
+
def extract_text_auto(page_index)
  validate_page_index(page_index)

  status = ::FFI::MemoryPointer.new(:int32)
  c_string = Bindings.pdf_document_extract_text_auto(handle, page_index, status)
  raise_for_code(status.read_int32, 'extract_text_auto')

  # A missing string from the marshaller is reported as empty text.
  text = StringMarshaller.from_c_string(c_string)
  return text if text

  ''
end
|
|
155
|
+
|
|
156
|
+
# Convert one page to Markdown.
|
|
157
|
+
# @param page_index [Integer, nil] page index; when omitted (nil) the converter is called without a page argument.
|
|
158
|
+
# @return [String] Markdown.
|
|
159
|
+
def to_markdown(page_index = nil)
  # Without a page index the converter is called with the document alone.
  return MarkdownConverter.to_markdown(self) if page_index.nil?

  MarkdownConverter.to_markdown(self, page_index)
end
|
|
162
|
+
|
|
163
|
+
# Convert one page to HTML.
|
|
164
|
+
# @param page_index [Integer, nil] page index; when omitted (nil) the converter is called without a page argument.
|
|
165
|
+
# @return [String] HTML.
|
|
166
|
+
def to_html(page_index = nil)
  # Without a page index the converter is called with the document alone.
  return MarkdownConverter.to_html(self) if page_index.nil?

  MarkdownConverter.to_html(self, page_index)
end
|
|
169
|
+
|
|
170
|
+
# Search this document.
|
|
171
|
+
# @param query [String] literal text (or regex when `regex: true`).
|
|
172
|
+
# @param case_sensitive [Boolean]
|
|
173
|
+
# @param regex [Boolean] interpret query as a regex.
|
|
174
|
+
# @return [Array<Hash>] each match has keys :page, :text, :bbox
|
|
175
|
+
# (where :bbox is a Hash with :x, :y, :width, :height).
|
|
176
|
+
def search(query, case_sensitive: false, regex: false)
  raise ::PdfOxide::ArgumentError, 'query cannot be nil' if query.nil?

  # Regex search is optional in the native library; refuse up front when
  # the binding for it was not attached.
  if regex && !Bindings.respond_to?(:pdf_document_search_regex)
    raise UnsupportedFeatureError, 'regex search not supported by this cdylib build'
  end

  status = ::FFI::MemoryPointer.new(:int32)
  needle = StringMarshaller.to_utf8(query)

  if regex
    matches = Bindings.pdf_document_search_regex(handle, needle, case_sensitive, status)
  else
    matches = Bindings.pdf_document_search_all(handle, needle, case_sensitive, status)
  end

  raise_for_code(status.read_int32, 'search')
  parse_search_results(matches)
end
|
|
191
|
+
|
|
192
|
+
# @return [Array<Hash>] AcroForm fields as an array of `{name:, value:, type:, page:}`
|
|
193
|
+
# hashes. v0.3.55 limitation: per-field `page` is -1 because
|
|
194
|
+
# pdf_oxide's form extractor doesn't yet surface per-field page
|
|
195
|
+
# placement; field is identified by `name`. When the cdylib
|
|
196
|
+
# build lacks the form-extract accessor, returns `[]` rather
|
|
197
|
+
# than raising — the simple-PDF case is "no form fields".
|
|
198
|
+
def form_fields
  return [] unless Bindings.respond_to?(:pdf_document_get_form_fields)

  # Load JSON before anything that can raise. The parse is rescued with
  # `JSON::ParserError`; if that constant were resolved while JSON is still
  # unloaded, the real error would be replaced by a NameError.
  require 'json'

  err = ::FFI::MemoryPointer.new(:int32)
  ptr = begin
    Bindings.pdf_document_get_form_fields(handle, err)
  rescue ::ArgumentError
    # Phantom 8-pointer skeleton — graceful empty.
    return []
  end
  raise_for_code(err.read_int32, 'form_fields')
  return [] if ptr.nil? || ptr.null?

  json = StringMarshaller.from_c_string(ptr) || ''
  return [] if json.empty?

  # Only the parse itself is best-effort: malformed JSON yields no fields.
  parsed = begin
    JSON.parse(json)
  rescue JSON::ParserError
    return []
  end

  # Anything other than an array of objects is treated as "no form fields".
  fields = parsed.is_a?(Array) ? parsed.grep(Hash) : []
  fields.map do |field|
    {
      name: field['name'],
      value: field['value'],
      type: field['type'],
      # -1 marks a field whose page placement is not reported.
      page: field.fetch('page', -1)
    }
  end
end
|
|
227
|
+
|
|
228
|
+
# Render a single page to PNG bytes at the supplied DPI.
|
|
229
|
+
# @param page_index [Integer]
|
|
230
|
+
# @param dpi [Integer] resolution (default 150).
|
|
231
|
+
# @return [String] PNG-encoded image bytes (BINARY).
|
|
232
|
+
def render(page_index, dpi: 150)
  validate_page_index(page_index)
  err = ::FFI::MemoryPointer.new(:int32)
  # The native call takes a zoom factor; dpi / 72 converts the requested DPI.
  img_ptr = Bindings.pdf_render_page_zoom(handle, page_index, dpi.to_f / 72.0, 0, err)
  raise_for_code(err.read_int32, 'render')
  raise InternalError, 'render returned null' if img_ptr.nil? || img_ptr.null?

  begin
    # `String#b` returns a fresh BINARY copy, so a frozen string coming back
    # from the reader (e.g. a literal under `frozen_string_literal`) cannot
    # trigger FrozenError the way `force_encoding` would.
    read_rendered_image_bytes(img_ptr).b
  ensure
    # Release the rendered-image handle even when reading its bytes raises.
    Bindings.pdf_rendered_image_free(img_ptr) if Bindings.respond_to?(:pdf_rendered_image_free)
  end
end
|
|
246
|
+
|
|
247
|
+
# @return [PdfPage] a lightweight view of the page at `index`.
|
|
248
|
+
# The page borrows from this document; using it after the doc
|
|
249
|
+
# closes raises `InvalidStateError`.
|
|
250
|
+
def page(index)
  # Validate first so a bad index fails before a page view is built.
  validate_page_index(index)

  view = PdfPage.new(self, index)
  view
end
|
|
254
|
+
|
|
255
|
+
# @return [Array<PdfPage>] every page in the document (eager).
|
|
256
|
+
def pages
  # One lightweight view per page, built eagerly in index order.
  Array.new(page_count) { |position| PdfPage.new(self, position) }
end
|
|
260
|
+
|
|
261
|
+
# Convenience accessor: get the configured {AutoExtractor} for this doc.
|
|
262
|
+
# @return [AutoExtractor]
|
|
263
|
+
def auto_extractor
  # Memoized: the extractor is built on first use and reused afterwards.
  return @auto_extractor if @auto_extractor

  @auto_extractor = AutoExtractor.new(self)
end
|
|
266
|
+
|
|
267
|
+
# Free the native handle. Idempotent — calling more than once is a
|
|
268
|
+
# no-op, not a crash. Safe to call from an ensure block.
|
|
269
|
+
def close
  return if @closed

  # Detach the pointer and mark the document closed before freeing, so the
  # object never exposes a handle that is about to be released.
  native, @handle = @handle, nil
  @closed = true

  # Clear the finalizer's slot so a later GC pass has nothing to free.
  @tracker && (@tracker[0] = nil)

  return unless native
  return if native.null?

  Bindings.pdf_document_free(native)
end
|
|
279
|
+
|
|
280
|
+
# @return [Boolean] true if {#close} has not been called.
|
|
281
|
+
def open?
  # Inverse of the closed flag, always reported as a strict boolean.
  @closed ? false : true
end
|
|
284
|
+
|
|
285
|
+
# @return [Boolean] true after {#close}.
|
|
286
|
+
def closed?
  # Reports the flag set by #close (false from construction until then).
  @closed
end
|
|
289
|
+
|
|
290
|
+
# Finalizer for GC cleanup. The mutable tracker lets explicit
|
|
291
|
+
# `#close` zero out the handle so a follow-up GC pass doesn't
|
|
292
|
+
# double-free (the cdylib's `pdf_document_free` is not idempotent
|
|
293
|
+
# on the same pointer).
|
|
294
|
+
# @api private
|
|
295
|
+
def self.finalizer(tracker)
  # Built in a class method so the proc closes over `tracker` only.
  proc do
    leaked = tracker[0]
    # Nothing to do when #close already cleared the slot or the pointer is null.
    next unless leaked && !leaked.null?

    Bindings.pdf_document_free(leaked)
    tracker[0] = nil
  end
end
|
|
304
|
+
|
|
305
|
+
private
|
|
306
|
+
|
|
307
|
+
# Open the native document for `source`.
#
# @param source [String] filesystem path, or raw PDF bytes starting with `%PDF`.
# @return [Array] two-element `[path, handle]` pair; `path` is the absolute
#   path for file sources and `'<in-memory>'` for byte sources.
# @raise [FileNotFoundError] source is neither an existing path nor PDF bytes.
# @raise [ParseError] the native open call returned a null handle.
def open_native(source)
  err = ::FFI::MemoryPointer.new(:int32)
  is_string = source.is_a?(String)
  bytes = is_string ? source.b : nil

  handle, path =
    # File.exist? raises ::ArgumentError on strings containing NUL, which raw
    # PDF bytes routinely do — so only path-like strings are probed on disk.
    if is_string && !bytes.include?("\0") && File.exist?(source)
      absolute = File.absolute_path(source)
      [Bindings.pdf_document_open(absolute, err), absolute]
    elsif is_string && bytes.start_with?('%PDF')
      # in-memory PDF bytes
      mem = ::FFI::MemoryPointer.new(:uint8, bytes.bytesize)
      mem.write_bytes(bytes, 0, bytes.bytesize)
      [Bindings.pdf_document_open_from_bytes(mem, bytes.bytesize, err), '<in-memory>']
    else
      raise FileNotFoundError, "file not found: #{source}"
    end

  code = err.read_int32
  raise_for_code(code, 'open') if code != 0
  raise ParseError, 'pdf_document_open returned null' if handle.nil? || handle.null?

  [path, handle]
end
|
|
328
|
+
|
|
329
|
+
# Check a caller-supplied page index before it reaches the native call.
# Only the lower bound is checked here; no upper-bound check against the
# page count is made in this method.
#
# @param idx [Numeric] 0-based page index.
# @return [nil]
# @raise [PdfOxide::ArgumentError] idx is nil / non-numeric, or negative.
def validate_page_index(idx)
  # nil, strings, etc. used to surface as NoMethodError from `negative?`.
  raise ::PdfOxide::ArgumentError, 'page_index must be numeric' unless idx.respond_to?(:negative?)
  raise ::PdfOxide::ArgumentError, 'page_index must be >= 0' if idx.negative?
end
|
|
336
|
+
|
|
337
|
+
# Convert a native search-results handle into plain Ruby hashes and free it.
#
# @param results_handle [FFI::Pointer, nil] handle returned by a search call.
# @return [Array<Hash>] one `{page:, text:, bbox: {x:, y:, width:, height:}}`
#   hash per match; `[]` for a nil/null handle.
def parse_search_results(results_handle)
  return [] if results_handle.nil? || results_handle.null?

  begin
    # NOTE(review): `err` is passed to each accessor but never inspected —
    # confirm whether per-result failures should be surfaced.
    err = ::FFI::MemoryPointer.new(:int32)
    count = Bindings.pdf_oxide_search_result_count(results_handle)
    Array.new(count) do |i|
      page = Bindings.pdf_oxide_search_result_get_page(results_handle, i, err)
      text_ptr = Bindings.pdf_oxide_search_result_get_text(results_handle, i, err)
      text = StringMarshaller.from_c_string(text_ptr) || ''
      # Four float out-parameters receive the match's bounding box.
      x = ::FFI::MemoryPointer.new(:float)
      y = ::FFI::MemoryPointer.new(:float)
      w = ::FFI::MemoryPointer.new(:float)
      h = ::FFI::MemoryPointer.new(:float)
      Bindings.pdf_oxide_search_result_get_bbox(results_handle, i, x, y, w, h, err)
      { page: page,
        text: text,
        bbox: { x: x.read_float, y: y.read_float, width: w.read_float, height: h.read_float } }
    end
  ensure
    # Free the native result set even when an accessor raises mid-loop.
    Bindings.pdf_oxide_search_result_free(results_handle)
  end
end
|
|
358
|
+
|
|
359
|
+
# Copy the encoded bytes out of a rendered-image handle.
#
# @param img_ptr [FFI::Pointer] rendered-image handle.
# @return [String] a mutable BINARY string; empty when the byte accessor is
#   not attached in this build or it returns a null buffer.
def read_rendered_image_bytes(img_ptr)
  # `String.new` yields a fresh, unfrozen BINARY string. A bare `''` literal
  # would be frozen under `frozen_string_literal` and break callers that
  # re-tag the encoding in place.
  return String.new unless Bindings.respond_to?(:pdf_oxide_rendered_image_get_bytes)

  len_ptr = ::FFI::MemoryPointer.new(:size_t)
  err = ::FFI::MemoryPointer.new(:int32)
  buf = Bindings.pdf_oxide_rendered_image_get_bytes(img_ptr, len_ptr, err)
  raise_for_code(err.read_int32, 'render_bytes')
  return String.new if buf.nil? || buf.null?

  begin
    buf.read_string(len_ptr.read(:size_t))
  ensure
    # Release the native byte buffer even if copying it out raises.
    Bindings.free_bytes(buf) if Bindings.respond_to?(:free_bytes)
  end
end
|
|
381
|
+
|
|
382
|
+
# Map a cdylib error code (`int32_t *err`) to the matching Ruby
|
|
383
|
+
# exception. MUST stay byte-for-byte identical to src/ffi.rs:98-106
|
|
384
|
+
# — the same 9-code surface the PHP, C#, and Go bindings use.
|
|
385
|
+
#
|
|
386
|
+
# Pre-v0.3.55 had alphabetical-natural mapping
|
|
387
|
+
# ({@code 4 => StateError, 5 => PermissionError, 6 =>
|
|
388
|
+
# UnsupportedFeatureError, 8 => SignatureError, …}) which silently
|
|
389
|
+
# mismapped against the cdylib's wire format — cdylib returned 4
|
|
390
|
+
# (ERR_EXTRACTION) and Ruby raised StateError; returned 8
|
|
391
|
+
# (ERR_UNSUPPORTED) and Ruby raised SignatureError. Same bug C#
|
|
392
|
+
# already fixed in an earlier release; this brings Ruby into
|
|
393
|
+
# line with PHP's ErrorHandler::createException (1-to-1 dispatch).
|
|
394
|
+
def raise_for_code(code, op)
  return if code == 0

  # Codes that share a Ruby exception class are grouped; the trailing
  # comments name the native code each number stands for.
  error_class =
    case code
    when 1, 6 then ::PdfOxide::ArgumentError           # ERR_INVALID_ARG, ERR_INVALID_PAGE
    when 2    then ::PdfOxide::IoError                 # ERR_IO
    when 3, 4 then ::PdfOxide::ParseError              # ERR_PARSE, ERR_EXTRACTION
    when 7    then ::PdfOxide::SearchError             # ERR_SEARCH
    when 8    then ::PdfOxide::UnsupportedFeatureError # _ERR_UNSUPPORTED
    else           ::PdfOxide::InternalError           # ERR_INTERNAL (5) and unknown codes
    end

  raise error_class, "#{op} failed (error code #{code})"
end
|
|
410
|
+
end
|
|
411
|
+
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PdfOxide
|
|
4
|
+
# A page within a {PdfDocument}, identified by 0-based page index.
|
|
5
|
+
#
|
|
6
|
+
# Mirrors `fyi.oxide.pdf.PdfPage`. Lightweight view — holds no
|
|
7
|
+
# native handle of its own; it borrows from its parent document.
|
|
8
|
+
# Operations after the parent's `#close` raise `InvalidStateError`.
|
|
9
|
+
#
|
|
10
|
+
# Construct via {PdfDocument#page} or {PdfDocument#pages}.
|
|
11
|
+
class PdfPage
  # @return [PdfDocument] the document this page view reads from.
  attr_reader :parent

  # @return [Integer] page index as given at construction.
  attr_reader :index

  # @api private (use {PdfDocument#page})
  # @param parent [PdfDocument] owning document; must not be nil.
  # @param index [Integer] page index, stored as given.
  def initialize(parent, index)
    if parent.nil?
      raise ::PdfOxide::ArgumentError, 'parent cannot be nil'
    end

    @parent = parent
    @index = index
  end

  # @return [Float] the :width entry of {#media_box}.
  def width
    box = media_box
    box[:width]
  end

  # @return [Float] the :height entry of {#media_box}.
  def height
    box = media_box
    box[:height]
  end

  # @return [Hash] { x:, y:, width:, height: }. Currently a fixed
  #   all-zero rectangle; no native call is made.
  def media_box
    { x: 0.0, y: 0.0, width: 0.0, height: 0.0 }
  end

  # @return [Hash] same shape as {#media_box}; currently delegates to it.
  def crop_box
    media_box
  end

  # @return [Integer] page rotation in degrees; currently always 0.
  def rotation
    0
  end

  # Extract this page's text by delegating to the parent document.
  # @return [String]
  def text
    parent.extract_text(index)
  end

  # @return [String] short label of the form `#<PdfOxide::PdfPage index=N>`.
  #   Use {#text} for the page's extracted text.
  def to_s
    "#<PdfOxide::PdfPage index=#{index}>"
  end
  alias_method :inspect, :to_s
end
|
|
71
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PdfOxide
|
|
4
|
+
# Process-global crypto-governance policy (v0.3.50 #230).
|
|
5
|
+
#
|
|
6
|
+
# Mirrors `fyi.oxide.pdf.PdfPolicy`. Selects which cryptographic
|
|
7
|
+
# algorithms are accepted for reads and writes. Composes with the
|
|
8
|
+
# build-time feature flags (`legacy-crypto`, `fips`) — if a build
|
|
9
|
+
# lacks `legacy-crypto`, COMPAT can't enable RC4/MD5-KDF regardless
|
|
10
|
+
# of policy.
|
|
11
|
+
#
|
|
12
|
+
# **Set-once semantics.** pdf_oxide installs the policy at most
|
|
13
|
+
# once per process: call {.set} **before** any other pdf_oxide
|
|
14
|
+
# operation. A second `.set` call — or one after any document has
|
|
15
|
+
# been opened — raises with a message containing "already set".
|
|
16
|
+
module PdfPolicy
  # Mode symbol => ordinal passed to the native library.
  MODES = { compat: 0, strict: 1, fips_strict: 2 }.freeze
  # Reverse lookup: ordinal => mode symbol.
  ORDINAL_TO_MODE = MODES.invert.freeze

  module_function

  # @return [Symbol] the current policy mode (:compat / :strict / :fips_strict).
  #   Falls back to :compat when the accessor is not attached, returns nil,
  #   or reports an ordinal that is not in {MODES}.
  def current
    ordinal =
      if Bindings.respond_to?(:pdf_oxide_policy_current_ordinal)
        Bindings.pdf_oxide_policy_current_ordinal
      end
    ORDINAL_TO_MODE.fetch(ordinal || 0, :compat)
  rescue ::FFI::NotFoundError
    :compat
  end

  # Set the policy mode through the native library.
  # @param mode [Symbol] one of the keys of {MODES}.
  # @return [Symbol] the mode that was set.
  # @raise [PdfOxide::ArgumentError] unknown mode.
  # @raise [UnsupportedFeatureError] the setter is not attached in this build.
  # @raise [InternalError] the native setter returned a non-zero code.
  def set(mode)
    unless MODES.key?(mode)
      raise ::PdfOxide::ArgumentError, "mode must be one of #{MODES.keys.inspect}, got #{mode.inspect}"
    end
    unless Bindings.respond_to?(:pdf_oxide_policy_set_by_ordinal)
      raise UnsupportedFeatureError, 'policy not supported by this cdylib build'
    end

    status = Bindings.pdf_oxide_policy_set_by_ordinal(MODES[mode])
    raise InternalError, 'policy already set' unless status == 0

    mode
  end

  # @return [Symbol] the :compat mode symbol.
  def compat
    :compat
  end

  # @return [Symbol] the :strict mode symbol.
  def strict
    :strict
  end

  # @return [Symbol] the :fips_strict mode symbol.
  def fips_strict
    :fips_strict
  end
end
|
|
64
|
+
end
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PdfOxide
|
|
4
|
+
# PAdES B-B / B-T / B-LT / B-LTA digital-signature signer
|
|
5
|
+
# (v0.3.50 #235 + v0.3.51 5-arg shim).
|
|
6
|
+
#
|
|
7
|
+
# Mirrors `fyi.oxide.pdf.PdfSigner`. Routes every sign through the
|
|
8
|
+
# 5-arg shim `pdf_sign_bytes_pades_opts` (the 18-arg legacy entry
|
|
9
|
+
# exists but isn't exercised here — purego on SysV/AMD64 can't
|
|
10
|
+
# register it).
|
|
11
|
+
#
|
|
12
|
+
# Per `feedback_extraction_graceful_fallback`: signing is a
|
|
13
|
+
# **security operation** — every non-zero return fails closed.
|
|
14
|
+
class PdfSigner
|
|
15
|
+
# PAdES baseline level codes (mirrors Java's `SignatureLevel` enum).
|
|
16
|
+
LEVELS = { b: 0, t: 1, lt: 2, lta: 3 }.freeze
|
|
17
|
+
|
|
18
|
+
# C struct mirroring `PadesSignOptionsC` (no packing is declared on this layout). Field order +
|
|
19
|
+
# types MUST match the C header exactly — `#[repr(C)]` on the Rust
|
|
20
|
+
# side guarantees layout stability across platforms.
|
|
21
|
+
class PadesSignOptions < ::FFI::Struct
  # Field order is significant. Each list is a (pointers, lengths, count)
  # triple; the strings are passed as pointers.
  layout :certificate_handle, :pointer,
         :certs, :pointer, :cert_lens, :pointer, :n_certs, :size_t,
         :crls, :pointer, :crl_lens, :pointer, :n_crls, :size_t,
         :ocsps, :pointer, :ocsp_lens, :pointer, :n_ocsps, :size_t,
         :tsa_url, :pointer,
         :reason, :pointer,
         :location, :pointer,
         :level, :int32
end
|
|
39
|
+
|
|
40
|
+
# @param certificate_handle [FFI::Pointer] PKCS#12 or PEM-loaded
|
|
41
|
+
# credentials handle (opaque pointer from the credentials API).
|
|
42
|
+
def initialize(certificate_handle)
  # Reject both a missing argument and a null native pointer.
  missing = certificate_handle.nil? || certificate_handle.null?
  raise ::PdfOxide::ArgumentError, 'certificate_handle required' if missing

  @certificate_handle = certificate_handle
end
|
|
47
|
+
|
|
48
|
+
# Sign a PDF (bytes) at the requested PAdES level.
|
|
49
|
+
# @param pdf [String] raw PDF (BINARY).
|
|
50
|
+
# @param level [Symbol] :b, :t, :lt, or :lta.
|
|
51
|
+
# @param tsa_url [String, nil] RFC 3161 TSA URL (required for ≥ :t).
|
|
52
|
+
# @param reason [String, nil]
|
|
53
|
+
# @param location [String, nil]
|
|
54
|
+
# @return [String] BINARY-encoded signed PDF bytes.
|
|
55
|
+
def sign(pdf, level:, tsa_url: nil, reason: nil, location: nil)
  raise ::PdfOxide::ArgumentError, 'pdf cannot be empty' if pdf.nil? || pdf.empty?

  unless LEVELS.key?(level)
    raise ::PdfOxide::ArgumentError, "level must be one of #{LEVELS.keys.inspect}, got #{level.inspect}"
  end

  # Every level other than :b needs a non-empty TSA URL.
  tsa_missing = tsa_url.nil? || tsa_url.empty?
  raise ::PdfOxide::ArgumentError, "PAdES #{level} requires tsa_url" if level != :b && tsa_missing

  self.class.sign_with_handle(
    pdf,
    certificate_handle: @certificate_handle,
    level_code: LEVELS[level],
    tsa_url: tsa_url,
    reason: reason,
    location: location
  )
end
|
|
74
|
+
|
|
75
|
+
# Static convenience — sign without constructing a Signer instance.
|
|
76
|
+
# @return [String]
|
|
77
|
+
def self.sign(pdf:, certificate_handle:, level:, tsa_url: nil, reason: nil, location: nil)
  # Build a throwaway signer and delegate to the instance method.
  signer = new(certificate_handle)
  signer.sign(pdf, level: level, tsa_url: tsa_url, reason: reason, location: location)
end
|
|
80
|
+
|
|
81
|
+
# @return [Integer, nil] the PAdES level of an existing signature
|
|
82
|
+
# handle, or nil if no signatures.
|
|
83
|
+
def self.pades_level(signature_handle)
  if signature_handle.nil? || signature_handle.null?
    raise ::PdfOxide::ArgumentError, 'signature_handle required'
  end

  status = ::FFI::MemoryPointer.new(:int32)
  level = Bindings.pdf_signature_get_pades_level(signature_handle, status)
  code = status.read_int32
  # The value is returned only when the native call reports success.
  return level if code == 0

  raise SignatureError, "pdf_signature_get_pades_level failed (#{code})"
end
|
|
93
|
+
|
|
94
|
+
# @return [Boolean] whether the doc carries a document-scoped /DocTimeStamp.
|
|
95
|
+
# @param document_handle [FFI::Pointer] native document handle.
# @return [Boolean] whether the native call reports a timestamp.
# @raise [PdfOxide::ArgumentError] handle is nil or null.
# @raise [SignatureError] the native call reported a non-zero error code.
def self.document_has_timestamp?(document_handle)
  raise ::PdfOxide::ArgumentError, 'document_handle required' if document_handle.nil? || document_handle.null?

  err = ::FFI::MemoryPointer.new(:int32)
  r = Bindings.pdf_document_has_timestamp(document_handle, err)
  code = err.read_int32
  raise SignatureError, "pdf_document_has_timestamp failed (#{code})" if code != 0

  # NOTE(review): the binding's declared return type is not visible here.
  # A plain `r != 0` would be true for a Ruby `false` if the function is
  # attached as :bool, so both integer and boolean results are accepted.
  r == true || (r.is_a?(Integer) && !r.zero?)
end
|
|
105
|
+
|
|
106
|
+
# @api private — packs PadesSignOptionsC and invokes the 5-arg shim.
|
|
107
|
+
def self.sign_with_handle(pdf, certificate_handle:, level_code:, tsa_url:, reason:, location:)
  # Copy the PDF into native memory as raw bytes.
  payload = pdf.dup.force_encoding(Encoding::BINARY)
  payload_size = payload.bytesize
  input = ::FFI::MemoryPointer.new(:uint8, payload_size)
  input.write_bytes(payload, 0, payload_size)

  # These locals keep the native string buffers referenced until the
  # signing call below has returned.
  tsa_mem = string_ptr(tsa_url)
  reason_mem = string_ptr(reason)
  location_mem = string_ptr(location)

  options = PadesSignOptions.new
  options[:certificate_handle] = certificate_handle
  # No extra certificates, CRLs or OCSP responses are supplied.
  %i[certs cert_lens crls crl_lens ocsps ocsp_lens].each do |field|
    options[field] = ::FFI::Pointer::NULL
  end
  %i[n_certs n_crls n_ocsps].each { |field| options[field] = 0 }
  options[:tsa_url] = tsa_mem || ::FFI::Pointer::NULL
  options[:reason] = reason_mem || ::FFI::Pointer::NULL
  options[:location] = location_mem || ::FFI::Pointer::NULL
  options[:level] = level_code

  signed_len = ::FFI::MemoryPointer.new(:size_t)
  status = ::FFI::MemoryPointer.new(:int32)
  signed_ptr = Bindings.pdf_sign_bytes_pades_opts(input, payload_size, options.to_ptr, signed_len, status)
  code = status.read_int32

  # Fail closed: a non-zero code or a null result both raise.
  unless code == 0
    raise SignatureError, "pdf_sign_bytes_pades_opts failed (#{code}); security op fails closed"
  end
  if signed_ptr.nil? || signed_ptr.null?
    raise SignatureError, 'pdf_sign_bytes_pades_opts returned null; security op fails closed'
  end

  result = signed_ptr.read_string(signed_len.read(:size_t))
  Bindings.free_bytes(signed_ptr) if Bindings.respond_to?(:free_bytes)
  result.force_encoding(Encoding::BINARY)
end
|
|
147
|
+
|
|
148
|
+
def self.string_ptr(str)
  # nil stays nil so the caller can substitute a NULL pointer; anything
  # else is stringified, transcoded to UTF-8 and copied into native memory.
  str.nil? ? nil : ::FFI::MemoryPointer.from_string(str.to_s.encode('UTF-8'))
end
|
|
153
|
+
private_class_method :string_ptr
|
|
154
|
+
end
|
|
155
|
+
end
|