pdf_oxide 0.3.55-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/LICENSE +176 -0
- data/LICENSE-APACHE +176 -0
- data/LICENSE-MIT +25 -0
- data/README.md +122 -0
- data/ext/pdf_oxide/libpdf_oxide.so +0 -0
- data/lib/pdf_oxide/auto_extractor.rb +157 -0
- data/lib/pdf_oxide/document_editor.rb +235 -0
- data/lib/pdf_oxide/errors.rb +58 -0
- data/lib/pdf_oxide/ffi/bindings.rb +1694 -0
- data/lib/pdf_oxide/ffi/library.rb +98 -0
- data/lib/pdf_oxide/ffi/string_marshaller.rb +45 -0
- data/lib/pdf_oxide/markdown_converter.rb +52 -0
- data/lib/pdf_oxide/pdf.rb +218 -0
- data/lib/pdf_oxide/pdf_document.rb +411 -0
- data/lib/pdf_oxide/pdf_page.rb +71 -0
- data/lib/pdf_oxide/pdf_policy.rb +64 -0
- data/lib/pdf_oxide/pdf_signer.rb +155 -0
- data/lib/pdf_oxide/pdf_validator.rb +97 -0
- data/lib/pdf_oxide/version.rb +5 -0
- data/lib/pdf_oxide.rb +60 -0
- metadata +198 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ffi'
|
|
4
|
+
require 'rbconfig'
|
|
5
|
+
|
|
6
|
+
module PdfOxide
|
|
7
|
+
module FFI
|
|
8
|
+
# Locates the native PDF Oxide shared library for the current platform.
#
# Lookup order used by {.find_library_path}:
#   1. a library bundled inside the gem under ext/pdf_oxide/ (any candidate
#      name),
#   2. for each candidate name in turn: system locations, then the
#      target/release and target/debug directories of a development
#      checkout, then the current working directory,
#   3. the bare library name, leaving resolution to the dynamic loader.
module Library
  # Candidate file names of the shared library for the current platform,
  # in order of preference.
  #
  # @return [Array<String>] library names to try loading
  # @raise [UnsupportedPlatformError] when `host_os` is not darwin, linux,
  #   mswin or mingw
  def self.find_library
    case RbConfig::CONFIG['host_os']
    when /darwin/
      %w[libpdf_oxide.dylib libpdf_oxide.0.dylib]
    when /linux/
      %w[libpdf_oxide.so libpdf_oxide.so.0]
    when /mswin|mingw/
      %w[pdf_oxide.dll libpdf_oxide.dll]
    else
      raise UnsupportedPlatformError, "Unsupported OS: #{RbConfig::CONFIG['host_os']}"
    end
  end

  # Memoized result of {.find_library_path}.
  #
  # @return [String] path (or bare name) of the native library
  def self.library_path
    @library_path ||= find_library_path
  end

  # Resolves the library location.
  #
  # The bundled copy is checked for every candidate name before anything
  # else, so a platform gem always loads the library it shipped with even
  # when a system-wide copy of another candidate name exists.
  #
  # @return [String] first existing path, or the first candidate name when
  #   nothing was found (the dynamic loader then searches its own paths)
  def self.find_library_path
    names = find_library

    bundled = names.map { |lib_name| bundled_library_path(lib_name) }
                   .find { |path| File.exist?(path) }
    return bundled if bundled

    names.each do |lib_name|
      located = system_find_library(lib_name) ||
                dev_checkout_paths(lib_name).find { |path| File.exist?(path) }
      return located if located
    end

    # Fallback to library name (system will search)
    names.first
  end

  # Path at which a platform-specific gem stages the library.
  #
  # @api private
  # @param lib_name [String]
  # @return [String] absolute path under ext/pdf_oxide/
  def self.bundled_library_path(lib_name)
    File.expand_path("../../../ext/pdf_oxide/#{lib_name}", __dir__)
  end

  # Locations probed for development checkouts, in priority order. The last
  # entry is the bare name, i.e. a file in the current working directory.
  #
  # @api private
  # @param lib_name [String]
  # @return [Array<String>]
  def self.dev_checkout_paths(lib_name)
    [
      File.expand_path("../../target/release/#{lib_name}", __dir__),
      File.expand_path("../../target/debug/#{lib_name}", __dir__),
      File.expand_path("../../../target/release/#{lib_name}", __dir__),
      File.expand_path("../../../target/debug/#{lib_name}", __dir__),
      lib_name
    ]
  end

  # Dispatches to the platform-specific system lookup.
  #
  # @param lib_name [String]
  # @return [String, nil] path found, or nil (also nil on unknown platforms)
  def self.system_find_library(lib_name)
    case RbConfig::CONFIG['host_os']
    when /darwin/
      ldconfig_search(lib_name) || homebrew_find(lib_name)
    when /linux/
      ldconfig_search(lib_name)
    when /mswin|mingw/
      windows_find(lib_name)
    end
  end

  # Looks the library up in the output of `ldconfig -p`.
  #
  # The command is run without a shell and the listing is filtered in Ruby,
  # so the library name is never interpreted as a regular expression or as
  # shell syntax. Any failure (for example `ldconfig` not being installed)
  # yields nil.
  #
  # @param lib_name [String]
  # @return [String, nil] path reported by ldconfig, or nil
  def self.ldconfig_search(lib_name)
    listing = IO.popen(%w[ldconfig -p], err: File::NULL, &:read)
    parse_ldconfig_listing(listing, lib_name)
  rescue StandardError
    nil
  end

  # Extracts the path for `lib_name` from an `ldconfig -p` listing.
  #
  # Only the name column (the first token of each `name (...) => path`
  # line) is compared. An exact match wins; otherwise the first entry whose
  # name is `lib_name` followed by a version suffix (`lib_name.`) is used.
  #
  # @api private
  # @param listing [String, nil] raw `ldconfig -p` output
  # @param lib_name [String]
  # @return [String, nil]
  def self.parse_ldconfig_listing(listing, lib_name)
    entries = listing.to_s.each_line.map do |line|
      name_part, arrow, path_part = line.partition('=>')
      soname = name_part.split.first
      path = path_part.strip
      # Header and malformed lines have no "name => path" shape; skip them.
      [soname, path] unless arrow.empty? || soname.nil? || path.empty?
    end.compact

    versioned_prefix = "#{lib_name}."
    match = entries.find { |soname, _| soname == lib_name } ||
            entries.find { |soname, _| soname.start_with?(versioned_prefix) }
    match&.last
  end

  # Looks for the library in `<brew --prefix>/lib`.
  #
  # @param lib_name [String]
  # @return [String, nil] existing path, or nil when brew is unavailable or
  #   the file does not exist
  def self.homebrew_find(lib_name)
    output = `brew --prefix 2>/dev/null`.strip
    return nil if output.empty?

    path = File.join(output, 'lib', lib_name)
    File.exist?(path) ? path : nil
  rescue StandardError
    nil
  end

  # Windows DLL search path is handled by the system, so nothing is probed.
  #
  # @return [nil]
  def self.windows_find(_lib_name)
    nil
  end
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PdfOxide
|
|
4
|
+
module FFI
|
|
5
|
+
# Moves strings across the Ruby / C ABI boundary as UTF-8.
#
# `*char` values returned by the cdylib are heap-allocated by Rust and have
# to be released through `free_string`. Handing them to `pdf_free` (the
# handle deallocator) corrupts the heap, so callers go through this module
# instead of dealing with that distinction themselves.
module StringMarshaller
  class << self
    # Produce a UTF-8 copy of a Ruby value for the C ABI. Invalid and
    # unconvertible characters are replaced rather than raising.
    #
    # @param ruby_string [String, nil] value to encode; non-strings are
    #   converted with `to_s`
    # @return [String, nil] UTF-8 string, or nil when given nil so callers
    #   can pass `nil` straight through
    def to_utf8(ruby_string)
      return nil if ruby_string.nil?

      text = ruby_string.to_s
      text.encode('UTF-8', invalid: :replace, undef: :replace)
    end

    # Copy a C string into a Ruby string tagged as UTF-8 and, by default,
    # release the native buffer afterwards (also when reading raises).
    #
    # @param ptr [FFI::Pointer]
    # @param free_after [Boolean] release via `free_string` after reading
    # @return [String, nil] UTF-8 Ruby string, or nil for a nil/null pointer
    def from_c_string(ptr, free_after: true)
      return nil if ptr.nil? || ptr.null?

      begin
        decoded = ptr.read_string
        decoded.force_encoding('UTF-8')
      ensure
        should_release = free_after && !ptr.null?
        free_c_string(ptr) if should_release
      end
    end

    # Release a `*char` returned by the cdylib. A nil or null pointer is
    # ignored, as is the case where `Bindings` exposes no `free_string`.
    #
    # @param ptr [FFI::Pointer, nil]
    def free_c_string(ptr)
      releasable = !ptr.nil? && !ptr.null? && Bindings.respond_to?(:free_string)
      Bindings.free_string(ptr) if releasable
    end
  end
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PdfOxide
|
|
4
|
+
# Static converters from a {PdfDocument} to Markdown or HTML.
#
# Mirrors `fyi.oxide.pdf.MarkdownConverter`. Stateless — every method takes
# the document as an argument. Per-page and whole-document variants are
# offered for both Markdown and HTML: pass a page index for a single page,
# or omit it (nil) for the whole document.
module MarkdownConverter
  module_function

  # Convert a page (or the whole document) to Markdown.
  #
  # @param doc [PdfDocument]
  # @param page_index [Integer, nil] when nil, converts the whole doc.
  # @return [String] Markdown ('' when the native call yields no string).
  # @raise [PdfOxide::ArgumentError] when doc is nil.
  # @raise [InternalError] when the native call reports a non-zero code.
  def to_markdown(doc, page_index = nil)
    convert(doc, 'to_markdown') do |handle, err|
      if page_index.nil?
        Bindings.pdf_document_to_markdown_all(handle, err)
      else
        Bindings.pdf_document_to_markdown(handle, page_index, err)
      end
    end
  end

  # Convert a page (or the whole document) to HTML.
  #
  # @param doc [PdfDocument]
  # @param page_index [Integer, nil] when nil, converts the whole doc.
  # @return [String] HTML ('' when the native call yields no string).
  # @raise [PdfOxide::ArgumentError] when doc is nil.
  # @raise [InternalError] when the native call reports a non-zero code.
  def to_html(doc, page_index = nil)
    convert(doc, 'to_html') do |handle, err|
      if page_index.nil?
        Bindings.pdf_document_to_html_all(handle, err)
      else
        Bindings.pdf_document_to_html(handle, page_index, err)
      end
    end
  end

  # Shared call sequence for both converters: validate the document, run
  # the native call supplied by the block, check the error code it wrote,
  # then marshal (and free) the returned C string.
  #
  # @api private
  # @param doc [PdfDocument]
  # @param label [String] operation name used in the error message.
  # @yieldparam handle the document's native handle (`doc.handle`).
  # @yieldparam err [FFI::MemoryPointer] int32 out-parameter for the code.
  # @yieldreturn [FFI::Pointer, nil] C string produced by the native call.
  # @return [String]
  def convert(doc, label)
    raise ::PdfOxide::ArgumentError, 'doc cannot be nil' if doc.nil?

    err = ::FFI::MemoryPointer.new(:int32)
    ptr = yield(doc.handle, err)
    code = err.read_int32
    raise InternalError, "#{label} failed (#{code})" if code != 0

    StringMarshaller.from_c_string(ptr) || ''
  end
  # module_function already makes the instance copy private; hide the
  # module-level copy too so the public surface stays the two converters.
  private_class_method :convert
end
|
|
52
|
+
end
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PdfOxide
|
|
4
|
+
# Create / edit / save PDFs. Read concerns live on {PdfDocument};
# mutate concerns on {DocumentEditor}; creation + transformation
# (markdown→PDF, html→PDF, text→PDF, image→PDF) live here.
#
# Mirrors `fyi.oxide.pdf.Pdf`. Lifecycle: instances own a native
# handle and **must be closed** via {#close} or the block form
# `Pdf.from_markdown(...) { |pdf| ... }`. Close is idempotent. A finalizer
# frees the handle of instances that were never closed.
class Pdf
  # ────────────────────── factories ──────────────────────

  # Build a PDF from a Markdown source.
  # @param markdown [String]
  # @yield [Pdf] optional; the PDF is closed when the block returns.
  # @return [Pdf, Object] the PDF, or the block's value in block form.
  # @raise [PdfOxide::ArgumentError] when markdown is nil or empty.
  # @raise [ParseError] when the native call fails.
  def self.from_markdown(markdown, &block)
    raise ::PdfOxide::ArgumentError, 'markdown cannot be empty' if markdown.nil? || markdown.empty?

    build_from(:pdf_from_markdown, markdown, &block)
  end

  # Build a PDF from an HTML source. CSS is honored per pdf_oxide's
  # html_css pipeline.
  # @param html [String]
  # @yield [Pdf] optional; the PDF is closed when the block returns.
  # @return [Pdf, Object] the PDF, or the block's value in block form.
  # @raise [PdfOxide::ArgumentError] when html is nil or empty.
  # @raise [ParseError] when the native call fails.
  def self.from_html(html, &block)
    raise ::PdfOxide::ArgumentError, 'html cannot be empty' if html.nil? || html.empty?

    build_from(:pdf_from_html, html, &block)
  end

  # Build a PDF from plain text.
  # @param text [String]
  # @yield [Pdf] optional; the PDF is closed when the block returns.
  # @return [Pdf, Object] the PDF, or the block's value in block form.
  # @raise [PdfOxide::ArgumentError] when text is nil or empty.
  # @raise [ParseError] when the native call fails.
  def self.from_text(text, &block)
    raise ::PdfOxide::ArgumentError, 'text cannot be empty' if text.nil? || text.empty?

    build_from(:pdf_from_text, text, &block)
  end

  # Build a single-page PDF from JPEG/PNG bytes.
  #
  # Only the FIRST image is used: this method makes one
  # `pdf_from_image_bytes` call, which takes a single image. Any further
  # images are ignored and a warning is printed so the loss is not silent.
  #
  # @param images [Array<String>, String] one or more image byte blobs
  #   (a single blob may be passed without wrapping it in an array).
  # @yield [Pdf] optional; the PDF is closed when the block returns.
  # @return [Pdf, Object] the PDF, or the block's value in block form.
  # @raise [PdfOxide::ArgumentError] when images (or the first image) is
  #   nil or empty.
  # @raise [ParseError] when the native call fails or returns null.
  def self.from_images(images, &block)
    raise ::PdfOxide::ArgumentError, 'images cannot be empty' if images.nil? || images.empty?

    blobs = Array(images)
    first = blobs.first
    raise ::PdfOxide::ArgumentError, 'image cannot be empty' if first.nil? || first.empty?

    if blobs.length > 1
      warn "PdfOxide::Pdf.from_images: only the first of #{blobs.length} images is used; " \
           'multi-image PDFs are not supported yet'
    end

    # Copy as BINARY so byte counts and the native write are encoding-independent.
    binary = first.b
    buf = ::FFI::MemoryPointer.new(:uint8, binary.bytesize)
    buf.write_bytes(binary, 0, binary.bytesize)
    err = ::FFI::MemoryPointer.new(:int32)
    handle = Bindings.pdf_from_image_bytes(buf, binary.bytesize, err)
    code = err.read_int32
    raise ParseError, "pdf_from_image_bytes failed (#{code})" if code != 0
    raise ParseError, 'pdf_from_image_bytes returned null' if handle.nil? || handle.null?

    adopt(handle, &block)
  end

  # Create a PDF from a single space of text. Convenience for tests /
  # toolchain bring-up.
  # @yield [Pdf] optional; the PDF is closed when the block returns.
  # @return [Pdf, Object]
  def self.create_empty(&block)
    from_text(' ', &block)
  end

  # @return [String] the value of `PdfOxide::VERSION`.
  def self.version
    PdfOxide::VERSION
  end

  # Prefetch OCR models for the given languages.
  # @param languages [Array<String>, String] BCP-47 / ISO tags; joined
  #   with commas before being passed to the native call.
  # @return [String] cache directory path (may be empty on no-OCR builds).
  # @raise [InternalError] when the native call reports a non-zero code.
  def self.prefetch_models(languages)
    csv = Array(languages).join(',')
    err = ::FFI::MemoryPointer.new(:int32)
    ptr = Bindings.pdf_oxide_prefetch_models(csv, err)
    code = err.read_int32
    raise InternalError, "prefetch_models failed (#{code})" if code != 0

    StringMarshaller.from_c_string(ptr) || ''
  end

  # @return [Boolean] whether the build supports OCR model provisioning.
  def self.prefetch_available?
    Bindings.pdf_oxide_prefetch_available != 0
  end

  # Call the named `Bindings` function with `content` and wrap the handle
  # it returns.
  #
  # @api private (factory helper)
  # @param symbol [Symbol] name of the `Bindings` function to invoke.
  # @param content [String] source passed to the native call.
  # @yield [Pdf] optional; the PDF is closed when the block returns.
  # @return [Pdf, Object] the PDF, or the block's value in block form.
  # @raise [ParseError] on a non-zero error code or a null handle.
  def self.build_from(symbol, content, &block)
    err = ::FFI::MemoryPointer.new(:int32)
    handle = Bindings.send(symbol, content, err)
    code = err.read_int32
    raise ParseError, "#{symbol} failed (#{code})" if code != 0
    raise ParseError, "#{symbol} returned null" if handle.nil? || handle.null?

    adopt(handle, &block)
  end

  # Wrap a native handle in a Pdf. Without a block the Pdf is returned and
  # the caller owns it; with a block the Pdf is yielded and closed when the
  # block finishes, even if it raises.
  #
  # @api private
  # @param handle native PDF handle.
  # @return [Pdf, Object] the PDF, or the block's value in block form.
  def self.adopt(handle)
    pdf = new(handle)
    return pdf unless block_given?

    begin
      yield pdf
    ensure
      pdf.close
    end
  end
  private_class_method :adopt

  # @api private (use one of the factory methods)
  # @param handle native PDF handle owned by this instance.
  def initialize(handle)
    @handle = handle
    @closed = false
    # The finalizer must not capture `self`, so the handle is shared with it
    # through this one-element array; #close clears the slot.
    @tracker = [@handle]
    ObjectSpace.define_finalizer(self, self.class.finalizer(@tracker))
  end

  # @api private
  # @return native handle, or nil once {#close} has run.
  attr_reader :handle

  # Serialize the PDF into memory. The native buffer is released even when
  # reading it fails.
  #
  # @return [String] BINARY-encoded PDF bytes.
  # @raise [InvalidStateError] when the Pdf has been closed.
  # @raise [InternalError] on a non-zero error code, a null buffer, or a
  #   negative reported length.
  def to_bytes
    raise InvalidStateError, 'Pdf has been closed' if @closed

    len_ptr = ::FFI::MemoryPointer.new(:int32)
    err = ::FFI::MemoryPointer.new(:int32)
    buf = Bindings.pdf_save_to_bytes(@handle, len_ptr, err)
    code = err.read_int32
    raise InternalError, "pdf_save_to_bytes failed (#{code})" if code != 0
    raise InternalError, 'pdf_save_to_bytes returned null' if buf.nil? || buf.null?

    begin
      len = len_ptr.read_int32
      raise InternalError, "pdf_save_to_bytes returned invalid length (#{len})" if len.negative?

      buf.read_string(len).force_encoding(Encoding::BINARY)
    ensure
      Bindings.free_bytes(buf) if Bindings.respond_to?(:free_bytes)
    end
  end

  # Write the PDF bytes to `path`.
  # @param path [String]
  # @return [String] absolute path written.
  # @raise [InvalidStateError] when the Pdf has been closed.
  # @raise [PdfOxide::ArgumentError] when path is nil or empty.
  # @raise [IoError] when the native call reports failure.
  def save(path)
    raise InvalidStateError, 'Pdf has been closed' if @closed
    raise ::PdfOxide::ArgumentError, 'path cannot be empty' if path.nil? || path.empty?

    err = ::FFI::MemoryPointer.new(:int32)
    rc = Bindings.pdf_save(@handle, path, err)
    code = err.read_int32
    raise IoError, "pdf_save failed (#{code})" if code != 0 || rc != 0

    File.absolute_path(path)
  end

  # Idempotent free. State is cleared before the native free so the handle
  # cannot be released a second time by a later call or by the finalizer.
  def close
    return if @closed

    h = @handle
    @handle = nil
    @closed = true
    @tracker[0] = nil if @tracker
    Bindings.pdf_free(h) if h && !h.null?
  end

  # @return [Boolean] true once {#close} runs.
  def closed?
    @closed
  end

  # ─────────── static convenience: split-by-bookmarks ───────────

  # Count the bookmark-split segments that would result from splitting
  # `source_pdf` at `level` (1 = top-level only; 0 = all). Useful
  # for previewing without producing output. Returns 0 when the plan
  # returned by the native call is not valid JSON.
  #
  # @param source_pdf [String] forwarded unchanged to `PdfDocument.open`.
  #   NOTE(review): previously documented as "raw PDF bytes" — confirm
  #   whether `open` expects bytes or a path.
  # @param level [Integer] bookmark depth.
  # @return [Integer] number of segments.
  # @raise [PdfOxide::ArgumentError] when source_pdf is nil.
  # @raise [InternalError] when the native call reports a non-zero code.
  def self.plan_split_by_bookmarks_count(source_pdf, level)
    raise ::PdfOxide::ArgumentError, 'source_pdf cannot be nil' if source_pdf.nil?

    # Loaded lazily: only this helper needs JSON.
    require 'json'

    PdfOxide::PdfDocument.open(source_pdf) do |doc|
      err = ::FFI::MemoryPointer.new(:int32)
      opts = JSON.generate(level: level)
      ptr = Bindings.pdf_document_plan_split_by_bookmarks(doc.handle, opts, err)
      code = err.read_int32
      raise InternalError, "plan_split_by_bookmarks failed (#{code})" if code != 0

      json = StringMarshaller.from_c_string(ptr) || '[]'
      arr = begin
        JSON.parse(json)
      rescue JSON::ParserError
        []
      end
      Array(arr).length
    end
  end

  # Build the GC finalizer for an instance. It frees the handle still held
  # in `tracker` (if any) and clears the slot.
  #
  # @api private
  # @param tracker [Array] one-element array holding the native handle.
  # @return [Proc]
  def self.finalizer(tracker)
    proc do
      h = tracker[0]
      if h && !h.null?
        Bindings.pdf_free(h)
        tracker[0] = nil
      end
    end
  end
end
|
|
218
|
+
end
|