pdf_oxide 0.3.55-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module PdfOxide
  # v0.3.51 #519 — auto-extraction with typed reasons.
  #
  # Mirrors `fyi.oxide.pdf.AutoExtractor`. Given a {PdfDocument},
  # returns recoverable text (native or OCR), per-page or
  # whole-document, with a typed reason naming any degraded outcome.
  # When OCR is needed but unavailable, returns the native text layer
  # with `:ocr_requested_but_unavailable` instead of raising —
  # extraction is **not** a security operation (per
  # `feedback_extraction_graceful_fallback`).
  #
  # @example
  #   doc = PdfOxide::PdfDocument.open('sample.pdf')
  #   ax = PdfOxide::AutoExtractor.new(doc)
  #   result = ax.extract_page(0)
  #   puts result[:text]
  #   warn "degraded: #{result[:reason]}" unless ax.ok?(result[:reason])
  class AutoExtractor
    # Typed reasons mirror the Rust serde-emitted snake_case tokens
    # at the FFI JSON boundary. Renaming would break cross-binding
    # parity with PHP / Python / Java.
    REASONS = %i[
      ok
      native_text_high_confidence
      no_text_layer_present
      text_layer_below_threshold
      glyph_mapping_missing
      encrypted_no_extract_permission
      image_table_reconstructed
      image_table_no_structure
      chart_not_transcribed
      ocr_requested_but_unavailable
      ocr_low_confidence_fallback
      empty
    ].freeze

    # Per-page kinds from the auto-classifier (Rust's `PageKind` enum).
    PAGE_KINDS = %i[text_layer scanned image_text mixed empty].freeze

    # Reasons that {#ok?} treats as a clean extract.
    CLEAN_REASONS = %i[ok native_text_high_confidence].freeze
    # Reasons that {#ocr_fallback?} treats as the OCR fallback path.
    OCR_FALLBACK_REASONS = %i[ocr_requested_but_unavailable ocr_low_confidence_fallback].freeze
    private_constant :CLEAN_REASONS, :OCR_FALLBACK_REASONS

    # @return [PdfDocument]
    attr_reader :document

    # @param document [PdfDocument] an open document; must expose `#handle`.
    # @raise [PdfOxide::ArgumentError] when `document` is nil.
    # @raise [PdfOxide::StateError] when `document` reports `closed?`.
    def initialize(document)
      raise ::PdfOxide::ArgumentError, 'document cannot be nil' if document.nil?
      raise ::PdfOxide::StateError, 'document has been closed' if document.respond_to?(:closed?) && document.closed?

      @document = document
    end

    # Cheap per-page classifier — no OCR, no rasterisation.
    # @param page_index [Integer]
    # @return [Hash] { reason:, kind:, confidence:, classification: }
    # @raise [InternalError] when the native call reports a non-zero code.
    def classify_page(page_index)
      json = call_json('classify_page') do |err|
        Bindings.pdf_document_classify_page(@document.handle, page_index, err)
      end
      build_classification(json)
    end

    # Whole-document classifier.
    # @return [Hash] decoded JSON envelope ({} when the native side
    #   returns nothing or unparseable JSON).
    # @raise [InternalError] when the native call reports a non-zero code.
    def classify_document
      call_json('classify_document') do |err|
        Bindings.pdf_document_classify_document(@document.handle, err)
      end
    end

    # Extract a page's text via the v0.3.51 auto-router (text-vs-OCR
    # decision with graceful native fallback). Surfaces a typed
    # reason describing the quality.
    # @param page_index [Integer]
    # @return [Hash] { text:, reason:, kind:, confidence:, classification: }
    # @raise [InternalError] when the text extraction call itself fails.
    def extract_text(page_index)
      text = call_text('extract_text_auto') do |err|
        Bindings.pdf_document_extract_text_auto(@document.handle, page_index, err)
      end
      cls = classify_or_default(page_index)
      # Graceful fallback: if classifier wants OCR and the build can't
      # supply it, surface OCR_REQUESTED_BUT_UNAVAILABLE regardless of
      # native-side state.
      cls[:reason] = :ocr_requested_but_unavailable if cls[:kind] == :scanned && !self.class.prefetch_available?
      cls.merge(text: text)
    end

    # Rich per-page extraction — returns the full PageExtraction
    # JSON envelope (text + per-region bbox + reason + confidence)
    # merged into a Hash.
    # @param page_index [Integer]
    # @param options [Hash, nil] auto-extract options serialised to JSON.
    # @return [Hash] { text:, reason:, kind:, confidence:, classification: }
    def extract_page(page_index, options: nil)
      options_json = options.nil? ? nil : JSON.generate(options)
      json = call_json('extract_page_auto') do |err|
        Bindings.pdf_document_extract_page_auto(@document.handle, page_index, options_json, err)
      end
      cls = build_classification(json)
      # Read `text` from the normalised envelope: the decoded JSON may be
      # a non-Hash (e.g. an Array), on which `json['text']` would raise.
      envelope = cls[:classification]
      cls.merge(text: envelope['text'] || '')
    end

    # @param reason [Symbol, String] a reason token.
    # @return [Boolean] true when the reason represents a clean extract.
    def ok?(reason)
      reason_in?(CLEAN_REASONS, reason)
    end

    # @param reason [Symbol, String] a reason token.
    # @return [Boolean] true when the OCR-unavailable graceful-fallback
    #   path engaged.
    def ocr_fallback?(reason)
      reason_in?(OCR_FALLBACK_REASONS, reason)
    end

    # @return [Boolean] whether the build supports OCR provisioning
    #   (i.e. the `ocr` feature is compiled in).
    def self.prefetch_available?
      Bindings.pdf_oxide_prefetch_available != 0
    end

    private

    # Membership test that accepts Symbol or String tokens without
    # interning arbitrary input as a Symbol.
    def reason_in?(allowed, reason)
      return false unless reason.is_a?(Symbol) || reason.is_a?(String)

      allowed.any? { |token| token.to_s == reason.to_s }
    end

    # Classify the page, falling back to a neutral classification when
    # the classifier fails (extraction is best-effort, so the failure is
    # deliberately swallowed). The fallback carries every key that
    # {#build_classification} produces so callers see a uniform shape.
    def classify_or_default(page_index)
      classify_page(page_index)
    rescue StandardError
      { reason: :ok, kind: :mixed, confidence: 0.0, classification: {} }
    end

    # Run a native call that yields a JSON string and decode it.
    # Empty output and malformed JSON both degrade to {}.
    def call_json(operation, &block)
      raw = call_text(operation, &block)
      return {} if raw.nil? || raw.empty?

      JSON.parse(raw)
    rescue JSON::ParserError
      {}
    end

    # Run a native call that returns a C string pointer. The block
    # receives an int32 out-parameter for the native error code.
    # @raise [InternalError] when the error code is non-zero.
    def call_text(operation)
      err = ::FFI::MemoryPointer.new(:int32)
      ptr = yield(err)
      code = err.read_int32
      raise InternalError, "#{operation} failed (#{code})" if code != 0
      return '' if ptr.nil? || ptr.null?

      StringMarshaller.from_c_string(ptr) || ''
    end

    # Normalise a decoded JSON envelope into the public classification
    # Hash. Unknown, missing, or wrongly-typed fields fall back to
    # `:ok` / `:mixed` / `0.0` instead of raising.
    def build_classification(json)
      json = {} unless json.is_a?(Hash)
      {
        reason: known_token(json['reason'], REASONS, :ok),
        kind: known_token(json['kind'], PAGE_KINDS, :mixed),
        confidence: coerce_confidence(json['confidence']),
        classification: json
      }
    end

    # Map a raw JSON value onto one of the allowed symbols, comparing by
    # string so no new symbol is created from untrusted input.
    def known_token(value, allowed, fallback)
      wanted = value.to_s
      allowed.find { |token| token.to_s == wanted } || fallback
    end

    # nil, Hash, Array, and booleans all become 0.0; numerics and numeric
    # strings go through `to_f`.
    def coerce_confidence(value)
      value.respond_to?(:to_f) ? value.to_f : 0.0
    end
  end
end
@@ -0,0 +1,235 @@
1
+ # frozen_string_literal: true
2
+
3
module PdfOxide
  # Write-side counterpart to {PdfDocument}: form-fill, destructive
  # redaction (v0.3.50 #231), metadata scrubbing, and incremental save.
  #
  # Mirrors `fyi.oxide.pdf.DocumentEditor`. Lifecycle: holds a native
  # `DocumentEditor*` handle; **must** be closed via {#close} or a
  # block-form factory. Close is idempotent.
  #
  # Per `feedback_extraction_graceful_fallback`: destructive redaction
  # is a **security operation** — a non-zero native error code or a
  # negative return code raises rather than silently degrading, and
  # saving while queued redactions are still unapplied raises too.
  #
  # @example destructive redaction (block-form auto-closes).
  #   PdfOxide::DocumentEditor.open('source.pdf') do |ed|
  #     ed.add_redaction(page: 0, rect: [100, 200, 300, 250])
  #     ed.apply_redactions!
  #     ed.save_to('redacted.pdf')
  #   end
  class DocumentEditor
    # Open an editor session over a PDF on disk (or in-memory bytes).
    # @param source [String] file path or raw PDF bytes.
    # @yield [DocumentEditor] the editor; closed when the block returns.
    # @return [DocumentEditor, Object] the editor, or the block's value.
    def self.open(source)
      editor = new(source)
      return editor unless block_given?

      begin
        yield editor
      ensure
        editor.close
      end
    end

    # @param source [String] file path or raw PDF bytes.
    # @raise [PdfOxide::ArgumentError] when `source` is nil.
    # @raise [FileNotFoundError] when `source` is empty or not a String.
    # @raise [IoError] when the native open reports an error or returns null.
    def initialize(source)
      raise ::PdfOxide::ArgumentError, 'source cannot be nil' if source.nil?

      err = ::FFI::MemoryPointer.new(:int32)
      @handle = open_native(source, err)

      code = err.read_int32
      raise IoError, "document_editor_open failed (#{code})" if code != 0
      raise IoError, 'document_editor_open returned null' if @handle.nil? || @handle.null?

      @closed = false
      @applied = false
      # True while redactions are queued but not yet applied; gates saving.
      @needs_apply = false
      # The finalizer must not capture `self`, so the handle is shared
      # through a one-slot array that {#close} clears.
      @tracker = [@handle]
      ObjectSpace.define_finalizer(self, self.class.finalizer(@tracker))
    end

    # @api private
    attr_reader :handle

    # ─────────────── destructive redaction (#231) ───────────────

    # Queue a redaction rectangle for the given page. The redaction
    # is not applied until {#apply_redactions!} runs; until then
    # {#save_to} and {#to_bytes} refuse to write.
    # @param page [Integer] 0-based page index.
    # @param rect [Array<Numeric>] `[x1, y1, x2, y2]` in PDF user-space.
    # @param color [Array<Numeric>] `[r, g, b]` overlay color (0.0–1.0).
    # @return [self] (fluent chaining).
    # @raise [PdfOxide::ArgumentError] on a malformed `rect` or `color`.
    # @raise [RedactionError] when the native call fails.
    def add_redaction(page:, rect:, color: [0.0, 0.0, 0.0])
      check_open!
      x1, y1, x2, y2 = rect_components(rect)
      r, g, b = rgb_components(color, 'color')
      err = ::FFI::MemoryPointer.new(:int32)
      rc = Bindings.pdf_redaction_add(@handle, Integer(page), x1, y1, x2, y2, r, g, b, err)
      fail_closed!(rc, err.read_int32, 'pdf_redaction_add')
      @needs_apply = true
      @applied = false
      self
    end

    # Total redactions queued for the page.
    # @param page [Integer]
    # @return [Integer]
    def redaction_count(page)
      check_open!
      err = ::FFI::MemoryPointer.new(:int32)
      n = Bindings.pdf_redaction_count(@handle, Integer(page), err)
      fail_closed!(0, err.read_int32, 'pdf_redaction_count')
      n
    end

    # Apply all queued redactions destructively.
    # @param scrub_metadata [Boolean] also strip /Info, XMP, JS.
    # @param fill_color [Array<Numeric>] overlay `[r, g, b]`.
    # @return [self]
    # @raise [RedactionError] when any native step fails.
    def apply_redactions!(scrub_metadata: false, fill_color: [0.0, 0.0, 0.0])
      check_open!
      r, g, b = rgb_components(fill_color, 'fill_color')
      err = ::FFI::MemoryPointer.new(:int32)
      rc = Bindings.pdf_redaction_apply(@handle, scrub_metadata, r, g, b, err)
      fail_closed!(rc, err.read_int32, 'pdf_redaction_apply')

      run_metadata_scrub if scrub_metadata
      @applied = true
      @needs_apply = false
      self
    end

    # Metadata scrubbing without redaction regions. Does not count as
    # applying queued redactions.
    # @return [self]
    # @raise [RedactionError] when the native call fails.
    def scrub_metadata
      check_open!
      run_metadata_scrub
      self
    end

    # ─────────────── form-fill ───────────────

    # Set an AcroForm field.
    # @param name [String] dot-separated full field name.
    # @param value [String, Boolean] new value (Boolean = checkbox/radio).
    # @return [self]
    # @raise [InternalError] when the native call fails or rejects the write.
    # @raise [UnsupportedFeatureError] when the binding is unavailable.
    def set_form_field(name, value)
      check_open!
      raise ::PdfOxide::ArgumentError, 'name cannot be nil' if name.nil?

      err = ::FFI::MemoryPointer.new(:int32)
      ok = if [true, false].include?(value)
             Bindings.pdf_form_field_set_value_by_name_boolean(@handle, name, value, err)
           else
             Bindings.pdf_form_field_set_value_by_name_string(@handle, name, value.to_s, err)
           end
      code = err.read_int32
      raise InternalError, "set_form_field failed (#{code})" if code != 0
      # NOTE(review): assumes the binding returns a Ruby boolean; an
      # Integer 0 would be truthy here — confirm the FFI signature.
      raise InternalError, 'set_form_field rejected by cdylib (field missing?)' unless ok

      self
    rescue ::FFI::NotFoundError
      # Symbol absent from this cdylib build — surface a clear, typed
      # error instead of letting the FFI lookup failure escape.
      raise UnsupportedFeatureError, 'form-fill not supported by this cdylib build'
    end

    # ─────────────── save ───────────────

    # Save the edited PDF to the given path.
    # @param path [String]
    # @return [String] absolute path written.
    # @raise [StateError] when redactions are queued but not applied.
    # @raise [RedactionError] when the native save fails.
    def save_to(path)
      check_open!
      raise ::PdfOxide::ArgumentError, 'path cannot be empty' if path.nil? || path.empty?

      check_applied! if @needs_apply
      target = File.absolute_path(path)
      err = ::FFI::MemoryPointer.new(:int32)
      rc = Bindings.document_editor_save(@handle, target, err)
      fail_closed!(rc, err.read_int32, 'document_editor_save')
      target
    end

    # @return [String] BINARY-encoded PDF bytes.
    # @raise [StateError] when redactions are queued but not applied.
    # @raise [RedactionError] when the native save reports an error code.
    # @raise [InternalError] when the native save returns a null buffer.
    def to_bytes
      check_open!
      check_applied! if @needs_apply
      len_ptr = ::FFI::MemoryPointer.new(:size_t)
      err = ::FFI::MemoryPointer.new(:int32)
      buf = Bindings.document_editor_save_to_bytes(@handle, len_ptr, err)
      fail_closed!(0, err.read_int32, 'document_editor_save_to_bytes')
      raise InternalError, 'document_editor_save_to_bytes returned null' if buf.nil? || buf.null?

      begin
        # read_string copies into a Ruby String, so freeing afterwards is safe.
        buf.read_string(len_ptr.read(:size_t)).force_encoding(Encoding::BINARY)
      ensure
        # Release the native buffer even if the copy above raised.
        Bindings.free_bytes(buf) if Bindings.respond_to?(:free_bytes)
      end
    end

    # ─────────────── lifecycle ───────────────

    # Idempotent close.
    def close
      return if @closed

      h = @handle
      @handle = nil
      @closed = true
      # Clear the finalizer's slot so the handle is not freed twice.
      @tracker[0] = nil if @tracker
      Bindings.document_editor_free(h) if h && !h.null?
    end

    # @return [Boolean]
    def closed?
      @closed
    end

    # Build the GC finalizer. It closes over `tracker` only (never the
    # editor itself) and frees whatever handle is still in the slot.
    # @api private
    def self.finalizer(tracker)
      proc do
        h = tracker[0]
        if h && !h.null?
          Bindings.document_editor_free(h)
          tracker[0] = nil
        end
      end
    end

    private

    # Dispatch to the path-based or bytes-based native open.
    def open_native(source, err)
      raise FileNotFoundError, "file not found: #{source}" unless source.is_a?(String) && !source.empty?

      if path_on_disk?(source)
        Bindings.document_editor_open(File.absolute_path(source), err)
      else
        binary = source.b
        buf = ::FFI::MemoryPointer.new(:uint8, binary.bytesize)
        buf.write_bytes(binary, 0, binary.bytesize)
        # NOTE(review): assumes the native side copies `buf` before
        # returning; nothing here keeps it alive afterwards — confirm.
        Bindings.document_editor_open_from_bytes(buf, binary.bytesize, err)
      end
    end

    # Raw PDF bytes routinely contain NUL, and File.exist? raises
    # ::ArgumentError on such strings, so rule them out first.
    def path_on_disk?(source)
      !source.b.include?("\0") && File.exist?(source)
    end

    # Validate and convert `[x1, y1, x2, y2]` to Floats.
    def rect_components(rect)
      well_formed = rect.respond_to?(:length) && rect.respond_to?(:map) && rect.length == 4
      raise ::PdfOxide::ArgumentError, 'rect must have 4 numeric values' unless well_formed

      rect.map(&:to_f)
    end

    # Validate and convert `[r, g, b]` to Floats. Extra trailing
    # components are ignored, as before.
    def rgb_components(color, label)
      well_formed = color.respond_to?(:length) && color.respond_to?(:map) && color.length >= 3
      raise ::PdfOxide::ArgumentError, "#{label} must have 3 numeric values" unless well_formed

      color.map(&:to_f).first(3)
    end

    # Shared native metadata scrub used by {#scrub_metadata} and
    # {#apply_redactions!}.
    def run_metadata_scrub
      err = ::FFI::MemoryPointer.new(:int32)
      rc = Bindings.pdf_redaction_scrub_metadata(@handle, err)
      fail_closed!(rc, err.read_int32, 'pdf_redaction_scrub_metadata')
    end

    def check_open!
      raise InvalidStateError, 'DocumentEditor has been closed' if @closed || @handle.nil?
    end

    def check_applied!
      return if @applied

      raise StateError, 'no redactions applied; call apply_redactions! before save'
    end

    # Security-op fail-closed contract: a non-zero error_code, or a
    # negative Integer rc, raises. Positive rc values are accepted.
    def fail_closed!(rc, error_code, operation)
      if error_code != 0
        raise RedactionError, "#{operation} failed (error code #{error_code}); security op fails closed"
      end
      return unless rc.is_a?(Integer) && rc.negative?

      raise RedactionError, "#{operation} returned #{rc}; security op fails closed"
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module PdfOxide
4
+ # Base error class for all PdfOxide exceptions. Mirrors the Java
5
+ # exception hierarchy at fyi.oxide.pdf.exception.* — every native
6
+ # error maps to one of the subclasses below.
7
+ class Error < StandardError; end
8
+
9
+ # Raised when the host platform isn't supported by the bundled cdylib.
10
+ class UnsupportedPlatformError < Error; end
11
+
12
+ # Raised when a user-supplied argument fails validation BEFORE the
13
+ # native call (nil check, range check, etc.). Subclasses PdfOxide::Error,
14
+ # NOT ::ArgumentError — `rescue ::ArgumentError` will not catch it.
15
+ class ArgumentError < Error; end
16
+
17
+ # Filesystem / I/O failures (file-not-found, EACCES, EIO, …).
18
+ class IoError < Error; end
19
+
20
+ # `IoError` specialisation for missing files.
21
+ class FileNotFoundError < IoError; end
22
+
23
+ # PDF parse / structure errors (malformed header, corrupt xref, …).
24
+ class ParseError < Error; end
25
+
26
+ # Resource / state errors — closed handle, wrong operation order.
27
+ class StateError < Error; end
28
+
29
+ # Operation called on an already-closed document/editor/Pdf.
30
+ class InvalidStateError < StateError; end
31
+
32
+ # Encryption / wrong-password failures.
33
+ class EncryptedError < Error; end
34
+
35
+ # Permission denied (encrypted PDF lacking extract / sign perm).
36
+ class PermissionError < Error; end
37
+
38
+ # Feature requested but not compiled into this cdylib build
39
+ # (e.g. signatures without the `signatures` Cargo feature).
40
+ class UnsupportedFeatureError < Error; end
41
+
42
+ # Digital-signature failure (PAdES B/T/LT signing / verifying).
43
+ class SignatureError < Error; end
44
+
45
+ # Destructive-redaction failure (#231). Security op: fails closed.
46
+ class RedactionError < Error; end
47
+
48
+ # PDF/A · PDF/X · PDF/UA compliance failure.
49
+ class ComplianceError < Error; end
50
+
51
+ # Native text-search operation failed (cdylib error code 7 /
52
+ # `ERR_SEARCH`). Mirrors C#'s `PdfOxide.Exceptions.SearchException`
53
+ # and Java's `PdfException(SEARCH)`.
54
+ class SearchError < Error; end
55
+
56
+ # Generic native-side failure that didn't map to a specific subclass.
57
+ class InternalError < Error; end
58
+ end