corp_pdf 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/README.md +80 -161
- data/lib/corp_pdf/object_resolver.rb +92 -2
- data/lib/corp_pdf/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 41334cc30a2fbd56ab358ed95ec0a89b7867b059818220abd0663c05315d4caf
|
|
4
|
+
data.tar.gz: 73514589fb5c6ad4c6d66f079de8d854b54ea67254d425091a1cc9be402193b6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2b35da0a14dd018b85451e4018622ef235a6b078571ada9fc4b6f949288d14672b26b519e0991ef1605aaec914d324881fe312b33b80ef32dd9e58f6bb949fd4
|
|
7
|
+
data.tar.gz: ae2362ecfa8cc1fa91501293ba2caa9cc5540c237c1db9671e126901ef91c7b4f405cfc86792fe02f6b5b5d4c63b303ba8969df8dd0d765cb6adcb58497a60fe
|
data/.gitignore
CHANGED
data/README.md
CHANGED
|
@@ -11,6 +11,16 @@ A minimal pure Ruby library for parsing and editing PDF AcroForm fields.
|
|
|
11
11
|
- ✅ **Minimal PDF Engine** - Basic PDF parser/writer for AcroForm manipulation
|
|
12
12
|
- ✅ **Ruby 3.1+** - Modern Ruby support
|
|
13
13
|
|
|
14
|
+
## Documentation
|
|
15
|
+
|
|
16
|
+
📚 **Detailed documentation** is available in the [`docs/`](./docs/) directory:
|
|
17
|
+
|
|
18
|
+
- **[PDF Structure](./docs/pdf_structure.md)** - Understanding PDF file structure and text-based syntax
|
|
19
|
+
- **[DictScan Explained](./docs/dict_scan_explained.md)** - How text traversal parses PDF dictionaries
|
|
20
|
+
- **[Object Streams](./docs/object_streams.md)** - Parsing compressed PDF object streams
|
|
21
|
+
- **[Clearing Fields](./docs/clear_fields.md)** - Complete guide to removing unwanted fields
|
|
22
|
+
- **[Documentation Index](./docs/README.md)** - Overview of all available documentation
|
|
23
|
+
|
|
14
24
|
## Installation
|
|
15
25
|
|
|
16
26
|
Add this line to your application's Gemfile:
|
|
@@ -54,7 +64,7 @@ fields.each do |field|
|
|
|
54
64
|
puts "#{field.name} (#{type_info}) = #{field.value}"
|
|
55
65
|
end
|
|
56
66
|
|
|
57
|
-
# Add a new field
|
|
67
|
+
# Add a new field
|
|
58
68
|
new_field = doc.add_field("NameField",
|
|
59
69
|
value: "John Doe",
|
|
60
70
|
x: 100,
|
|
@@ -62,17 +72,7 @@ new_field = doc.add_field("NameField",
|
|
|
62
72
|
width: 200,
|
|
63
73
|
height: 20,
|
|
64
74
|
page: 1,
|
|
65
|
-
type: :text
|
|
66
|
-
)
|
|
67
|
-
|
|
68
|
-
# Or using the PDF type string directly
|
|
69
|
-
button_field = doc.add_field("CheckBox",
|
|
70
|
-
type: "/Btn", # Or use :button symbol
|
|
71
|
-
x: 100,
|
|
72
|
-
y: 600,
|
|
73
|
-
width: 20,
|
|
74
|
-
height: 20,
|
|
75
|
-
page: 1
|
|
75
|
+
type: :text
|
|
76
76
|
)
|
|
77
77
|
|
|
78
78
|
# Update a field value
|
|
@@ -87,9 +87,6 @@ doc.remove_field("FieldToRemove")
|
|
|
87
87
|
# Write the modified PDF to a file
|
|
88
88
|
doc.write("output.pdf")
|
|
89
89
|
|
|
90
|
-
# Or write with flattening (removes incremental updates)
|
|
91
|
-
doc.write("output.pdf", flatten: true)
|
|
92
|
-
|
|
93
90
|
# Or get PDF bytes as a String (returns String, not StringIO)
|
|
94
91
|
pdf_bytes = doc.write
|
|
95
92
|
File.binwrite("output.pdf", pdf_bytes)
|
|
@@ -99,46 +96,42 @@ File.binwrite("output.pdf", pdf_bytes)
|
|
|
99
96
|
|
|
100
97
|
#### Working with Field Objects
|
|
101
98
|
|
|
99
|
+
Each field returned by `#list_fields` is a `Field` object with properties and methods:
|
|
100
|
+
|
|
102
101
|
```ruby
|
|
103
102
|
doc = CorpPdf::Document.new("form.pdf")
|
|
104
103
|
fields = doc.list_fields
|
|
104
|
+
field = fields.first
|
|
105
105
|
|
|
106
106
|
# Access field properties
|
|
107
|
-
field
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
#
|
|
119
|
-
|
|
120
|
-
# Update
|
|
121
|
-
field.
|
|
122
|
-
|
|
123
|
-
#
|
|
124
|
-
field.
|
|
125
|
-
|
|
126
|
-
#
|
|
127
|
-
field.
|
|
128
|
-
|
|
129
|
-
#
|
|
130
|
-
field.
|
|
131
|
-
field.button_field? # true for button/checkbox fields
|
|
132
|
-
field.choice_field? # true for choice/dropdown fields
|
|
133
|
-
field.signature_field? # true for signature fields
|
|
134
|
-
|
|
135
|
-
# Check if field has a value
|
|
136
|
-
field.has_value?
|
|
137
|
-
|
|
138
|
-
# Check if field has position information
|
|
139
|
-
field.has_position?
|
|
107
|
+
field.name # Field name (String)
|
|
108
|
+
field.value # Field value (String or nil)
|
|
109
|
+
field.type # Field type (String, e.g., "/Tx", "/Btn", "/Ch", "/Sig")
|
|
110
|
+
field.type_key # Symbol key (e.g., :text) or nil if not mapped
|
|
111
|
+
field.x # X coordinate (Float or nil)
|
|
112
|
+
field.y # Y coordinate (Float or nil)
|
|
113
|
+
field.width # Field width (Float or nil)
|
|
114
|
+
field.height # Field height (Float or nil)
|
|
115
|
+
field.page # Page number (Integer or nil)
|
|
116
|
+
field.ref # Object reference array [object_number, generation]
|
|
117
|
+
|
|
118
|
+
# Field methods
|
|
119
|
+
field.update("New Value") # Update field value
|
|
120
|
+
field.update("New Value", new_name: "NewName") # Update and rename
|
|
121
|
+
field.remove # Remove the field
|
|
122
|
+
field.text_field? # Check if text field
|
|
123
|
+
field.button_field? # Check if button/checkbox field
|
|
124
|
+
field.choice_field? # Check if choice/dropdown field
|
|
125
|
+
field.signature_field? # Check if signature field
|
|
126
|
+
field.has_value? # Check if field has a value
|
|
127
|
+
field.has_position? # Check if field has position info
|
|
128
|
+
field.object_number # Get object number
|
|
129
|
+
field.generation # Get generation number
|
|
130
|
+
field.valid_ref? # Check if field has valid reference
|
|
140
131
|
```
|
|
141
132
|
|
|
133
|
+
**Note**: When reading fields from a PDF, if the type is missing or empty, it defaults to `"/Tx"` (text field).
|
|
134
|
+
|
|
142
135
|
#### Signature Fields with Image Appearances
|
|
143
136
|
|
|
144
137
|
Signature fields can be enhanced with image appearances (signature images). When you update a signature field with image data (base64-encoded JPEG or PNG), CorpPdf will automatically add the image as the field's appearance.
|
|
@@ -255,26 +248,33 @@ doc.write("form_with_multiple_groups.pdf")
|
|
|
255
248
|
|
|
256
249
|
#### Flattening PDFs
|
|
257
250
|
|
|
251
|
+
Flattening removes incremental updates from a PDF, creating a clean single-version document:
|
|
252
|
+
|
|
258
253
|
```ruby
|
|
259
|
-
# Flatten a PDF to remove incremental updates
|
|
260
254
|
doc = CorpPdf::Document.new("form.pdf")
|
|
261
|
-
doc.flatten! # Modifies the document in-place
|
|
262
255
|
|
|
263
|
-
#
|
|
264
|
-
|
|
256
|
+
# Flatten in-place (modifies the document)
|
|
257
|
+
doc.flatten!
|
|
258
|
+
|
|
259
|
+
# Get flattened bytes without modifying the document
|
|
260
|
+
flattened_bytes = doc.flatten
|
|
261
|
+
|
|
262
|
+
# Write with flattening option
|
|
263
|
+
doc.write("output.pdf", flatten: true)
|
|
265
264
|
|
|
266
|
-
#
|
|
267
|
-
|
|
265
|
+
# Class method: flatten from file
|
|
266
|
+
CorpPdf::Document.flatten_pdf("input.pdf", "output.pdf")
|
|
267
|
+
flattened_doc = CorpPdf::Document.flatten_pdf("input.pdf")
|
|
268
268
|
```
|
|
269
269
|
|
|
270
270
|
#### Clearing Fields
|
|
271
271
|
|
|
272
|
-
The `clear` and `clear!` methods
|
|
272
|
+
The `clear` and `clear!` methods completely remove unwanted fields by rewriting the entire PDF (more efficient than multiple `remove_field` calls):
|
|
273
273
|
|
|
274
274
|
```ruby
|
|
275
275
|
doc = CorpPdf::Document.new("form.pdf")
|
|
276
276
|
|
|
277
|
-
# Remove
|
|
277
|
+
# Remove fields matching a pattern (in-place)
|
|
278
278
|
doc.clear!(remove_pattern: /^text-/)
|
|
279
279
|
|
|
280
280
|
# Keep only specific fields
|
|
@@ -283,14 +283,17 @@ doc.clear!(keep_fields: ["Name", "Email"])
|
|
|
283
283
|
# Remove specific fields
|
|
284
284
|
doc.clear!(remove_fields: ["OldField1", "OldField2"])
|
|
285
285
|
|
|
286
|
-
# Use a block to
|
|
287
|
-
doc.clear! { |
|
|
286
|
+
# Use a block to filter fields (return true to keep)
|
|
287
|
+
doc.clear! { |field| !field.name.start_with?("temp_") }
|
|
288
|
+
|
|
289
|
+
# Get cleared bytes without modifying document
|
|
290
|
+
cleared_bytes = doc.clear(remove_pattern: /.*/)
|
|
288
291
|
|
|
289
292
|
# Write the cleared PDF
|
|
290
293
|
doc.write("cleared.pdf", flatten: true)
|
|
291
294
|
```
|
|
292
295
|
|
|
293
|
-
**Note:** Unlike `remove_field`, which uses incremental updates, `clear` completely rewrites the PDF
|
|
296
|
+
**Note:** Unlike `remove_field`, which uses incremental updates, `clear` completely rewrites the PDF. See [Clearing Fields Documentation](docs/clear_fields.md) for detailed information.
|
|
294
297
|
|
|
295
298
|
### API Reference
|
|
296
299
|
|
|
@@ -339,43 +342,25 @@ second_page.add_field("Email", x: 100, y: 650, width: 200, height: 20)
|
|
|
339
342
|
- `page.to_h` - Convert to hash for backward compatibility
|
|
340
343
|
|
|
341
344
|
#### `#add_field(name, options)`
|
|
342
|
-
Adds a new form field to the document.
|
|
345
|
+
Adds a new form field to the document. Returns a `Field` object if successful.
|
|
346
|
+
|
|
347
|
+
**Options:**
|
|
343
348
|
- `value`: Default value for the field (String)
|
|
344
|
-
- `x`:
|
|
345
|
-
- `
|
|
346
|
-
- `
|
|
347
|
-
- `
|
|
348
|
-
- `page`: Page number to add the field to (Integer, default: 1)
|
|
349
|
-
- `type`: Field type (Symbol or String, default: `"/Tx"`). Options:
|
|
349
|
+
- `x`, `y`: Field position coordinates (Integer, defaults: 100, 500)
|
|
350
|
+
- `width`, `height`: Field dimensions (Integer, defaults: 100, 20)
|
|
351
|
+
- `page`: Page number (Integer, default: 1)
|
|
352
|
+
- `type`: Field type (Symbol or String, default: `"/Tx"`)
|
|
350
353
|
- Symbol keys: `:text`, `:button`, `:choice`, `:signature`, `:radio`
|
|
351
354
|
- PDF type strings: `"/Tx"`, `"/Btn"`, `"/Ch"`, `"/Sig"`
|
|
352
|
-
- `group_id`: Required for radio buttons.
|
|
353
|
-
- `selected`: Optional for radio buttons.
|
|
355
|
+
- `group_id`: Required for radio buttons. Groups related radio buttons together.
|
|
356
|
+
- `selected`: Optional for radio buttons. Set to `true` to select by default.
|
|
354
357
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
```ruby
|
|
358
|
-
# Using symbol keys (recommended)
|
|
359
|
-
field = doc.add_field("NewField", value: "Value", x: 100, y: 500, width: 200, height: 20, page: 1, type: :text)
|
|
360
|
-
|
|
361
|
-
# Using PDF type strings
|
|
362
|
-
field = doc.add_field("ButtonField", type: "/Btn", x: 100, y: 500, width: 20, height: 20, page: 1)
|
|
363
|
-
|
|
364
|
-
# Radio button example
|
|
365
|
-
field = doc.add_field("Option1", type: :radio, group_id: "my_group", value: "option1", x: 100, y: 500, width: 20, height: 20, page: 1, selected: true)
|
|
366
|
-
```
|
|
358
|
+
See [Radio Buttons](#radio-buttons) section for radio button examples.
|
|
367
359
|
|
|
368
360
|
#### `#update_field(name, new_value, new_name: nil)`
|
|
369
|
-
Updates a field's value and optionally renames it.
|
|
361
|
+
Updates a field's value and optionally renames it. Returns `true` if successful, `false` if field not found.
|
|
370
362
|
|
|
371
|
-
|
|
372
|
-
doc.update_field("FieldName", "New Value")
|
|
373
|
-
doc.update_field("OldName", "New Value", new_name: "NewName")
|
|
374
|
-
|
|
375
|
-
# For signature fields with images:
|
|
376
|
-
doc.update_field("SignatureField", base64_image_data) # Base64-encoded JPEG or PNG
|
|
377
|
-
doc.update_field("SignatureField", "data:image/png;base64,...") # Data URI format
|
|
378
|
-
```
|
|
363
|
+
For signature fields, if `new_value` is base64-encoded JPEG/PNG or a data URI, it automatically adds the image as the field's appearance. See [Signature Fields](#signature-fields-with-image-appearances) section for examples.
|
|
379
364
|
|
|
380
365
|
#### `#remove_field(name_or_field)`
|
|
381
366
|
Removes a form field by name (String) or Field object. Returns `true` if successful, `false` if field not found.
|
|
@@ -388,88 +373,22 @@ doc.remove_field(field_object)
|
|
|
388
373
|
#### `#write(path_out = nil, flatten: false)`
|
|
389
374
|
Writes the modified PDF. If `path_out` is provided, writes to that file path and returns `true`. If no path is provided, returns the PDF bytes as a String. The `flatten` option removes incremental updates from the PDF.
|
|
390
375
|
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
doc.write("output.pdf", flatten: true) # Write flattened PDF to file
|
|
394
|
-
pdf_bytes = doc.write # Get PDF bytes as String
|
|
395
|
-
```
|
|
396
|
-
|
|
397
|
-
#### `#flatten`
|
|
398
|
-
Returns flattened PDF bytes (removes incremental updates) without modifying the document.
|
|
399
|
-
|
|
400
|
-
```ruby
|
|
401
|
-
flattened_bytes = doc.flatten
|
|
402
|
-
```
|
|
403
|
-
|
|
404
|
-
#### `#flatten!`
|
|
405
|
-
Flattens the PDF in-place (modifies the current document instance).
|
|
406
|
-
|
|
407
|
-
```ruby
|
|
408
|
-
doc.flatten!
|
|
409
|
-
```
|
|
376
|
+
#### `#flatten` and `#flatten!`
|
|
377
|
+
Flattening methods. `#flatten` returns flattened PDF bytes without modifying the document. `#flatten!` flattens the PDF in-place.
|
|
410
378
|
|
|
411
379
|
#### `CorpPdf::Document.flatten_pdf(input_path, output_path = nil)`
|
|
412
380
|
Class method to flatten a PDF. If `output_path` is provided, writes to that path and returns the path. Otherwise returns a new `Document` instance with the flattened content.
|
|
413
381
|
|
|
414
|
-
```ruby
|
|
415
|
-
CorpPdf::Document.flatten_pdf("input.pdf", "output.pdf")
|
|
416
|
-
flattened_doc = CorpPdf::Document.flatten_pdf("input.pdf")
|
|
417
|
-
```
|
|
418
|
-
|
|
419
382
|
#### `#clear(options = {})` and `#clear!(options = {})`
|
|
420
|
-
Removes unwanted fields by rewriting the entire PDF. `clear` returns cleared PDF bytes without modifying the document, while `clear!` modifies the document in-place.
|
|
383
|
+
Removes unwanted fields by rewriting the entire PDF. `clear` returns cleared PDF bytes without modifying the document, while `clear!` modifies the document in-place.
|
|
421
384
|
|
|
385
|
+
**Options:**
|
|
422
386
|
- `keep_fields`: Array of field names to keep (all others removed)
|
|
423
387
|
- `remove_fields`: Array of field names to remove
|
|
424
388
|
- `remove_pattern`: Regex pattern - fields matching this are removed
|
|
425
|
-
- Block: Given field
|
|
426
|
-
|
|
427
|
-
```ruby
|
|
428
|
-
# Remove all fields
|
|
429
|
-
cleared = doc.clear(remove_pattern: /.*/)
|
|
430
|
-
|
|
431
|
-
# Remove fields matching pattern (in-place)
|
|
432
|
-
doc.clear!(remove_pattern: /^text-/)
|
|
433
|
-
|
|
434
|
-
# Keep only specific fields
|
|
435
|
-
doc.clear!(keep_fields: ["Name", "Email"])
|
|
436
|
-
|
|
437
|
-
# Use block to filter fields (return true to remove)
|
|
438
|
-
doc.clear! { |field| field.name.match?(/^[a-f0-9-]{30,}/) }
|
|
439
|
-
```
|
|
389
|
+
- Block: Given field object, return `true` to keep, `false` to remove
|
|
440
390
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
### Field Object
|
|
444
|
-
|
|
445
|
-
Each field returned by `#list_fields` is a `Field` object with the following attributes and methods:
|
|
446
|
-
|
|
447
|
-
#### Attributes
|
|
448
|
-
- `name`: Field name (String)
|
|
449
|
-
- `value`: Field value (String or nil)
|
|
450
|
-
- `type`: Field type (String, e.g., "/Tx", "/Btn", "/Ch", "/Sig"). Defaults to "/Tx" if missing from PDF.
|
|
451
|
-
- `ref`: Object reference array `[object_number, generation]`
|
|
452
|
-
- `x`: X coordinate (Float or nil)
|
|
453
|
-
- `y`: Y coordinate (Float or nil)
|
|
454
|
-
- `width`: Field width (Float or nil)
|
|
455
|
-
- `height`: Field height (Float or nil)
|
|
456
|
-
- `page`: Page number (Integer or nil)
|
|
457
|
-
|
|
458
|
-
#### Methods
|
|
459
|
-
- `#update(new_value, new_name: nil)`: Update the field's value and optionally rename it
|
|
460
|
-
- `#remove`: Remove the field from the document
|
|
461
|
-
- `#type_key`: Returns the symbol key for the type (e.g., `:text` for `"/Tx"`) or `nil` if not mapped
|
|
462
|
-
- `#text_field?`: Returns true if field is a text field
|
|
463
|
-
- `#button_field?`: Returns true if field is a button/checkbox field
|
|
464
|
-
- `#choice_field?`: Returns true if field is a choice/dropdown field
|
|
465
|
-
- `#signature_field?`: Returns true if field is a signature field
|
|
466
|
-
- `#has_value?`: Returns true if field has a non-empty value
|
|
467
|
-
- `#has_position?`: Returns true if field has position information
|
|
468
|
-
- `#object_number`: Returns the object number (first element of ref)
|
|
469
|
-
- `#generation`: Returns the generation number (second element of ref)
|
|
470
|
-
- `#valid_ref?`: Returns true if field has a valid reference (not a placeholder)
|
|
471
|
-
|
|
472
|
-
**Note**: When reading fields from a PDF, if the type is missing or empty, it defaults to `"/Tx"` (text field). The `type_key` method allows you to get the symbol representation (e.g., `:text`) from the type string.
|
|
391
|
+
See [Clearing Fields](#clearing-fields) section for examples.
|
|
473
392
|
|
|
474
393
|
## Example
|
|
475
394
|
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
3
5
|
module CorpPdf
|
|
4
6
|
# Parses xref (tables and streams) and exposes object bodies uniformly,
|
|
5
7
|
# including objects embedded in /ObjStm. Also gives you the trailer and /Root.
|
|
@@ -14,10 +16,98 @@ module CorpPdf
|
|
|
14
16
|
end
|
|
15
17
|
|
|
16
18
|
# Locates the reference to the document catalog (/Root).
#
# The dictionary returned by trailer_dict is checked first. When it has no
# /Root entry, the search falls back to the xref section at the offset
# reported by find_startxref and follows the /Prev chain from there.
#
# @return [Array<Integer>, nil] [object_number, generation], or nil when
#   no /Root reference can be located
def root_ref
  tr = trailer_dict
  if tr && (m = %r{/Root\s+(\d+)\s+(\d+)\s+R}.match(tr))
    # Base 10 is explicit: Kernel#Integer would otherwise treat a
    # zero-padded number such as "010" as octal (and raise on "08").
    return [Integer(m[1], 10), Integer(m[2], 10)]
  end

  # Not in the current trailer: walk the older trailers via /Prev.
  start = find_startxref(@bytes)
  return nil unless start

  root_ref_from_trailer_chain(start)
end
|
|
29
|
+
|
|
30
|
+
# Walks the chain of trailers linked by /Prev, starting at +offset+, and
# returns the first /Root reference found.
#
# @param offset [Integer] offset of the first xref section to inspect
# @return [Array<Integer>, nil] [object_number, generation], or nil when the
#   chain ends, loops back on itself, or a trailer cannot be read
def root_ref_from_trailer_chain(offset)
  visited = Set.new

  loop do
    # A repeated offset means the /Prev pointers form a cycle; give up.
    return nil if visited.include?(offset)

    visited.add(offset)

    tr = get_trailer_at_offset(offset)
    return nil unless tr

    if (m = %r{/Root\s+(\d+)\s+(\d+)\s+R}.match(tr))
      # Explicit base 10 so zero-padded numbers are not read as octal.
      return [Integer(m[1], 10), Integer(m[2], 10)]
    end

    # No /Root here: continue with the previous xref section, if any.
    prev_tok = DictScan.value_token_after("/Prev", tr)
    return nil unless prev_tok

    prev_ofs = prev_tok.to_i
    return nil unless prev_ofs.positive?

    offset = prev_ofs
  end
end
|
|
54
|
+
|
|
55
|
+
# Returns the trailer dictionary text for the xref section at +offset+.
#
# A section whose first four bytes are the keyword "xref" is read as a
# classic cross-reference table; anything else is treated as a
# cross-reference stream object.
#
# @param offset [Integer] offset of the xref section within @bytes
# @return [String, nil] whatever the selected extractor returns
def get_trailer_at_offset(offset)
  classic_table = @bytes[offset, 4] == "xref"
  return extract_trailer_from_classic_xref(offset) if classic_table

  extract_trailer_from_xref_stream(offset)
end
|
|
64
|
+
|
|
65
|
+
# Extracts the trailer dictionary text that follows a classic xref table.
#
# Skips each subsection ("<first> <count>" header followed by <count>
# 20-byte entries), then returns the "<< ... >>" span after the "trailer"
# keyword. Only reads from @bytes.
#
# @param start [Integer] offset at (or just after) the "xref" keyword
# @return [String, nil] the dictionary text, or nil when the keyword, the
#   trailer or its dictionary cannot be found
def extract_trailer_from_classic_xref(start)
  pos = @bytes.rindex("xref", start)
  return nil unless pos

  i = pos + 4
  limit = @bytes.bytesize

  loop do
    # \G pins the subsection header to the current position. Without it the
    # pattern can latch onto digit pairs inside the trailer dictionary (or
    # further down the file) and skip past the trailer being looked for.
    m = /\G\s*(\d+)\s+(\d+)/.match(@bytes, i)
    break unless m

    count = m[2].to_i
    i = m.end(0)

    count.times do
      # A bogus entry count must not keep iterating past the end of the data.
      break if i >= limit

      # Skip whitespace/newlines before the 20-byte record
      i += 1 while (ch = @bytes.getbyte(i)) && (ch == 0x0A || ch == 0x0D || ch == 0x20)
      i += 20
      # Consume line ending(s)
      i += 1 while (ch = @bytes.getbyte(i)) && (ch == 0x0A || ch == 0x0D)
    end

    break if @bytes[i, 7] == "trailer"
  end

  tpos = @bytes.index("trailer", i)
  return nil unless tpos

  dpos = @bytes.index("<<", tpos)
  return nil unless dpos

  dend = balanced_from(@bytes, dpos)
  # NOTE(review): balanced_from is defined elsewhere; if it can return nil
  # for an unterminated dictionary, a nil end would slice to end-of-file.
  return nil unless dend

  @bytes[dpos...dend]
end
|
|
96
|
+
|
|
97
|
+
# Extracts the dictionary text of the xref stream object at +header_ofs+.
#
# The object header ("<num> <gen> obj") is expected right at +header_ofs+;
# when it is not there, the next 256 bytes are searched for one. The first
# "<< ... >>" span after the header is returned. Only reads from @bytes.
#
# @param header_ofs [Integer] offset where the stream object should begin
# @return [String, nil] the dictionary text, or nil when no object header or
#   dictionary opener is found
def extract_trailer_from_xref_stream(header_ofs)
  header = /\A(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 50])

  if header.nil?
    # The offset may point at leading whitespace; look a little further on.
    window = @bytes[header_ofs, 256]
    header = /(\d+)\s+(\d+)\s+obj\b/m.match(window)
    return nil unless header

    header_ofs += header.begin(0)
  end

  dict_start = @bytes.index("<<", header_ofs + header[0].length)
  return nil unless dict_start

  dict_end = balanced_from(@bytes, dict_start)
  @bytes[dict_start...dict_end]
end
|
|
22
112
|
|
|
23
113
|
def trailer_dict
|
data/lib/corp_pdf/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: corp_pdf
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.5
|
|
4
|
+
version: 1.0.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Michael Wynkoop
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-01-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: chunky_png
|