corp_pdf 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fe571644d98559aebc1201878125a75eb96d1cda4a01f59d7c62a2783de9006c
4
- data.tar.gz: ead3b549dbffce573981c5ecce1b4a08401a6eb23d407ed50c086388c551bd36
3
+ metadata.gz: 41334cc30a2fbd56ab358ed95ec0a89b7867b059818220abd0663c05315d4caf
4
+ data.tar.gz: 73514589fb5c6ad4c6d66f079de8d854b54ea67254d425091a1cc9be402193b6
5
5
  SHA512:
6
- metadata.gz: a37bc26b4125c885fb121d170212ab75d4f85ce2f0366c53feb93a233128cfdd1ec1d01091ac77ba027bd6ed2daa0d960060a8e5d5a8353ff0d5422cdda5def3
7
- data.tar.gz: d8cbba8496866a6a2e2326ec69a8206a2100d4693dc0d5d233cec930767da4d23bcc56e61ab51fc162ec2bc56be5033e398ae06f5c2e24910f182c476f183091
6
+ metadata.gz: 2b35da0a14dd018b85451e4018622ef235a6b078571ada9fc4b6f949288d14672b26b519e0991ef1605aaec914d324881fe312b33b80ef32dd9e58f6bb949fd4
7
+ data.tar.gz: ae2362ecfa8cc1fa91501293ba2caa9cc5540c237c1db9671e126901ef91c7b4f405cfc86792fe02f6b5b5d4c63b303ba8969df8dd0d765cb6adcb58497a60fe
data/.gitignore CHANGED
@@ -10,4 +10,5 @@ research/
10
10
  pdf_test_script.rb
11
11
  .cursor/
12
12
 
13
- .DS_Store
13
+ .DS_Store
14
+ .env*
data/README.md CHANGED
@@ -11,6 +11,16 @@ A minimal pure Ruby library for parsing and editing PDF AcroForm fields.
11
11
  - ✅ **Minimal PDF Engine** - Basic PDF parser/writer for AcroForm manipulation
12
12
  - ✅ **Ruby 3.1+** - Modern Ruby support
13
13
 
14
+ ## Documentation
15
+
16
+ 📚 **Detailed documentation** is available in the [`docs/`](./docs/) directory:
17
+
18
+ - **[PDF Structure](./docs/pdf_structure.md)** - Understanding PDF file structure and text-based syntax
19
+ - **[DictScan Explained](./docs/dict_scan_explained.md)** - How text traversal parses PDF dictionaries
20
+ - **[Object Streams](./docs/object_streams.md)** - Parsing compressed PDF object streams
21
+ - **[Clearing Fields](./docs/clear_fields.md)** - Complete guide to removing unwanted fields
22
+ - **[Documentation Index](./docs/README.md)** - Overview of all available documentation
23
+
14
24
  ## Installation
15
25
 
16
26
  Add this line to your application's Gemfile:
@@ -54,7 +64,7 @@ fields.each do |field|
54
64
  puts "#{field.name} (#{type_info}) = #{field.value}"
55
65
  end
56
66
 
57
- # Add a new field (using symbol key for type)
67
+ # Add a new field
58
68
  new_field = doc.add_field("NameField",
59
69
  value: "John Doe",
60
70
  x: 100,
@@ -62,17 +72,7 @@ new_field = doc.add_field("NameField",
62
72
  width: 200,
63
73
  height: 20,
64
74
  page: 1,
65
- type: :text # Optional: :text, :button, :choice, :signature (or "/Tx", "/Btn", etc.)
66
- )
67
-
68
- # Or using the PDF type string directly
69
- button_field = doc.add_field("CheckBox",
70
- type: "/Btn", # Or use :button symbol
71
- x: 100,
72
- y: 600,
73
- width: 20,
74
- height: 20,
75
- page: 1
75
+ type: :text
76
76
  )
77
77
 
78
78
  # Update a field value
@@ -87,9 +87,6 @@ doc.remove_field("FieldToRemove")
87
87
  # Write the modified PDF to a file
88
88
  doc.write("output.pdf")
89
89
 
90
- # Or write with flattening (removes incremental updates)
91
- doc.write("output.pdf", flatten: true)
92
-
93
90
  # Or get PDF bytes as a String (returns String, not StringIO)
94
91
  pdf_bytes = doc.write
95
92
  File.binwrite("output.pdf", pdf_bytes)
@@ -99,46 +96,42 @@ File.binwrite("output.pdf", pdf_bytes)
99
96
 
100
97
  #### Working with Field Objects
101
98
 
99
+ Each field returned by `#list_fields` is a `Field` object with properties and methods:
100
+
102
101
  ```ruby
103
102
  doc = CorpPdf::Document.new("form.pdf")
104
103
  fields = doc.list_fields
104
+ field = fields.first
105
105
 
106
106
  # Access field properties
107
- field = fields.first
108
- puts field.name # Field name
109
- puts field.value # Field value
110
- puts field.type # Field type (e.g., "/Tx")
111
- puts field.type_key # Symbol key (e.g., :text) or nil if not mapped
112
- puts field.x # X position
113
- puts field.y # Y position
114
- puts field.width # Width
115
- puts field.height # Height
116
- puts field.page # Page number
117
-
118
- # Fields default to "/Tx" if type is missing from PDF
119
-
120
- # Update a field directly
121
- field.update("New Value")
122
-
123
- # Update and rename a field
124
- field.update("New Value", new_name: "NewName")
125
-
126
- # Remove a field directly
127
- field.remove
128
-
129
- # Check field type
130
- field.text_field? # true for text fields
131
- field.button_field? # true for button/checkbox fields
132
- field.choice_field? # true for choice/dropdown fields
133
- field.signature_field? # true for signature fields
134
-
135
- # Check if field has a value
136
- field.has_value?
137
-
138
- # Check if field has position information
139
- field.has_position?
107
+ field.name # Field name (String)
108
+ field.value # Field value (String or nil)
109
+ field.type # Field type (String, e.g., "/Tx", "/Btn", "/Ch", "/Sig")
110
+ field.type_key # Symbol key (e.g., :text) or nil if not mapped
111
+ field.x # X coordinate (Float or nil)
112
+ field.y # Y coordinate (Float or nil)
113
+ field.width # Field width (Float or nil)
114
+ field.height # Field height (Float or nil)
115
+ field.page # Page number (Integer or nil)
116
+ field.ref # Object reference array [object_number, generation]
117
+
118
+ # Field methods
119
+ field.update("New Value") # Update field value
120
+ field.update("New Value", new_name: "NewName") # Update and rename
121
+ field.remove # Remove the field
122
+ field.text_field? # Check if text field
123
+ field.button_field? # Check if button/checkbox field
124
+ field.choice_field? # Check if choice/dropdown field
125
+ field.signature_field? # Check if signature field
126
+ field.has_value? # Check if field has a value
127
+ field.has_position? # Check if field has position info
128
+ field.object_number # Get object number
129
+ field.generation # Get generation number
130
+ field.valid_ref? # Check if field has valid reference
140
131
  ```
141
132
 
133
+ **Note**: When reading fields from a PDF, if the type is missing or empty, it defaults to `"/Tx"` (text field).
134
+
142
135
  #### Signature Fields with Image Appearances
143
136
 
144
137
  Signature fields can be enhanced with image appearances (signature images). When you update a signature field with image data (base64-encoded JPEG or PNG), CorpPdf will automatically add the image as the field's appearance.
@@ -255,26 +248,33 @@ doc.write("form_with_multiple_groups.pdf")
255
248
 
256
249
  #### Flattening PDFs
257
250
 
251
+ Flattening removes incremental updates from a PDF, creating a clean single-version document:
252
+
258
253
  ```ruby
259
- # Flatten a PDF to remove incremental updates
260
254
  doc = CorpPdf::Document.new("form.pdf")
261
- doc.flatten! # Modifies the document in-place
262
255
 
263
- # Or create a new flattened document
264
- flattened_doc = CorpPdf::Document.flatten_pdf("input.pdf", "output.pdf")
256
+ # Flatten in-place (modifies the document)
257
+ doc.flatten!
258
+
259
+ # Get flattened bytes without modifying the document
260
+ flattened_bytes = doc.flatten
261
+
262
+ # Write with flattening option
263
+ doc.write("output.pdf", flatten: true)
265
264
 
266
- # Or get flattened bytes
267
- flattened_bytes = CorpPdf::Document.flatten_pdf("input.pdf")
265
+ # Class method: flatten from file
266
+ CorpPdf::Document.flatten_pdf("input.pdf", "output.pdf")
267
+ flattened_doc = CorpPdf::Document.flatten_pdf("input.pdf")
268
268
  ```
269
269
 
270
270
  #### Clearing Fields
271
271
 
272
- The `clear` and `clear!` methods allow you to completely remove unwanted fields by rewriting the entire PDF:
272
+ The `clear` and `clear!` methods completely remove unwanted fields by rewriting the entire PDF (more efficient than multiple `remove_field` calls):
273
273
 
274
274
  ```ruby
275
275
  doc = CorpPdf::Document.new("form.pdf")
276
276
 
277
- # Remove all fields matching a pattern
277
+ # Remove fields matching a pattern (in-place)
278
278
  doc.clear!(remove_pattern: /^text-/)
279
279
 
280
280
  # Keep only specific fields
@@ -283,14 +283,17 @@ doc.clear!(keep_fields: ["Name", "Email"])
283
283
  # Remove specific fields
284
284
  doc.clear!(remove_fields: ["OldField1", "OldField2"])
285
285
 
286
- # Use a block to determine which fields to keep
287
- doc.clear! { |name| !name.start_with?("temp_") }
286
+ # Use a block to filter fields (return true to keep)
287
+ doc.clear! { |field| !field.name.start_with?("temp_") }
288
+
289
+ # Get cleared bytes without modifying document
290
+ cleared_bytes = doc.clear(remove_pattern: /.*/)
288
291
 
289
292
  # Write the cleared PDF
290
293
  doc.write("cleared.pdf", flatten: true)
291
294
  ```
292
295
 
293
- **Note:** Unlike `remove_field`, which uses incremental updates, `clear` completely rewrites the PDF to exclude unwanted fields. This is more efficient when removing many fields and ensures complete removal. See [Clearing Fields Documentation](docs/cleaning_fields.md) for detailed information.
296
+ **Note:** Unlike `remove_field`, which uses incremental updates, `clear` completely rewrites the PDF. See [Clearing Fields Documentation](docs/clear_fields.md) for detailed information.
294
297
 
295
298
  ### API Reference
296
299
 
@@ -339,43 +342,25 @@ second_page.add_field("Email", x: 100, y: 650, width: 200, height: 20)
339
342
  - `page.to_h` - Convert to hash for backward compatibility
340
343
 
341
344
  #### `#add_field(name, options)`
342
- Adds a new form field to the document. Options include:
345
+ Adds a new form field to the document. Returns a `Field` object if successful.
346
+
347
+ **Options:**
343
348
  - `value`: Default value for the field (String)
344
- - `x`: X coordinate (Integer, default: 100)
345
- - `y`: Y coordinate (Integer, default: 500)
346
- - `width`: Field width (Integer, default: 100)
347
- - `height`: Field height (Integer, default: 20)
348
- - `page`: Page number to add the field to (Integer, default: 1)
349
- - `type`: Field type (Symbol or String, default: `"/Tx"`). Options:
349
+ - `x`, `y`: Field position coordinates (Integer, defaults: 100, 500)
350
+ - `width`, `height`: Field dimensions (Integer, defaults: 100, 20)
351
+ - `page`: Page number (Integer, default: 1)
352
+ - `type`: Field type (Symbol or String, default: `"/Tx"`)
350
353
  - Symbol keys: `:text`, `:button`, `:choice`, `:signature`, `:radio`
351
354
  - PDF type strings: `"/Tx"`, `"/Btn"`, `"/Ch"`, `"/Sig"`
352
- - `group_id`: Required for radio buttons. String or identifier to group radio buttons together. All radio buttons in the same group must share the same `group_id`.
353
- - `selected`: Optional for radio buttons. Boolean (`true` or `false`, or string `"true"`). If set to `true`, this radio button will be selected by default.
355
+ - `group_id`: Required for radio buttons. Groups related radio buttons together.
356
+ - `selected`: Optional for radio buttons. Set to `true` to select by default.
354
357
 
355
- Returns a `Field` object if successful.
356
-
357
- ```ruby
358
- # Using symbol keys (recommended)
359
- field = doc.add_field("NewField", value: "Value", x: 100, y: 500, width: 200, height: 20, page: 1, type: :text)
360
-
361
- # Using PDF type strings
362
- field = doc.add_field("ButtonField", type: "/Btn", x: 100, y: 500, width: 20, height: 20, page: 1)
363
-
364
- # Radio button example
365
- field = doc.add_field("Option1", type: :radio, group_id: "my_group", value: "option1", x: 100, y: 500, width: 20, height: 20, page: 1, selected: true)
366
- ```
358
+ See [Radio Buttons](#radio-buttons) section for radio button examples.
367
359
 
368
360
  #### `#update_field(name, new_value, new_name: nil)`
369
- Updates a field's value and optionally renames it. For signature fields, if `new_value` looks like image data (base64-encoded JPEG/PNG or a data URI), it will automatically add the image as the field's appearance. Returns `true` if successful, `false` if field not found.
361
+ Updates a field's value and optionally renames it. Returns `true` if successful, `false` if field not found.
370
362
 
371
- ```ruby
372
- doc.update_field("FieldName", "New Value")
373
- doc.update_field("OldName", "New Value", new_name: "NewName")
374
-
375
- # For signature fields with images:
376
- doc.update_field("SignatureField", base64_image_data) # Base64-encoded JPEG or PNG
377
- doc.update_field("SignatureField", "data:image/png;base64,...") # Data URI format
378
- ```
363
+ For signature fields, if `new_value` is base64-encoded JPEG/PNG or a data URI, it automatically adds the image as the field's appearance. See [Signature Fields](#signature-fields-with-image-appearances) section for examples.
379
364
 
380
365
  #### `#remove_field(name_or_field)`
381
366
  Removes a form field by name (String) or Field object. Returns `true` if successful, `false` if field not found.
@@ -388,88 +373,22 @@ doc.remove_field(field_object)
388
373
  #### `#write(path_out = nil, flatten: false)`
389
374
  Writes the modified PDF. If `path_out` is provided, writes to that file path and returns `true`. If no path is provided, returns the PDF bytes as a String. The `flatten` option removes incremental updates from the PDF.
390
375
 
391
- ```ruby
392
- doc.write("output.pdf") # Write to file
393
- doc.write("output.pdf", flatten: true) # Write flattened PDF to file
394
- pdf_bytes = doc.write # Get PDF bytes as String
395
- ```
396
-
397
- #### `#flatten`
398
- Returns flattened PDF bytes (removes incremental updates) without modifying the document.
399
-
400
- ```ruby
401
- flattened_bytes = doc.flatten
402
- ```
403
-
404
- #### `#flatten!`
405
- Flattens the PDF in-place (modifies the current document instance).
406
-
407
- ```ruby
408
- doc.flatten!
409
- ```
376
+ #### `#flatten` and `#flatten!`
377
+ Flattening methods. `#flatten` returns flattened PDF bytes without modifying the document. `#flatten!` flattens the PDF in-place.
410
378
 
411
379
  #### `CorpPdf::Document.flatten_pdf(input_path, output_path = nil)`
412
380
  Class method to flatten a PDF. If `output_path` is provided, writes to that path and returns the path. Otherwise returns a new `Document` instance with the flattened content.
413
381
 
414
- ```ruby
415
- CorpPdf::Document.flatten_pdf("input.pdf", "output.pdf")
416
- flattened_doc = CorpPdf::Document.flatten_pdf("input.pdf")
417
- ```
418
-
419
382
  #### `#clear(options = {})` and `#clear!(options = {})`
420
- Removes unwanted fields by rewriting the entire PDF. `clear` returns cleared PDF bytes without modifying the document, while `clear!` modifies the document in-place. Options include:
383
+ Removes unwanted fields by rewriting the entire PDF. `clear` returns cleared PDF bytes without modifying the document, while `clear!` modifies the document in-place.
421
384
 
385
+ **Options:**
422
386
  - `keep_fields`: Array of field names to keep (all others removed)
423
387
  - `remove_fields`: Array of field names to remove
424
388
  - `remove_pattern`: Regex pattern - fields matching this are removed
425
- - Block: Given field name, return `true` to keep, `false` to remove
426
-
427
- ```ruby
428
- # Remove all fields
429
- cleared = doc.clear(remove_pattern: /.*/)
430
-
431
- # Remove fields matching pattern (in-place)
432
- doc.clear!(remove_pattern: /^text-/)
433
-
434
- # Keep only specific fields
435
- doc.clear!(keep_fields: ["Name", "Email"])
436
-
437
- # Use block to filter fields (return true to remove)
438
- doc.clear! { |field| field.name.match?(/^[a-f0-9-]{30,}/) }
439
- ```
389
+ - Block: Given field object, return `true` to keep, `false` to remove
440
390
 
441
- **Note:** This completely rewrites the PDF (like `flatten`), so it's more efficient than using `remove_field` multiple times. See [Clearing Fields Documentation](docs/cleaning_fields.md) for detailed information.
442
-
443
- ### Field Object
444
-
445
- Each field returned by `#list_fields` is a `Field` object with the following attributes and methods:
446
-
447
- #### Attributes
448
- - `name`: Field name (String)
449
- - `value`: Field value (String or nil)
450
- - `type`: Field type (String, e.g., "/Tx", "/Btn", "/Ch", "/Sig"). Defaults to "/Tx" if missing from PDF.
451
- - `ref`: Object reference array `[object_number, generation]`
452
- - `x`: X coordinate (Float or nil)
453
- - `y`: Y coordinate (Float or nil)
454
- - `width`: Field width (Float or nil)
455
- - `height`: Field height (Float or nil)
456
- - `page`: Page number (Integer or nil)
457
-
458
- #### Methods
459
- - `#update(new_value, new_name: nil)`: Update the field's value and optionally rename it
460
- - `#remove`: Remove the field from the document
461
- - `#type_key`: Returns the symbol key for the type (e.g., `:text` for `"/Tx"`) or `nil` if not mapped
462
- - `#text_field?`: Returns true if field is a text field
463
- - `#button_field?`: Returns true if field is a button/checkbox field
464
- - `#choice_field?`: Returns true if field is a choice/dropdown field
465
- - `#signature_field?`: Returns true if field is a signature field
466
- - `#has_value?`: Returns true if field has a non-empty value
467
- - `#has_position?`: Returns true if field has position information
468
- - `#object_number`: Returns the object number (first element of ref)
469
- - `#generation`: Returns the generation number (second element of ref)
470
- - `#valid_ref?`: Returns true if field has a valid reference (not a placeholder)
471
-
472
- **Note**: When reading fields from a PDF, if the type is missing or empty, it defaults to `"/Tx"` (text field). The `type_key` method allows you to get the symbol representation (e.g., `:text`) from the type string.
391
+ See [Clearing Fields](#clearing-fields) section for examples.
473
392
 
474
393
  ## Example
475
394
 
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+
3
5
  module CorpPdf
4
6
  # Parses xref (tables and streams) and exposes object bodies uniformly,
5
7
  # including objects embedded in /ObjStm. Also gives you the trailer and /Root.
@@ -14,10 +16,98 @@ module CorpPdf
14
16
  end
15
17
 
16
18
  def root_ref
19
+ # First try the current trailer_dict
17
20
  tr = trailer_dict
18
- return nil unless tr =~ %r{/Root\s+(\d+)\s+(\d+)\s+R}
21
+ if tr && tr =~ %r{/Root\s+(\d+)\s+(\d+)\s+R}
22
+ return [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
23
+ end
24
+
25
+ # If not found, search through all trailers by following /Prev chain
26
+ start = find_startxref(@bytes) or return nil
27
+ root_ref_from_trailer_chain(start)
28
+ end
29
+
30
+ def root_ref_from_trailer_chain(offset)
31
+ visited = Set.new
32
+
33
+ loop do
34
+ return nil if visited.include?(offset)
35
+ visited.add(offset)
36
+
37
+ # Get trailer at this offset
38
+ tr = get_trailer_at_offset(offset)
39
+ return nil unless tr
40
+
41
+ # Check if this trailer has /Root
42
+ if tr =~ %r{/Root\s+(\d+)\s+(\d+)\s+R}
43
+ return [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
44
+ end
45
+
46
+ # Follow /Prev pointer if present
47
+ prev_tok = DictScan.value_token_after("/Prev", tr)
48
+ break unless prev_tok && (prev_ofs = prev_tok.to_i).positive?
49
+ offset = prev_ofs
50
+ end
51
+
52
+ nil
53
+ end
54
+
55
+ def get_trailer_at_offset(offset)
56
+ if @bytes[offset, 4] == "xref"
57
+ # Extract trailer from classic xref without modifying state
58
+ extract_trailer_from_classic_xref(offset)
59
+ else
60
+ # Xref stream case - extract dictionary without modifying state
61
+ extract_trailer_from_xref_stream(offset)
62
+ end
63
+ end
64
+
65
+ def extract_trailer_from_classic_xref(start)
66
+ pos = @bytes.rindex("xref", start) or return nil
67
+ i = pos + 4
68
+
69
+ # Skip xref entries
70
+ loop do
71
+ m = /\s*(\d+)\s+(\d+)/m.match(@bytes, i) or break
72
+ first = m[1].to_i
73
+ count = m[2].to_i
74
+ i = m.end(0)
75
+
76
+ count.times do |_k|
77
+ # Skip whitespace/newlines before the 20-byte record
78
+ i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D, 0x20].include?(ch)
79
+ i += 20
80
+ # consume line ending(s)
81
+ i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D].include?(ch)
82
+ end
83
+
84
+ break if @bytes[i, 7] == "trailer"
85
+ end
86
+
87
+ tpos = @bytes.index("trailer", i)
88
+ return nil unless tpos
89
+
90
+ dpos = @bytes.index("<<", tpos)
91
+ return nil unless dpos
19
92
 
20
- [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
93
+ dend = balanced_from(@bytes, dpos)
94
+ @bytes[dpos...dend]
95
+ end
96
+
97
+ def extract_trailer_from_xref_stream(header_ofs)
98
+ # Expect "<num> <gen> obj" at header_ofs
99
+ m = /\A(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 50])
100
+ unless m
101
+ # Sometimes header_ofs might land on whitespace; search forward a bit
102
+ win = @bytes[header_ofs, 256]
103
+ m2 = /(\d+)\s+(\d+)\s+obj\b/m.match(win) or return nil
104
+ header_ofs += m2.begin(0)
105
+ m = m2
106
+ end
107
+
108
+ dpos = @bytes.index("<<", header_ofs + m[0].length) or return nil
109
+ dend = balanced_from(@bytes, dpos)
110
+ @bytes[dpos...dend]
21
111
  end
22
112
 
23
113
  def trailer_dict
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CorpPdf
4
- VERSION = "1.0.5"
4
+ VERSION = "1.0.6"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: corp_pdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5
4
+ version: 1.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Wynkoop
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-11-13 00:00:00.000000000 Z
11
+ date: 2026-01-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: chunky_png