acro_that 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Gemfile.lock +1 -1
- data/README.md +49 -0
- data/docs/README.md +12 -0
- data/docs/clear_fields.md +202 -0
- data/issues/README.md +38 -0
- data/issues/refactoring-opportunities.md +269 -0
- data/lib/acro_that/actions/add_field.rb +2 -55
- data/lib/acro_that/actions/add_signature_appearance.rb +3 -3
- data/lib/acro_that/actions/base.rb +4 -0
- data/lib/acro_that/actions/remove_field.rb +1 -5
- data/lib/acro_that/dict_scan.rb +7 -0
- data/lib/acro_that/document.rb +480 -45
- data/lib/acro_that/version.rb +1 -1
- data/lib/acro_that.rb +1 -0
- data/publish +183 -0
- metadata +5 -1
data/lib/acro_that/document.rb
CHANGED
|
@@ -71,62 +71,185 @@ module AcroThat
|
|
|
71
71
|
self
|
|
72
72
|
end
|
|
73
73
|
|
|
74
|
-
# Return an array of
|
|
75
|
-
def
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
74
|
+
# Return an array of page information hashes, in the order given by find_all_pages.
#
# Each entry contains:
#   :page     - 1-based page number (position of the page in find_all_pages)
#   :width    - /MediaBox width, or nil when no four-number /MediaBox is present
#   :height   - /MediaBox height, or nil likewise
#   :ref      - the page reference [obj_num, gen_num]
#   :metadata - hash with :rotate, :media_box, :crop_box, :art_box, :bleed_box,
#               :trim_box, :resources_ref, :parent_ref and :contents_refs
#
# Pages whose body cannot be resolved are skipped; their page number is not reused.
def list_pages
  find_all_pages.each_with_index.filter_map do |ref, index|
    body = @resolver.object_body(ref)
    next unless body

    # Only the MediaBox drives width/height; the other boxes are reported as metadata.
    media_box = page_box_from(body, "MediaBox")

    # Accept a leading minus sign so rotations such as -90 are not dropped.
    rotation = body[%r{/Rotate\s+(-?\d+)}, 1]

    # /Contents is either a single indirect reference or an array of references.
    single_content = page_ref_from(body, "Contents")
    contents_refs =
      if single_content
        [single_content]
      else
        body[%r{/Contents\s*\[([^\]]*)\]}, 1].to_s
                                             .scan(/(\d+)\s+(\d+)\s+R/)
                                             .map { |num, gen| [num.to_i, gen.to_i] }
      end

    {
      page: index + 1, # Page number starting at 1
      width: media_box && (media_box[:urx] - media_box[:llx]),
      height: media_box && (media_box[:ury] - media_box[:lly]),
      ref: ref,
      metadata: {
        rotate: rotation&.to_i,
        media_box: media_box,
        crop_box: page_box_from(body, "CropBox"),
        art_box: page_box_from(body, "ArtBox"),
        bleed_box: page_box_from(body, "BleedBox"),
        trim_box: page_box_from(body, "TrimBox"),
        resources_ref: page_ref_from(body, "Resources"),
        parent_ref: page_ref_from(body, "Parent"),
        contents_refs: contents_refs
      }
    }
  end
end

# Parse an inline rectangle entry such as "/MediaBox [0 0 612 792]" from a page body.
# The array may span several lines. Returns { llx:, lly:, urx:, ury: } as floats, or nil
# when the entry is absent or does not hold exactly four numbers.
def page_box_from(body, name)
  raw = body[%r{/#{name}\s*\[([^\]]*)\]}, 1]
  return nil unless raw

  values = raw.scan(/[-+]?\d*\.?\d+/).map(&:to_f)
  return nil unless values.length == 4

  llx, lly, urx, ury = values
  { llx: llx, lly: lly, urx: urx, ury: ury }
end

# Read an indirect reference entry such as "/Parent 2 0 R" from a page body.
# Returns [obj_num, gen_num], or nil when the entry is absent or not an indirect reference.
def page_ref_from(body, name)
  found = body.match(%r{/#{name}\s+(\d+)\s+(\d+)\s+R})
  # to_i (base 10) rather than Integer(), which raises on zero-padded digits like "08".
  found && [found[1].to_i, found[2].to_i]
end

private :page_box_from, :page_ref_from
|
|
193
|
+
|
|
194
|
+
# Return an array of Field(name, value, type, ref)
|
|
195
|
+
def list_fields
|
|
196
|
+
fields = []
|
|
197
|
+
field_widgets = {}
|
|
198
|
+
widgets_by_name = {}
|
|
199
|
+
|
|
200
|
+
# First pass: collect widget information
|
|
126
201
|
@resolver.each_object do |ref, body|
|
|
127
|
-
next unless body
|
|
202
|
+
next unless body
|
|
203
|
+
|
|
204
|
+
is_widget = DictScan.is_widget?(body)
|
|
205
|
+
|
|
206
|
+
# Collect widget information if this is a widget
|
|
207
|
+
if is_widget
|
|
208
|
+
# Extract position from widget
|
|
209
|
+
rect_tok = DictScan.value_token_after("/Rect", body)
|
|
210
|
+
if rect_tok && rect_tok.start_with?("[")
|
|
211
|
+
# Parse [x y x+width y+height] format
|
|
212
|
+
rect_values = rect_tok.scan(/[-+]?\d*\.?\d+/).map(&:to_f)
|
|
213
|
+
if rect_values.length == 4
|
|
214
|
+
x, y, x2, y2 = rect_values
|
|
215
|
+
width = x2 - x
|
|
216
|
+
height = y2 - y
|
|
128
217
|
|
|
129
|
-
|
|
218
|
+
page_num = nil
|
|
219
|
+
if body =~ %r{/P\s+(\d+)\s+(\d+)\s+R}
|
|
220
|
+
page_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
|
|
221
|
+
page_num = find_page_number_for_ref(page_ref)
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
widget_info = {
|
|
225
|
+
x: x, y: y, width: width, height: height, page: page_num
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
if body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
|
|
229
|
+
parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
|
|
230
|
+
|
|
231
|
+
field_widgets[parent_ref] ||= []
|
|
232
|
+
field_widgets[parent_ref] << widget_info
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
if body.include?("/T")
|
|
236
|
+
t_tok = DictScan.value_token_after("/T", body)
|
|
237
|
+
if t_tok
|
|
238
|
+
widget_name = DictScan.decode_pdf_string(t_tok)
|
|
239
|
+
if widget_name && !widget_name.empty?
|
|
240
|
+
widgets_by_name[widget_name] ||= []
|
|
241
|
+
widgets_by_name[widget_name] << widget_info
|
|
242
|
+
end
|
|
243
|
+
end
|
|
244
|
+
end
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
# Second pass: collect all fields (both field objects and widget annotations with /T)
|
|
250
|
+
next unless body.include?("/T")
|
|
251
|
+
|
|
252
|
+
is_widget_field = is_widget
|
|
130
253
|
hint = body.include?("/FT") || is_widget_field || body.include?("/Kids") || body.include?("/Parent")
|
|
131
254
|
next unless hint
|
|
132
255
|
|
|
@@ -143,8 +266,7 @@ module AcroThat
|
|
|
143
266
|
type = ft_tok
|
|
144
267
|
|
|
145
268
|
position = {}
|
|
146
|
-
|
|
147
|
-
if is_widget_annot
|
|
269
|
+
if is_widget
|
|
148
270
|
rect_tok = DictScan.value_token_after("/Rect", body)
|
|
149
271
|
if rect_tok && rect_tok.start_with?("[")
|
|
150
272
|
rect_values = rect_tok.scan(/[-+]?\d*\.?\d+/).map(&:to_f)
|
|
@@ -270,8 +392,260 @@ module AcroThat
|
|
|
270
392
|
field.remove
|
|
271
393
|
end
|
|
272
394
|
|
|
395
|
+
# Clean up the PDF by removing unwanted fields together with their widget annotations.
#
# One selection criterion is applied, checked in this order:
#   - block:          called with each field name; truthy keeps the field, falsy removes it
#   - keep_fields:    Array of field names to keep (all others are removed)
#   - remove_fields:  Array of field names to remove
#   - remove_pattern: Regex pattern - fields whose name matches are removed
#
# With no criterion the current raw document (@raw) is returned unchanged.
# Otherwise the entire PDF is rewritten (like flatten) without the removed field objects
# and widgets, and references to them are dropped from the AcroForm /Fields array and
# from page /Annots arrays. /Annots entries that point at objects missing from the output,
# or at widgets whose /Parent is not a field in the output, are dropped as well.
#
# Returns the value of PDFWriter#output for the rewritten document.
# Raises RuntimeError when the resolver reports no /Root.
def clear(keep_fields: nil, remove_fields: nil, remove_pattern: nil)
  root_ref = @resolver.root_ref
  raise "Cannot clear: no /Root found" unless root_ref

  all_fields = list_fields

  # Decide which field names go away.
  fields_to_remove = Set.new
  if block_given?
    all_fields.each { |field| fields_to_remove.add(field.name) unless yield(field.name) }
  elsif keep_fields
    keep_set = Set.new(keep_fields.map(&:to_s))
    all_fields.each { |field| fields_to_remove.add(field.name) unless keep_set.include?(field.name) }
  elsif remove_fields
    remove_set = Set.new(remove_fields.map(&:to_s))
    all_fields.each { |field| fields_to_remove.add(field.name) if remove_set.include?(field.name) }
  elsif remove_pattern
    all_fields.each { |field| fields_to_remove.add(field.name) if field.name =~ remove_pattern }
  else
    # No criteria specified, return original
    return @raw
  end

  # Index the removed fields so widgets can be matched in a single pass over the objects
  # (instead of one full scan per removed field).
  removed_fields = all_fields.select { |field| fields_to_remove.include?(field.name) }
  field_refs_to_remove = Set.new
  removed_refs = Set.new
  removed_refs_by_name = Hash.new { |hash, name| hash[name] = [] }
  removed_fields.each do |field|
    field_refs_to_remove.add(field.ref) if field.valid_ref?
    removed_refs.add(field.ref)
    removed_refs_by_name[field.name] << field.ref
  end

  # A widget is removed when it belongs to a removed field, either through /Parent or
  # by carrying the same /T name. An object that is itself the removed field is handled
  # through field_refs_to_remove, hence the "different ref" conditions.
  widget_refs_to_remove = Set.new
  unless removed_fields.empty?
    @resolver.each_object do |widget_ref, body|
      next unless body && DictScan.is_widget?(body)

      parent = body.match(%r{/Parent\s+(\d+)\s+(\d+)\s+R})
      if parent
        parent_ref = [parent[1].to_i, parent[2].to_i]
        if parent_ref != widget_ref && removed_refs.include?(parent_ref)
          widget_refs_to_remove.add(widget_ref)
          next
        end
      end

      next unless body.include?("/T")

      t_tok = DictScan.value_token_after("/T", body)
      next unless t_tok

      widget_name = DictScan.decode_pdf_string(t_tok)
      next unless widget_name && removed_refs_by_name.key?(widget_name)

      same_name_refs = removed_refs_by_name[widget_name]
      widget_refs_to_remove.add(widget_ref) if same_name_refs.any? { |ref| ref != widget_ref }
    end
  end

  # Collect objects to write (excluding removed fields and widgets)
  objects = []
  @resolver.each_object do |ref, body|
    next if field_refs_to_remove.include?(ref) || widget_refs_to_remove.include?(ref)
    next unless body

    objects << { ref: ref, body: body }
  end

  # Ref => entry lookup; the first entry wins, matching a linear find over `objects`.
  objects_by_ref = {}
  objects.each { |obj| objects_by_ref[obj[:ref]] ||= obj }

  # Remove field references from the AcroForm /Fields array.
  af_ref = acroform_ref
  af_obj = af_ref && objects_by_ref[af_ref]
  if af_obj
    af_body = af_obj[:body]
    fields_tok = DictScan.value_token_after("/Fields", af_body)
    fields_indirect = fields_tok&.match(/\A(\d+)\s+(\d+)\s+R/)

    if fields_indirect
      # /Fields points to a separate array object
      arr_obj = objects_by_ref[[fields_indirect[1].to_i, fields_indirect[2].to_i]]
      if arr_obj
        arr_body = field_refs_to_remove.reduce(arr_obj[:body]) do |text, field_ref|
          DictScan.remove_ref_from_array(text, field_ref)
        end
        # Clean up empty array
        arr_obj[:body] = arr_body.strip.gsub(/\[\s+\]/, "[]")
      end
    elsif af_body.include?("/Fields")
      # Inline /Fields array
      af_obj[:body] = field_refs_to_remove.reduce(af_body) do |text, field_ref|
        DictScan.remove_ref_from_inline_array(text, "/Fields", field_ref)
      end
    end
  end

  # Field objects that survive. This must be complete BEFORE any page is inspected:
  # a page can precede its fields in object order, and a partially filled set would
  # make kept widgets look orphaned.
  field_refs_in_file = Set.new
  objects.each do |obj|
    body = obj[:body]
    field_refs_in_file.add(obj[:ref]) if body.include?("/FT") && body.include?("/T")
  end

  # True when an /Annots entry should be dropped: its target is not written, or it is
  # a widget whose /Parent is not a surviving field.
  orphaned = lambda do |annot_ref|
    target = objects_by_ref[annot_ref]
    if target.nil?
      true
    elsif DictScan.is_widget?(target[:body])
      parent = target[:body].match(%r{/Parent\s+(\d+)\s+(\d+)\s+R})
      parent ? !field_refs_in_file.include?([parent[1].to_i, parent[2].to_i]) : false
    else
      false
    end
  end
  strip_ref = lambda do |text, ref|
    text.gsub(/\b#{ref[0]}\s+#{ref[1]}\s+R\b/, "").strip.gsub(/\s+/, " ")
  end
  refs_in = ->(text) { text.scan(/(\d+)\s+(\d+)\s+R/).map { |num, gen| [num.to_i, gen.to_i] } }
  # [^\]]* (rather than .*?) so an array broken across lines is still recognised.
  inline_annots = %r{/Annots\s*\[([^\]]*)\]}

  # Remove widget references from page /Annots arrays.
  objects.each do |obj|
    body = obj[:body]
    next unless DictScan.is_page?(body)

    inline = body.match(inline_annots)
    indirect = inline ? nil : body.match(%r{/Annots\s+(\d+)\s+(\d+)\s+R})

    if inline
      # Handle inline /Annots array
      entries = widget_refs_to_remove.reduce(inline[1]) { |text, ref| strip_ref.call(text, ref) }
      stale = refs_in.call(entries).select { |ref| orphaned.call(ref) }
      entries = stale.reduce(entries) { |text, ref| strip_ref.call(text, ref) }

      new_annots = entries.strip.empty? ? "[]" : "[#{entries}]"
      # Block form so the replacement text is never parsed for backreferences.
      obj[:body] = body.sub(inline_annots) { "/Annots #{new_annots}" }
    elsif indirect
      # Handle indirect /Annots array reference
      annots_obj = objects_by_ref[[indirect[1].to_i, indirect[2].to_i]]
      next unless annots_obj

      annots_body = widget_refs_to_remove.reduce(annots_obj[:body]) do |text, ref|
        DictScan.remove_ref_from_array(text, ref)
      end
      stale = refs_in.call(annots_body).select { |ref| orphaned.call(ref) }
      annots_obj[:body] = stale.reduce(annots_body) { |text, ref| DictScan.remove_ref_from_array(text, ref) }
    end
  end

  # Sort objects by object number
  objects.sort_by! { |obj| obj[:ref][0] }

  # Write the cleaned PDF
  writer = PDFWriter.new
  writer.write_header
  objects.each { |obj| writer.write_object(obj[:ref], obj[:body]) }
  writer.write_xref

  # Carry the /Info reference over from the existing trailer when there is one.
  info = @resolver.trailer_dict&.match(%r{/Info\s+(\d+)\s+(\d+)\s+R})
  info_ref = info && [info[1].to_i, info[2].to_i]

  # Write trailer
  max_obj_num = objects.map { |obj| obj[:ref][0] }.max || 0
  writer.write_trailer(max_obj_num + 1, root_ref, info_ref)

  writer.output
end
|
|
636
|
+
|
|
637
|
+
# Clean up in-place (mutates current instance).
#
# Accepts the same arguments and block as #clear and forwards them unchanged.
# When #clear hands back the current raw document itself (no criteria were given),
# nothing is modified, so pending patches are not thrown away by a no-op call.
# Otherwise the raw content is replaced by the cleaned output, the resolver is rebuilt
# on top of it and the patch list is reset.
#
# Returns self.
def clear!(...)
  cleaned_content = clear(...)
  return self if cleaned_content.equal?(@raw)

  @raw = cleaned_content
  @resolver = AcroThat::ObjectResolver.new(cleaned_content)
  @patches = []

  self
end
|
|
646
|
+
|
|
273
647
|
# Write out with an incremental update
|
|
274
|
-
def write(path_out = nil, flatten:
|
|
648
|
+
def write(path_out = nil, flatten: true)
|
|
275
649
|
deduped_patches = @patches.reverse.uniq { |p| p[:ref] }.reverse
|
|
276
650
|
writer = AcroThat::IncrementalWriter.new(@raw, deduped_patches)
|
|
277
651
|
@raw = writer.render
|
|
@@ -290,14 +664,75 @@ module AcroThat
|
|
|
290
664
|
|
|
291
665
|
private
|
|
292
666
|
|
|
293
|
-
def
|
|
667
|
+
# Walk a page tree node depth-first and append its leaf pages to page_objects in
# /Kids order.
#
# pages_ref    - [obj_num, gen_num] of the /Pages node to walk
# page_objects - array that receives page references; refs already present are not
#                added twice
# visited      - nodes already walked during this traversal; callers normally omit it.
#                It stops a malformed tree whose /Kids point back at an ancestor from
#                recursing without bound.
def collect_pages_from_tree(pages_ref, page_objects, visited = {})
  return if visited[pages_ref]

  visited[pages_ref] = true

  pages_body = @resolver.object_body(pages_ref)
  return unless pages_body

  # Extract /Kids array from Pages object (it may span several lines)
  kids_array = pages_body[%r{/Kids\s*\[(.*?)\]}m, 1]
  return unless kids_array

  # Extract all object references from Kids array in order
  kids_array.scan(/(\d+)\s+(\d+)\s+R/) do |num_str, gen_str|
    kid_ref = [num_str.to_i, gen_str.to_i]
    kid_body = @resolver.object_body(kid_ref)
    next unless kid_body

    if DictScan.is_page?(kid_body)
      page_objects << kid_ref unless page_objects.include?(kid_ref)
    elsif kid_body.match?(%r{/Type\s*/Pages\b})
      # Intermediate node, written either as "/Type /Pages" or compactly as "/Type/Pages".
      collect_pages_from_tree(kid_ref, page_objects, visited)
    end
  end
end
|
|
689
|
+
|
|
690
|
+
# Find all page objects in document order.
#
# The page tree referenced by the catalog's /Pages entry is walked first. When that
# yields nothing (no /Root, no /Pages reference, or no pages collected from the tree),
# every object accepted by DictScan.is_page? is collected instead and ordered by
# [obj_num, gen_num], since tree order is not available in that case.
#
# Returns an array of page references [obj_num, gen_num].
def find_all_pages
  page_objects = []

  # First, try to get pages in document order via page tree
  root_ref = @resolver.root_ref
  catalog_body = root_ref && @resolver.object_body(root_ref)
  pages_entry = catalog_body&.match(%r{/Pages\s+(\d+)\s+(\d+)\s+R})
  collect_pages_from_tree([pages_entry[1].to_i, pages_entry[2].to_i], page_objects) if pages_entry
  return page_objects unless page_objects.empty?

  # Fallback: collect all page objects if page tree didn't work
  @resolver.each_object do |ref, body|
    page_objects << ref if body && DictScan.is_page?(body)
  end

  # Dedupe in one pass and sort by the whole reference so the order is deterministic.
  page_objects.uniq.sort
end
|
|
721
|
+
|
|
722
|
+
# Find a page by its page number (1-indexed).
#
# Returns the page reference [obj_num, gen_num]. When page_num is nil, cannot be read
# as an integer, or lies outside 1..page count, the first page is returned instead.
# Returns nil only when the document has no pages at all.
def find_page_by_number(page_num)
  page_objects = find_all_pages
  return nil if page_objects.empty?

  # exception: false turns nil / non-numeric input into nil instead of raising,
  # so such input takes the same "default to first page" path as an out-of-range number.
  number = Integer(page_num, exception: false)
  return page_objects.first unless number&.between?(1, page_objects.length)

  page_objects[number - 1]
end
|
|
732
|
+
|
|
733
|
+
def find_page_number_for_ref(page_ref)
|
|
734
|
+
page_objects = find_all_pages
|
|
735
|
+
|
|
301
736
|
return nil if page_objects.empty?
|
|
302
737
|
|
|
303
738
|
page_index = page_objects.index(page_ref)
|
data/lib/acro_that/version.rb
CHANGED