corp_pdf 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,782 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CorpPdf
4
+ class Document
5
+ attr_reader :path
6
+
7
# Flatten a PDF so that it no longer carries incremental updates.
#
# @param input_path [String, IO] anything accepted by Document.new
# @param output_path [String, nil] file to write the flattened bytes to
# @return [String, Document] +output_path+ when one was given, otherwise a
#   new Document built from the flattened bytes
def self.flatten_pdf(input_path, output_path = nil)
  flattened = new(input_path).flatten
  return new(StringIO.new(flattened)) unless output_path

  File.binwrite(output_path, flattened)
  output_path
end
18
+
19
# Load a PDF from a file path or from an IO-like object.
#
# @param path_or_io [String, #binmode, #read] a String is treated as a file
#   path and read in binary mode; anything else is switched to binary mode
#   and read in full
def initialize(path_or_io)
  if path_or_io.is_a?(String)
    @path = path_or_io
    raw_bytes = File.binread(path_or_io)
  else
    @path = nil
    path_or_io.binmode
    raw_bytes = path_or_io.read
  end

  # Unwrap the PDF when the bytes are a multipart form-data upload
  @raw = extract_pdf_from_form_data(raw_bytes).freeze
  @resolver = CorpPdf::ObjectResolver.new(@raw)
  # Pending object patches, applied by #write
  @patches = []
  # Radio button groups: group_id -> parent_field_ref
  @radio_groups = {}
end
34
+
35
# Rebuild the document as a single revision (no incremental updates).
#
# Every object the resolver reports with a body is written once, in
# ascending object-number order, followed by a fresh xref and trailer.
#
# @return the value of PDFWriter#output for the rewritten document
# @raise [RuntimeError] when the resolver cannot find a /Root reference
def flatten
  root_ref = @resolver.root_ref
  raise "Cannot flatten: no /Root found" unless root_ref

  # Remember only the references here; bodies are fetched again one at a
  # time while writing so they are not all held at once.
  live_refs = []
  @resolver.each_object { |ref, body| live_refs << ref if body }
  highest_obj_num = [0, *live_refs.map { |ref| ref[0] }].max

  writer = PDFWriter.new
  writer.write_header

  live_refs.sort_by { |ref| ref[0] }.each do |ref|
    body = @resolver.object_body(ref)
    writer.write_object(ref, body) if body
  end

  writer.write_xref

  # Carry the /Info reference over from the old trailer when there is one
  trailer = @resolver.trailer_dict
  info_ref = if trailer =~ %r{/Info\s+(\d+)\s+(\d+)\s+R}
               [::Regexp.last_match(1).to_i, ::Regexp.last_match(2).to_i]
             end

  writer.write_trailer(highest_obj_num + 1, root_ref, info_ref)
  writer.output
end
76
+
77
# Flatten this document in place: the instance switches to the flattened
# bytes, gets a fresh resolver, and drops any pending patches.
#
# @return [Document] self
def flatten!
  @raw = flatten.freeze
  @resolver.clear_cache
  @resolver = CorpPdf::ObjectResolver.new(@raw)
  @patches = []

  self
end
87
+
88
# Describe every page of the document, in page order.
#
# Each entry is built as Page.new(number, width, height, ref, metadata, self):
# * number   - 1-based position of the page in find_all_pages
# * width / height - taken from the page's own /MediaBox, nil when absent
# * metadata - Hash with :rotate, :media_box, :crop_box, :art_box,
#   :bleed_box, :trim_box, :resources_ref, :parent_ref, :contents_refs
#
# Numbers are always parsed as base 10, /Rotate may be negative, and a
# /Contents array may span several lines.
#
# @return [Array<Page>] one entry per page whose object body could be resolved
def list_pages
  # Resolve "/Key n g R" inside a page dictionary to [n, g], or nil.
  indirect_ref = lambda do |body, key|
    found = body.match(%r{/#{key}\s+(\d+)\s+(\d+)\s+R})
    found && [Integer(found[1], 10), Integer(found[2], 10)]
  end

  find_all_pages.each_with_index.filter_map do |ref, index|
    body = @resolver.object_body(ref)
    next unless body

    # Page size comes from /MediaBox only; the other boxes are reported as-is
    media_box = DictScan.parse_box(body, "MediaBox")
    width = nil
    height = nil
    if media_box
      width = media_box[:urx] - media_box[:llx]
      height = media_box[:ury] - media_box[:lly]
    end

    # Allow a leading minus: negative multiples of 90 are still rotations
    rotate_tok = body[%r{/Rotate\s+(-?\d+)}, 1]
    rotate = rotate_tok && Integer(rotate_tok, 10)

    # /Contents is either one indirect reference or an array of them; the
    # array form needs /m because producers may wrap it across lines.
    contents_refs =
      if (single = indirect_ref.call(body, "Contents"))
        [single]
      elsif (array_src = body[%r{/Contents\s*\[(.*?)\]}m, 1])
        array_src.scan(/(\d+)\s+(\d+)\s+R/).map { |num_str, gen_str| [num_str.to_i, gen_str.to_i] }
      else
        []
      end

    metadata = {
      rotate: rotate,
      media_box: media_box,
      crop_box: DictScan.parse_box(body, "CropBox"),
      art_box: DictScan.parse_box(body, "ArtBox"),
      bleed_box: DictScan.parse_box(body, "BleedBox"),
      trim_box: DictScan.parse_box(body, "TrimBox"),
      resources_ref: indirect_ref.call(body, "Resources"),
      parent_ref: indirect_ref.call(body, "Parent"),
      contents_refs: contents_refs
    }

    # Page numbers start at 1; the document itself is handed to the Page
    Page.new(index + 1, width, height, ref, metadata, self)
  end
end
169
+
170
# List the form fields of the document.
#
# Every object with a /T name that also looks like a field (has /FT, /Kids,
# /Parent, or is a widget annotation) becomes one
# Field.new(name, value, type, ref, self, position). When no field is found
# that way, the raw bytes are scanned dictionary by dictionary as a fallback
# and those fields get the placeholder ref [-1, 0].
#
# Position lookup:
# * a widget uses its own /Rect (plus :page when it has a /P reference);
# * any other field borrows the geometry of its first widget, matched by
#   the widget's /Parent reference or, failing that, by /T name.
# Borrowed positions are resolved only after all objects have been visited,
# so a parent field receives its widget's position regardless of the order
# in which the resolver yields the two objects.
#
# @return [Array<Field>] one field per distinct name (lowest object number wins)
def list_fields
  field_widgets = {}   # parent field ref => [widget geometry, ...]
  widgets_by_name = {} # decoded /T name   => [widget geometry, ...]
  # Page-number lookups walk the page tree, so resolve each page only once
  page_numbers = Hash.new { |memo, page_ref| memo[page_ref] = find_page_number_for_ref(page_ref) }
  pending = []         # [name, value, type, ref, position-or-nil]

  @resolver.each_object do |ref, body|
    next unless body

    is_widget = DictScan.is_widget?(body)
    own_position = nil

    if is_widget
      # /Rect is [x y x+width y+height]
      rect_tok = DictScan.value_token_after("/Rect", body)
      rect_values = rect_tok&.start_with?("[") ? rect_tok.scan(/[-+]?\d*\.?\d+/).map(&:to_f) : []

      if rect_values.length == 4
        x, y, x2, y2 = rect_values
        own_position = { x: x, y: y, width: x2 - x, height: y2 - y }

        page_num = nil
        if body =~ %r{/P\s+(\d+)\s+(\d+)\s+R}
          page_num = page_numbers[[Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]]
          own_position[:page] = page_num
        end
        widget_info = own_position.merge(page: page_num)

        if body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
          parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
          (field_widgets[parent_ref] ||= []) << widget_info
        end

        if body.include?("/T")
          t_tok = DictScan.value_token_after("/T", body)
          widget_name = t_tok && DictScan.decode_pdf_string(t_tok)
          (widgets_by_name[widget_name] ||= []) << widget_info if widget_name && !widget_name.empty?
        end
      end
    end

    # Field candidates: named objects that look like a field or a widget
    next unless body.include?("/T")
    next unless body.include?("/FT") || is_widget || body.include?("/Kids") || body.include?("/Parent")

    t_tok = DictScan.value_token_after("/T", body)
    next unless t_tok

    name = DictScan.decode_pdf_string(t_tok)
    next if name.nil? || name.empty? # Skip fields with empty names (deleted fields)

    v_tok = body.include?("/V") ? DictScan.value_token_after("/V", body) : nil
    value = v_tok && v_tok != "<<" ? DictScan.decode_pdf_string(v_tok) : nil
    type = body.include?("/FT") ? DictScan.value_token_after("/FT", body) : nil

    # Normalize button field values: "Yes" -> "/Yes" to match PDF name conventions
    value = "/Yes" if type == "/Btn" && value == "Yes"

    # Widgets know their own position now; everything else waits until all
    # widgets have been collected (nil marks "resolve later").
    position = is_widget ? (own_position || {}) : nil
    pending << [name, value, type, ref, position]
  end

  fields = pending.map do |name, value, type, ref, position|
    unless position
      donor = field_widgets[ref]&.first || widgets_by_name[name]&.first
      position = donor ? donor.dup : {}
    end
    Field.new(name, value, type, ref, self, position)
  end

  # Fallback: nothing found through the resolver, scan the raw dictionaries
  if fields.empty?
    stripped = DictScan.strip_stream_bodies(@raw)
    DictScan.each_dictionary(stripped) do |dict_src|
      next unless dict_src.include?("/T")

      looks_like_field = dict_src.include?("/FT") || DictScan.is_widget?(dict_src) ||
                         dict_src.include?("/Kids") || dict_src.include?("/Parent")
      next unless looks_like_field

      t_tok = DictScan.value_token_after("/T", dict_src)
      next unless t_tok

      name = DictScan.decode_pdf_string(t_tok)
      next if name.nil? || name.empty? # Skip fields with empty names (deleted fields)

      v_tok = dict_src.include?("/V") ? DictScan.value_token_after("/V", dict_src) : nil
      value = v_tok && v_tok != "<<" ? DictScan.decode_pdf_string(v_tok) : nil
      ft_tok = dict_src.include?("/FT") ? DictScan.value_token_after("/FT", dict_src) : nil
      fields << Field.new(name, value, ft_tok, [-1, 0], self)
    end
  end

  # One field per name: keep the one with the lowest object number
  fields.group_by(&:name).values.map { |same_name| same_name.min_by { |f| f.ref[0] } }
end
311
+
312
# Add a new field through Actions::AddField.
#
# @param name [String] field name
# @param options [Hash] passed to the action; :x, :y, :width, :height and
#   :page also seed the returned Field's position (defaults 100, 500, 100,
#   20 and 1)
# @return [Field, nil] the new field, or nil when the action reports failure
def add_field(name, options = {})
  action = Actions::AddField.new(self, name, options)
  return unless action.call

  fallbacks = { x: 100, y: 500, width: 100, height: 20, page: 1 }
  position = fallbacks.to_h { |key, fallback| [key, options[key] || fallback] }

  obj_num = action.field_obj_num
  type = action.field_type
  value = action.field_value

  Field.new(name, value, type, [obj_num, 0], self, position)
end
333
+
334
# Update the field called +name+ by delegating to Field#update.
#
# The field is looked up in list_fields first. When it is not there, the
# pending patches are searched for an object carrying that /T name and an
# /FT entry (a field added but not yet written), and a temporary Field is
# built for it.
#
# @param name [String] current field name
# @param new_value value handed to Field#update
# @param new_name [String, nil] optional new name handed to Field#update
# @return [false, Object] false when no such field exists, otherwise the
#   result of Field#update
def update_field(name, new_value, new_name: nil)
  target = list_fields.find { |candidate| candidate.name == name }

  unless target
    staged = @patches.find do |patch|
      patch_body = patch[:body]
      next false unless patch_body && patch_body.include?("/T")

      t_tok = DictScan.value_token_after("/T", patch_body)
      t_tok && DictScan.decode_pdf_string(t_tok) == name
    end

    if staged && staged[:body].include?("/FT")
      ft_tok = DictScan.value_token_after("/FT", staged[:body])
      # Temporary Field for the freshly added object: no value, no position
      target = Field.new(name, nil, ft_tok, staged[:ref], self, {}) if ft_tok
    end
  end

  return false unless target

  target.update(new_value, new_name: new_name)
end
367
+
368
# Remove a field by delegating to Field#remove.
#
# @param fld [Field, String] the field itself, or the name to look up in list_fields
# @return [false, Object] false when no matching field exists, otherwise
#   the result of Field#remove
def remove_field(fld)
  target = fld
  target = list_fields.find { |candidate| candidate.name == fld } unless fld.is_a?(Field)

  target ? target.remove : false
end
375
+
376
# Clean up the PDF by removing unwanted fields.
#
# Exactly one selection rule is applied, in this order of precedence:
# - block: called with each Field; return true to REMOVE the field,
#   false to keep it
# - keep_fields: Array of field names to keep (all others removed)
# - remove_fields: Array of field names to remove
# - remove_pattern: Regex pattern - fields whose name matches are removed
#
# This rewrites the entire PDF (like flatten) but leaves out the removed
# field objects and their widget annotations, and drops the references to
# them from the AcroForm /Fields array and from page /Annots arrays.
# Annotation references whose widget is missing, or whose /Parent is not a
# field object in the rewritten file, are dropped from /Annots as well.
#
# @return the current raw bytes when no rule is given, otherwise the value
#   of PDFWriter#output for the rewritten document
# @raise [RuntimeError] when the resolver cannot find a /Root reference
def clear(keep_fields: nil, remove_fields: nil, remove_pattern: nil)
  root_ref = @resolver.root_ref
  raise "Cannot clear: no /Root found" unless root_ref

  # No criteria specified, return original
  return @raw unless block_given? || keep_fields || remove_fields || remove_pattern

  all_fields = list_fields
  selected =
    if block_given?
      all_fields.select { |field| yield(field) }
    elsif keep_fields
      keep_set = Set.new(keep_fields.map(&:to_s))
      all_fields.reject { |field| keep_set.include?(field.name) }
    elsif remove_fields
      remove_set = Set.new(remove_fields.map(&:to_s))
      all_fields.select { |field| remove_set.include?(field.name) }
    else
      all_fields.select { |field| field.name =~ remove_pattern }
    end

  # Removal is decided per name
  doomed_names = Set.new(selected.map(&:name))
  doomed_fields = all_fields.select { |field| doomed_names.include?(field.name) }

  field_refs_to_remove = Set.new(doomed_fields.select(&:valid_ref?).map(&:ref))
  widget_refs_to_remove = clear_widget_refs_for(doomed_fields)

  # Collect the surviving references first, then load their bodies
  refs_to_keep = []
  @resolver.each_object do |ref, body|
    next unless body
    next if field_refs_to_remove.include?(ref) || widget_refs_to_remove.include?(ref)

    refs_to_keep << ref
  end

  objects = []
  refs_to_keep.each do |ref|
    body = @resolver.object_body(ref)
    objects << { ref: ref, body: body } if body
  end

  # ref => object entry (first occurrence wins), replacing repeated linear searches
  objects_by_ref = {}
  objects.each { |obj| objects_by_ref[obj[:ref]] ||= obj }

  clear_prune_acroform_fields(objects_by_ref, field_refs_to_remove)
  clear_prune_page_annots(objects, objects_by_ref, widget_refs_to_remove)

  # Write the cleaned PDF, objects in ascending object-number order
  objects.sort_by! { |obj| obj[:ref][0] }

  writer = PDFWriter.new
  writer.write_header
  objects.each { |obj| writer.write_object(obj[:ref], obj[:body]) }
  writer.write_xref

  trailer = @resolver.trailer_dict
  info_ref = if trailer =~ %r{/Info\s+(\d+)\s+(\d+)\s+R}
               [::Regexp.last_match(1).to_i, ::Regexp.last_match(2).to_i]
             end

  max_obj_num = objects.map { |obj| obj[:ref][0] }.max || 0
  writer.write_trailer(max_obj_num + 1, root_ref, info_ref)

  writer.output
end

# Refs of the widget annotations that belong to +doomed_fields+: a widget
# belongs to a field when its /Parent points at the field or its /T name
# equals the field name. A field's own object is never reported as its widget.
private def clear_widget_refs_for(doomed_fields)
  widget_refs = Set.new
  return widget_refs if doomed_fields.empty?

  doomed_refs = Set.new(doomed_fields.map(&:ref))
  refs_by_name = {}
  doomed_fields.each { |field| (refs_by_name[field.name] ||= []) << field.ref }

  @resolver.each_object do |widget_ref, body|
    next unless body && DictScan.is_widget?(body)

    # Match by /Parent reference
    if body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
      parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
      if parent_ref != widget_ref && doomed_refs.include?(parent_ref)
        widget_refs.add(widget_ref)
        next
      end
    end

    # Also match by field name (/T)
    next unless body.include?("/T")

    t_tok = DictScan.value_token_after("/T", body)
    next unless t_tok

    widget_name = DictScan.decode_pdf_string(t_tok)
    next unless widget_name

    owner_refs = refs_by_name[widget_name]
    widget_refs.add(widget_ref) if owner_refs&.any? { |field_ref| field_ref != widget_ref }
  end

  widget_refs
end

# Drop the removed field refs from the AcroForm /Fields array, whether the
# array is inline or a separate object. Mutates the entries of +objects_by_ref+.
private def clear_prune_acroform_fields(objects_by_ref, field_refs)
  af_ref = acroform_ref
  af_obj = af_ref && objects_by_ref[af_ref]
  return unless af_obj

  af_body = af_obj[:body]
  fields_tok = DictScan.value_token_after("/Fields", af_body)

  if fields_tok && fields_tok =~ /\A(\d+)\s+(\d+)\s+R/
    # /Fields points to a separate array object
    arr_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
    arr_obj = objects_by_ref[arr_ref]
    return unless arr_obj

    arr_body = field_refs.reduce(arr_obj[:body]) do |text, field_ref|
      DictScan.remove_ref_from_array(text, field_ref)
    end
    # Clean up empty array
    arr_obj[:body] = arr_body.strip.gsub(/\[\s+\]/, "[]")
  elsif af_body.include?("/Fields")
    # Inline /Fields array
    af_obj[:body] = field_refs.reduce(af_body) do |text, field_ref|
      DictScan.remove_ref_from_inline_array(text, "/Fields", field_ref)
    end
  end
end

# Drop removed and orphaned widget refs from every page's /Annots, inline
# or indirect. Mutates the entries of +objects+.
private def clear_prune_page_annots(objects, objects_by_ref, widget_refs)
  # The full set of surviving field objects must be known BEFORE any page is
  # inspected; otherwise a page visited ahead of its fields would see every
  # one of its widgets as orphaned.
  field_refs_in_file = Set.new
  objects.each do |obj|
    body = obj[:body]
    field_refs_in_file.add(obj[:ref]) if body&.include?("/FT") && body.include?("/T")
  end

  # true when the annotation object is gone, or it is a widget whose /Parent
  # is not a field object in the rewritten file
  orphaned = lambda do |annot_ref|
    target = objects_by_ref[annot_ref]
    next true unless target
    next false unless DictScan.is_widget?(target[:body])
    next false unless target[:body] =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}

    parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
    !field_refs_in_file.include?(parent_ref)
  end

  strip_ref = lambda do |text, ref|
    text.gsub(/\b#{ref[0]}\s+#{ref[1]}\s+R\b/, "").strip.gsub(/\s+/, " ")
  end
  refs_in = lambda do |text|
    text.scan(/(\d+)\s+(\d+)\s+R/).map { |num, gen| [Integer(num, 10), Integer(gen, 10)] }
  end

  objects.each do |obj|
    body = obj[:body]
    next unless DictScan.is_page?(body)

    if body =~ %r{/Annots\s*\[(.*?)\]}m
      # Inline /Annots array (may span several lines, hence /m)
      annots = ::Regexp.last_match(1)
      widget_refs.each { |widget_ref| annots = strip_ref.call(annots, widget_ref) }
      refs_in.call(annots).each do |annot_ref|
        annots = strip_ref.call(annots, annot_ref) if orphaned.call(annot_ref)
      end

      new_annots = annots.strip.empty? ? "[]" : "[#{annots}]"
      # Block form so the replacement text is never parsed for backreferences
      obj[:body] = body.sub(%r{/Annots\s*\[.*?\]}m) { "/Annots #{new_annots}" }
    elsif body =~ %r{/Annots\s+(\d+)\s+(\d+)\s+R}
      # Indirect /Annots array reference
      annots_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
      annots_obj = objects_by_ref[annots_ref]
      next unless annots_obj

      annots_body = widget_refs.reduce(annots_obj[:body]) do |text, widget_ref|
        DictScan.remove_ref_from_array(text, widget_ref)
      end
      refs_in.call(annots_body).each do |annot_ref|
        annots_body = DictScan.remove_ref_from_array(annots_body, annot_ref) if orphaned.call(annot_ref)
      end

      annots_obj[:body] = annots_body
    end
  end
end
628
+
629
# Run #clear with the given arguments and adopt the result in place: the
# instance switches to the cleaned bytes, gets a fresh resolver, and drops
# any pending patches.
#
# @return [Document] self
def clear!(...)
  @raw = clear(...).freeze
  @resolver.clear_cache
  @resolver = CorpPdf::ObjectResolver.new(@raw)
  @patches = []

  self
end
639
+
640
# Apply the pending patches as an incremental update and optionally flatten.
#
# Only the most recent patch per object reference is written. Afterwards the
# instance holds the new bytes, a fresh resolver, and no pending patches.
#
# @param path_out [String, nil] file to write the result to
# @param flatten [Boolean] flatten the document after applying the patches
# @return [true, String] true after writing to +path_out+, otherwise the
#   resulting bytes
def write(path_out = nil, flatten: true)
  latest_patches = @patches.reverse_each.uniq { |patch| patch[:ref] }.reverse
  @raw = CorpPdf::IncrementalWriter.new(@raw, latest_patches).render.freeze
  @patches = []
  @resolver.clear_cache
  @resolver = CorpPdf::ObjectResolver.new(@raw)

  flatten! if flatten
  return @raw unless path_out

  File.binwrite(path_out, @raw)
  true
end
658
+
659
+ private
660
+
661
# Unwrap a PDF that arrived inside multipart form data.
#
# Input that starts with a "------<boundary>" line is treated as multipart;
# the slice from the first "%PDF" through the last "%%EOF" is returned.
# Anything else, including multipart input missing either marker, is
# returned unchanged.
#
# @param bytes [String] raw file or upload contents
# @return [String] the PDF bytes
def extract_pdf_from_form_data(bytes)
  return bytes unless bytes.match?(/\A------\w+/)

  header_at = bytes.index("%PDF")
  trailer_at = header_at && bytes.rindex("%%EOF")
  return bytes unless trailer_at

  # "%%EOF" is five bytes long, so its last byte sits at trailer_at + 4
  bytes[header_at..(trailer_at + 4)]
end
682
+
683
# Walk a /Pages node depth-first and append the refs of its leaf pages to
# +page_objects+ in /Kids order, skipping refs that are already present.
#
# Intermediate nodes are recognised in both the "/Type /Pages" and the
# compact "/Type/Pages" spelling. Each /Pages node is expanded at most once,
# so a malformed tree whose /Kids point back at an ancestor cannot recurse
# forever.
#
# @param pages_ref [Array(Integer, Integer)] [obj_num, gen_num] of the node
# @param page_objects [Array] accumulator, mutated in place
# @param visited [Hash] /Pages refs already expanded (internal bookkeeping)
def collect_pages_from_tree(pages_ref, page_objects, visited = {})
  return if visited[pages_ref]

  visited[pages_ref] = true

  pages_body = @resolver.object_body(pages_ref)
  return unless pages_body
  return unless pages_body =~ %r{/Kids\s*\[(.*?)\]}m

  ::Regexp.last_match(1).scan(/(\d+)\s+(\d+)\s+R/) do |num_str, gen_str|
    kid_ref = [num_str.to_i, gen_str.to_i]
    kid_body = @resolver.object_body(kid_ref)
    next unless kid_body

    if DictScan.is_page?(kid_body)
      page_objects << kid_ref unless page_objects.include?(kid_ref)
    elsif kid_body.match?(%r{/Type\s*/Pages\b})
      # Nested /Pages node: descend into it
      collect_pages_from_tree(kid_ref, page_objects, visited)
    end
  end
end
705
+
706
# Find all page objects.
#
# The page tree reachable from the catalog's /Pages reference is walked
# first, which yields the pages in document order. When that finds nothing,
# every object that DictScan.is_page? accepts is collected instead and the
# result is sorted by object number.
#
# @return [Array<Array(Integer, Integer)>] page references as [obj_num, gen_num]
def find_all_pages
  ordered = []

  catalog_ref = @resolver.root_ref
  catalog_body = catalog_ref && @resolver.object_body(catalog_ref)
  if catalog_body && catalog_body =~ %r{/Pages\s+(\d+)\s+(\d+)\s+R}
    tree_root = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
    collect_pages_from_tree(tree_root, ordered)
  end
  return ordered unless ordered.empty?

  # Fallback: the page tree gave nothing, so scan every object
  @resolver.each_object do |ref, body|
    next unless body && DictScan.is_page?(body)

    ordered << ref unless ordered.include?(ref)
  end
  ordered.sort_by! { |ref| ref[0] }

  ordered
end
737
+
738
# Look up a page reference by its 1-based page number.
#
# @param page_num [Integer] page number, starting at 1
# @return [Array(Integer, Integer), nil] [obj_num, gen_num]; the first page
#   when +page_num+ is out of range; nil when the document has no pages
def find_page_by_number(page_num)
  pages = find_all_pages
  return nil if pages.empty?

  in_range = page_num.positive? && page_num <= pages.length
  pages[in_range ? page_num - 1 : 0]
end
748
+
749
# Translate a page reference into its 1-based page number.
#
# @param page_ref [Array(Integer, Integer)] [obj_num, gen_num] of a page
# @return [Integer, nil] position of the page in find_all_pages plus one, or
#   nil when the reference is not one of the document's pages
def find_page_number_for_ref(page_ref)
  position = find_all_pages.index(page_ref)
  position && position + 1
end
759
+
760
# Pick an object number that is not in use yet.
#
# @return [Integer] one more than the highest object number seen among the
#   resolver's objects and the pending patches (1 when there are none)
def next_fresh_object_number
  used = [0]
  @resolver.each_object { |ref, _body| used << ref[0] }
  @patches.each { |patch| used << patch[:ref][0] }

  used.max + 1
end
770
+
771
# Locate the AcroForm object through the document catalog.
#
# @return [Array(Integer, Integer), nil] [obj_num, gen_num] of the indirect
#   /AcroForm reference in the catalog; nil when there is no /Root, no
#   catalog body, or no such reference
def acroform_ref
  catalog_ref = @resolver.root_ref
  return nil unless catalog_ref

  has_acroform = @resolver.object_body(catalog_ref) =~ %r{/AcroForm\s+(\d+)\s+(\d+)\s+R}
  return nil unless has_acroform

  [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
end
781
+ end
782
+ end