corp_pdf 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rubocop.yml +78 -0
- data/CHANGELOG.md +122 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +90 -0
- data/README.md +518 -0
- data/Rakefile +18 -0
- data/corp_pdf.gemspec +35 -0
- data/docs/README.md +111 -0
- data/docs/clear_fields.md +202 -0
- data/docs/dict_scan_explained.md +341 -0
- data/docs/object_streams.md +311 -0
- data/docs/pdf_structure.md +251 -0
- data/issues/README.md +59 -0
- data/issues/memory-benchmark-results.md +551 -0
- data/issues/memory-improvements.md +388 -0
- data/issues/memory-optimization-summary.md +204 -0
- data/issues/refactoring-opportunities.md +259 -0
- data/lib/corp_pdf/actions/add_field.rb +73 -0
- data/lib/corp_pdf/actions/base.rb +48 -0
- data/lib/corp_pdf/actions/remove_field.rb +154 -0
- data/lib/corp_pdf/actions/update_field.rb +663 -0
- data/lib/corp_pdf/dict_scan.rb +523 -0
- data/lib/corp_pdf/document.rb +782 -0
- data/lib/corp_pdf/field.rb +145 -0
- data/lib/corp_pdf/fields/base.rb +384 -0
- data/lib/corp_pdf/fields/checkbox.rb +164 -0
- data/lib/corp_pdf/fields/radio.rb +220 -0
- data/lib/corp_pdf/fields/signature.rb +393 -0
- data/lib/corp_pdf/fields/text.rb +31 -0
- data/lib/corp_pdf/incremental_writer.rb +245 -0
- data/lib/corp_pdf/object_resolver.rb +381 -0
- data/lib/corp_pdf/objstm.rb +75 -0
- data/lib/corp_pdf/page.rb +90 -0
- data/lib/corp_pdf/pdf_writer.rb +133 -0
- data/lib/corp_pdf/version.rb +5 -0
- data/lib/corp_pdf.rb +35 -0
- data/publish +183 -0
- metadata +169 -0
|
@@ -0,0 +1,782 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CorpPdf
|
|
4
|
+
class Document
|
|
5
|
+
attr_reader :path
|
|
6
|
+
|
|
7
|
+
# Flattens the PDF read from +input_path+ (collapsing its incremental updates).
#
# When +output_path+ is given the flattened bytes are written there and the
# path is returned. Otherwise a new document wrapping the flattened bytes
# (through a StringIO) is returned.
def self.flatten_pdf(input_path, output_path = nil)
  flattened = new(input_path).flatten
  return new(StringIO.new(flattened)) unless output_path

  File.binwrite(output_path, flattened)
  output_path
end
|
|
18
|
+
|
|
19
|
+
# Builds a document from either a filesystem path (a String) or any object
# responding to +binmode+ and +read+.
#
# +path+ is remembered only when a String was given; it is nil for IO input.
def initialize(path_or_io)
  from_path = path_or_io.is_a?(String)
  @path = from_path ? path_or_io : nil

  raw_bytes =
    if from_path
      File.binread(path_or_io)
    else
      path_or_io.binmode
      path_or_io.read
    end

  # The bytes may be wrapped in multipart form data; unwrap before parsing.
  @raw = extract_pdf_from_form_data(raw_bytes).freeze
  @resolver = CorpPdf::ObjectResolver.new(@raw)
  # Pending object patches, applied by #write.
  @patches = []
  # Radio button groups: group_id -> parent_field_ref
  @radio_groups = {}
end
|
|
34
|
+
|
|
35
|
+
# Rewrites the document as a single revision and returns the writer's output.
#
# Every object the resolver reports with a body is written once, in ascending
# object-number order, followed by a fresh xref table and a trailer that
# points at the current /Root and, when the old trailer names one, /Info.
#
# Raises RuntimeError when the resolver has no /Root reference.
def flatten
  root_ref = @resolver.root_ref
  raise "Cannot flatten: no /Root found" unless root_ref

  # Pass 1: remember only the references of objects that have a body.
  live_refs = []
  @resolver.each_object { |ref, body| live_refs << ref if body }
  highest_obj_num = live_refs.reduce(0) { |top, ref| [top, ref[0]].max }
  live_refs.sort_by! { |ref| ref[0] }

  # Pass 2: fetch each body again right before writing it.
  writer = PDFWriter.new
  writer.write_header
  live_refs.each do |ref|
    body = @resolver.object_body(ref)
    next unless body

    writer.write_object(ref, body)
  end
  writer.write_xref

  info_match = %r{/Info\s+(\d+)\s+(\d+)\s+R}.match(@resolver.trailer_dict)
  info_ref = info_match && [info_match[1].to_i, info_match[2].to_i]

  writer.write_trailer(highest_obj_num + 1, root_ref, info_ref)
  writer.output
end
|
|
76
|
+
|
|
77
|
+
# In-place variant of #flatten: replaces this document's raw bytes with the
# flattened output, rebuilds the resolver over them and discards pending
# patches. Returns self.
def flatten!
  @raw = flatten.freeze
  @resolver.clear_cache
  @resolver = CorpPdf::ObjectResolver.new(@raw)
  @patches = []

  self
end
|
|
87
|
+
|
|
88
|
+
# Returns an Array with one Page per page reference from +find_all_pages+
# whose body can be resolved (unresolvable pages are skipped).
#
# Each Page is built from (page number, width, height, ref, metadata, self):
# * page number is the 1-based position in the +find_all_pages+ list;
# * width and height are derived from /MediaBox only, nil when it is absent;
# * metadata carries :rotate, :media_box, :crop_box, :art_box, :bleed_box,
#   :trim_box, :resources_ref, :parent_ref (each nil when not found) and
#   :contents_refs (always an Array, possibly empty).
#
# References are returned as [object_number, generation] Integer pairs.
def list_pages
  pages = []

  find_all_pages.each_with_index do |ref, index|
    body = @resolver.object_body(ref)
    next unless body

    # Reads "/Key N G R" from this page body. Captures are digits only, so
    # to_i is safe and, unlike Integer(), never treats a leading zero as octal.
    indirect_ref = lambda do |key|
      found = body.match(%r{/#{key}\s+(\d+)\s+(\d+)\s+R})
      found && [found[1].to_i, found[2].to_i]
    end

    media_box = DictScan.parse_box(body, "MediaBox")
    width = nil
    height = nil
    if media_box
      width = media_box[:urx] - media_box[:llx]
      height = media_box[:ury] - media_box[:lly]
    end

    # Accept a leading minus: rotations such as -90 occur in real files.
    rotation = body[%r{/Rotate\s+(-?\d+)}, 1]

    # /Contents is either one indirect reference or an array of them. The
    # array form needs /m because writers often break the array across lines.
    contents_refs = []
    single_content = indirect_ref.call("Contents")
    if single_content
      contents_refs << single_content
    else
      listed = body[%r{/Contents\s*\[(.*?)\]}m, 1]
      listed&.scan(/(\d+)\s+(\d+)\s+R/) do |num_str, gen_str|
        contents_refs << [num_str.to_i, gen_str.to_i]
      end
    end

    metadata = {
      rotate: rotation && rotation.to_i,
      media_box: media_box,
      crop_box: DictScan.parse_box(body, "CropBox"),
      art_box: DictScan.parse_box(body, "ArtBox"),
      bleed_box: DictScan.parse_box(body, "BleedBox"),
      trim_box: DictScan.parse_box(body, "TrimBox"),
      resources_ref: indirect_ref.call("Resources"),
      parent_ref: indirect_ref.call("Parent"),
      contents_refs: contents_refs
    }

    pages << Page.new(
      index + 1, # Page number starting at 1
      width,
      height,
      ref,
      metadata,
      self # Pass document reference
    )
  end

  pages
end
|
|
169
|
+
|
|
170
|
+
# Returns an Array of Field objects built from
# (name, value, type, ref, self, position), one per distinct field name.
#
# An object counts as a field when it has a non-empty /T name and at least
# one of: /FT, being a widget annotation, /Kids or /Parent. Its value comes
# from /V (nil when absent or when /V is a dictionary) and its type is the raw
# /FT token. A "/Btn" field whose value decodes to "Yes" is reported as "/Yes".
#
# position is a Hash:
# * for a widget field: x/y/width/height from its own /Rect, plus :page when
#   the widget has a /P reference; {} when /Rect is not a 4-number array;
# * for a non-widget field: the geometry of the first widget whose /Parent is
#   this field, else of the first widget sharing its name, else {}.
#
# Widgets are indexed while scanning, but positions of non-widget fields are
# resolved only after the scan, so the result does not depend on whether a
# parent field is enumerated before or after its widgets.
#
# When the resolver yields no fields at all, the raw bytes are scanned
# dictionary-by-dictionary as a fallback; such fields get ref [-1, 0] and no
# position. Duplicate names collapse to the entry with the lowest object number.
def list_fields
  candidates = []        # field records in discovery order
  widgets_by_parent = {} # parent field ref => [widget_info, ...]
  widgets_by_name = {}   # widget /T name    => [widget_info, ...]
  page_numbers = {}      # page ref => page number (memoised, may be nil)

  @resolver.each_object do |ref, body|
    next unless body

    is_widget = DictScan.is_widget?(body)

    # Decode the /T name once; both the widget index and the field record use it.
    name = nil
    if body.include?("/T")
      t_tok = DictScan.value_token_after("/T", body)
      name = DictScan.decode_pdf_string(t_tok) if t_tok
    end
    named = !(name.nil? || name.empty?)

    widget_info = nil
    has_page_ref = false
    if is_widget
      rect_tok = DictScan.value_token_after("/Rect", body)
      # /Rect is [x y x+width y+height]
      rect = rect_tok&.start_with?("[") ? rect_tok.scan(/[-+]?\d*\.?\d+/).map(&:to_f) : []

      if rect.length == 4
        x, y, x2, y2 = rect

        page_num = nil
        page_match = body.match(%r{/P\s+(\d+)\s+(\d+)\s+R})
        if page_match
          has_page_ref = true
          page_ref = [page_match[1].to_i, page_match[2].to_i]
          # Each lookup walks the page list, so do it once per distinct page.
          page_num = page_numbers.fetch(page_ref) do
            page_numbers[page_ref] = find_page_number_for_ref(page_ref)
          end
        end

        widget_info = { x: x, y: y, width: x2 - x, height: y2 - y, page: page_num }

        parent_match = body.match(%r{/Parent\s+(\d+)\s+(\d+)\s+R})
        if parent_match
          parent_ref = [parent_match[1].to_i, parent_match[2].to_i]
          (widgets_by_parent[parent_ref] ||= []) << widget_info
        end

        (widgets_by_name[name] ||= []) << widget_info if named
      end
    end

    # Skip fields with empty names (deleted fields) and objects that merely
    # happen to contain "/T".
    next unless named
    next unless body.include?("/FT") || is_widget || body.include?("/Kids") || body.include?("/Parent")

    v_tok = body.include?("/V") ? DictScan.value_token_after("/V", body) : nil
    value = v_tok && v_tok != "<<" ? DictScan.decode_pdf_string(v_tok) : nil
    type = body.include?("/FT") ? DictScan.value_token_after("/FT", body) : nil

    # Normalize button field values: "Yes" -> "/Yes" to match PDF name conventions
    value = "/Yes" if type == "/Btn" && value == "Yes"

    # A widget field always describes itself; nil means "resolve after the scan".
    own_position = nil
    if is_widget
      own_position = {}
      if widget_info
        own_position = {
          x: widget_info[:x],
          y: widget_info[:y],
          width: widget_info[:width],
          height: widget_info[:height]
        }
        own_position[:page] = widget_info[:page] if has_page_ref
      end
    end

    candidates << { name: name, value: value, type: type, ref: ref, position: own_position }
  end

  fields = candidates.map do |entry|
    position = entry[:position]
    unless position
      widget = (widgets_by_parent[entry[:ref]] || widgets_by_name[entry[:name]] || []).first
      position = widget ? widget.dup : {}
    end

    Field.new(entry[:name], entry[:value], entry[:type], entry[:ref], self, position)
  end

  if fields.empty?
    # Fallback: scan dictionaries straight from the raw bytes.
    stripped = DictScan.strip_stream_bodies(@raw)
    DictScan.each_dictionary(stripped) do |dict_src|
      next unless dict_src.include?("/T")

      is_widget_field_fallback = DictScan.is_widget?(dict_src)
      hint = dict_src.include?("/FT") || is_widget_field_fallback ||
             dict_src.include?("/Kids") || dict_src.include?("/Parent")
      next unless hint

      t_tok = DictScan.value_token_after("/T", dict_src)
      next unless t_tok

      name = DictScan.decode_pdf_string(t_tok)
      next if name.nil? || name.empty? # Skip fields with empty names (deleted fields)

      v_tok = dict_src.include?("/V") ? DictScan.value_token_after("/V", dict_src) : nil
      value = v_tok && v_tok != "<<" ? DictScan.decode_pdf_string(v_tok) : nil
      ft_tok = dict_src.include?("/FT") ? DictScan.value_token_after("/FT", dict_src) : nil
      fields << Field.new(name, value, ft_tok, [-1, 0], self)
    end
  end

  fields.group_by(&:name).values.map { |arr| arr.min_by { |f| f.ref[0] } }
end
|
|
311
|
+
|
|
312
|
+
# Adds a new field through Actions::AddField.
#
# Returns a Field describing the new object when the action reports success,
# otherwise nil. The returned position uses the :x, :y, :width, :height and
# :page entries of +options+, defaulting to 100, 500, 100, 20 and 1.
def add_field(name, options = {})
  action = Actions::AddField.new(self, name, options)
  return unless action.call

  fallbacks = { x: 100, y: 500, width: 100, height: 20, page: 1 }
  placement = fallbacks.to_h { |key, fallback| [key, options[key] || fallback] }

  obj_num, type, value = action.field_obj_num, action.field_type, action.field_value
  Field.new(name, value, type, [obj_num, 0], self, placement)
end
|
|
333
|
+
|
|
334
|
+
# Updates the field called +name+ by delegating to Field#update with
# +new_value+ and the optional +new_name+.
#
# The field is looked up in #list_fields first. If it is not there, the
# pending patches are searched for an object whose /T decodes to +name+; when
# that patch also has an /FT token, a temporary Field (nil value, empty
# position) is built for it.
#
# Returns false when no field could be found, otherwise whatever
# Field#update returns.
def update_field(name, new_value, new_name: nil)
  target = list_fields.find { |candidate| candidate.name == name }

  unless target
    # Not written yet: maybe it was just added and still lives in @patches.
    pending = @patches.find do |patch|
      src = patch[:body]
      next unless src
      next unless src.include?("/T")

      tok = DictScan.value_token_after("/T", src)
      next unless tok

      DictScan.decode_pdf_string(tok) == name
    end

    ft_tok = nil
    if pending && pending[:body].include?("/FT")
      ft_tok = DictScan.value_token_after("/FT", pending[:body])
    end
    target = Field.new(name, nil, ft_tok, pending[:ref], self, {}) if ft_tok
  end

  return false unless target

  target.update(new_value, new_name: new_name)
end
|
|
367
|
+
|
|
368
|
+
# Removes a field, given either a Field object or a field name that is looked
# up through #list_fields.
#
# Returns false when no matching field exists, otherwise the result of
# Field#remove.
def remove_field(fld)
  target = fld
  target = list_fields.find { |candidate| candidate.name == fld } unless fld.is_a?(Field)

  target ? target.remove : false
end
|
|
375
|
+
|
|
376
|
+
# Builds a rewritten copy of the PDF with the selected form fields removed and
# returns the writer's output; this document itself is not modified.
#
# Exactly one selection criterion is applied, checked in this order:
# * block          - yielded each Field; a truthy result REMOVES that field
# * keep_fields    - Array of field names to keep (all others are removed)
# * remove_fields  - Array of field names to remove
# * remove_pattern - pattern matched with =~; matching field names are removed
# With no criterion the current raw bytes are returned unchanged.
#
# The whole file is rewritten (like #flatten). Besides dropping the field
# objects, this:
# * drops widget annotations whose /Parent is a removed field or whose /T
#   equals a removed field's name;
# * removes the field references from the AcroForm /Fields array (inline or
#   in a separate array object);
# * removes from every page's /Annots (inline or indirect) the removed
#   widgets, references to objects that are not written, and widgets whose
#   /Parent is not a field object present in the output.
#
# NOTE(review): "field object" here means a body containing both /FT and /T,
# so a widget under a parent that inherits /FT is still treated as orphaned -
# unchanged from the previous behaviour; confirm this is intended.
#
# Raises RuntimeError when the resolver has no /Root reference.
def clear(keep_fields: nil, remove_fields: nil, remove_pattern: nil)
  root_ref = @resolver.root_ref
  raise "Cannot clear: no /Root found" unless root_ref

  all_fields = list_fields

  # Decide which field names go away.
  names_to_remove = Set.new
  if block_given?
    all_fields.each { |field| names_to_remove.add(field.name) if yield(field) }
  elsif keep_fields
    keep_set = Set.new(keep_fields.map(&:to_s))
    all_fields.each { |field| names_to_remove.add(field.name) unless keep_set.include?(field.name) }
  elsif remove_fields
    remove_set = Set.new(remove_fields.map(&:to_s))
    all_fields.each { |field| names_to_remove.add(field.name) if remove_set.include?(field.name) }
  elsif remove_pattern
    all_fields.each { |field| names_to_remove.add(field.name) if field.name =~ remove_pattern }
  else
    # No criteria specified, return original
    return @raw
  end

  # Index the removed fields so the widget scan below is a single pass over
  # the objects instead of one full pass per removed field.
  field_refs_to_remove = Set.new # refs excluded from the output and from /Fields
  removed_refs = Set.new         # every removed field's ref, valid or not
  removed_refs_by_name = {}      # field name => refs of removed fields with that name
  all_fields.each do |field|
    next unless names_to_remove.include?(field.name)

    field_refs_to_remove.add(field.ref) if field.valid_ref?
    removed_refs.add(field.ref)
    (removed_refs_by_name[field.name] ||= []) << field.ref
  end

  # Find the widget annotations that belong to removed fields. An object that
  # is itself the removed field is never listed here (it is already covered by
  # field_refs_to_remove).
  widget_refs_to_remove = Set.new
  unless removed_refs.empty?
    @resolver.each_object do |widget_ref, body|
      next unless body && DictScan.is_widget?(body)

      # Match by /Parent reference
      parent_match = body.match(%r{/Parent\s+(\d+)\s+(\d+)\s+R})
      if parent_match
        widget_parent_ref = [parent_match[1].to_i, parent_match[2].to_i]
        if widget_parent_ref != widget_ref && removed_refs.include?(widget_parent_ref)
          widget_refs_to_remove.add(widget_ref)
          next
        end
      end

      # Also match by field name (/T)
      next unless body.include?("/T")

      t_tok = DictScan.value_token_after("/T", body)
      next unless t_tok

      widget_name = DictScan.decode_pdf_string(t_tok)
      owners = widget_name && removed_refs_by_name[widget_name]
      widget_refs_to_remove.add(widget_ref) if owners&.any? { |owner_ref| owner_ref != widget_ref }
    end
  end

  # Collect the refs that survive, then load their bodies.
  refs_to_keep = []
  @resolver.each_object do |ref, body|
    next if field_refs_to_remove.include?(ref)
    next if widget_refs_to_remove.include?(ref)
    next unless body

    refs_to_keep << ref
  end

  objects = refs_to_keep.filter_map do |ref|
    body = @resolver.object_body(ref)
    { ref: ref, body: body } if body
  end

  # ref => object record; the first record wins, matching Array#find.
  objects_by_ref = {}
  objects.each { |obj| objects_by_ref[obj[:ref]] ||= obj }

  # Remove the field references from the AcroForm /Fields array.
  af_ref = acroform_ref
  af_obj = af_ref && objects_by_ref[af_ref]
  if af_obj
    af_body = af_obj[:body]
    fields_tok = DictScan.value_token_after("/Fields", af_body)
    fields_ref_match = fields_tok && fields_tok.match(/\A(\d+)\s+(\d+)\s+R/)

    if fields_ref_match
      # /Fields points to a separate array object
      arr_obj = objects_by_ref[[fields_ref_match[1].to_i, fields_ref_match[2].to_i]]
      if arr_obj
        arr_body = arr_obj[:body]
        field_refs_to_remove.each do |field_ref|
          arr_body = DictScan.remove_ref_from_array(arr_body, field_ref)
        end
        # Clean up empty array
        arr_obj[:body] = arr_body.strip.gsub(/\[\s+\]/, "[]")
      end
    elsif af_body.include?("/Fields")
      # Inline /Fields array
      field_refs_to_remove.each do |field_ref|
        af_body = DictScan.remove_ref_from_inline_array(af_body, "/Fields", field_ref)
      end
      af_obj[:body] = af_body
    end
  end

  # Every surviving field object must be known BEFORE any page is examined;
  # otherwise widgets of kept fields that are enumerated after their page
  # would be mistaken for orphans and stripped from /Annots.
  field_refs_in_file = Set.new
  objects.each do |obj|
    body = obj[:body]
    field_refs_in_file.add(obj[:ref]) if body.include?("/FT") && body.include?("/T")
  end

  # True when an annotation reference must leave /Annots: its object is not
  # written, or it is a widget whose /Parent is not a surviving field object.
  orphaned = lambda do |annot_ref|
    target = objects_by_ref[annot_ref]
    next true unless target
    next false unless DictScan.is_widget?(target[:body])

    parent = target[:body].match(%r{/Parent\s+(\d+)\s+(\d+)\s+R})
    next false unless parent

    !field_refs_in_file.include?([parent[1].to_i, parent[2].to_i])
  end

  # Removes "N G R" from the text of an inline array and normalises spacing.
  strip_ref = lambda do |array_src, ref|
    array_src.gsub(/\b#{ref[0]}\s+#{ref[1]}\s+R\b/, "").strip.gsub(/\s+/, " ")
  end

  # Prune widget references from each page's /Annots.
  objects.each do |obj|
    body = obj[:body]
    next unless DictScan.is_page?(body)

    # /m: inline /Annots arrays are frequently wrapped over several lines.
    inline_match = body.match(%r{/Annots\s*\[(.*?)\]}m)
    if inline_match
      annots_array_str = inline_match[1]

      widget_refs_to_remove.each do |widget_ref|
        annots_array_str = strip_ref.call(annots_array_str, widget_ref)
      end

      annots_refs = annots_array_str.scan(/(\d+)\s+(\d+)\s+R/).map { |n, g| [n.to_i, g.to_i] }
      annots_refs.each do |annot_ref|
        annots_array_str = strip_ref.call(annots_array_str, annot_ref) if orphaned.call(annot_ref)
      end

      new_annots = annots_array_str.strip.empty? ? "[]" : "[#{annots_array_str}]"
      obj[:body] = body.sub(%r{/Annots\s*\[.*?\]}m) { "/Annots #{new_annots}" }
      next
    end

    # Indirect /Annots array reference
    indirect_match = body.match(%r{/Annots\s+(\d+)\s+(\d+)\s+R})
    next unless indirect_match

    annots_obj = objects_by_ref[[indirect_match[1].to_i, indirect_match[2].to_i]]
    next unless annots_obj

    annots_body = annots_obj[:body]
    widget_refs_to_remove.each do |widget_ref|
      annots_body = DictScan.remove_ref_from_array(annots_body, widget_ref)
    end

    annots_refs = annots_body.scan(/(\d+)\s+(\d+)\s+R/).map { |n, g| [n.to_i, g.to_i] }
    annots_refs.each do |annot_ref|
      annots_body = DictScan.remove_ref_from_array(annots_body, annot_ref) if orphaned.call(annot_ref)
    end

    annots_obj[:body] = annots_body
  end

  # Write the cleaned PDF, objects in ascending object-number order.
  objects.sort_by! { |obj| obj[:ref][0] }

  writer = PDFWriter.new
  writer.write_header
  objects.each { |obj| writer.write_object(obj[:ref], obj[:body]) }
  writer.write_xref

  info_match = %r{/Info\s+(\d+)\s+(\d+)\s+R}.match(@resolver.trailer_dict)
  info_ref = info_match && [info_match[1].to_i, info_match[2].to_i]

  max_obj_num = objects.map { |obj| obj[:ref][0] }.max || 0
  writer.write_trailer(max_obj_num + 1, root_ref, info_ref)

  writer.output
end
|
|
628
|
+
|
|
629
|
+
# In-place variant of #clear: forwards every argument (and block) to #clear,
# adopts the returned bytes as this document's raw content, rebuilds the
# resolver over them and discards pending patches. Returns self.
def clear!(...)
  @raw = clear(...).freeze
  @resolver.clear_cache
  @resolver = CorpPdf::ObjectResolver.new(@raw)
  @patches = []

  self
end
|
|
639
|
+
|
|
640
|
+
# Applies the pending patches as an incremental update and optionally
# flattens the result.
#
# Only the last patch recorded for each ref is rendered. Afterwards the raw
# bytes and resolver are replaced and the patch list is emptied; when
# +flatten+ is truthy, #flatten! runs on the updated document.
#
# Returns true after writing the bytes to +path_out+, or the (frozen) bytes
# themselves when no path is given.
def write(path_out = nil, flatten: true)
  latest_patches = @patches.reverse.uniq { |patch| patch[:ref] }.reverse

  @raw = CorpPdf::IncrementalWriter.new(@raw, latest_patches).render.freeze
  @patches = []
  @resolver.clear_cache
  @resolver = CorpPdf::ObjectResolver.new(@raw)

  # `flatten` here is the keyword argument, not the #flatten method.
  flatten! if flatten

  return @raw unless path_out

  File.binwrite(path_out, @raw)
  true
end
|
|
658
|
+
|
|
659
|
+
private
|
|
660
|
+
|
|
661
|
+
# Unwraps a PDF that was captured together with its multipart form-data
# envelope.
#
# Input that does not start with a multipart boundary line ("------" followed
# by word characters) is returned as-is. Otherwise the span from the first
# "%PDF" through the last "%%EOF" (inclusive) is returned. If either marker is
# missing, or the last "%%EOF" lies before the "%PDF" header (so no usable
# span exists), the input is returned unchanged rather than an empty string.
def extract_pdf_from_form_data(bytes)
  return bytes unless bytes =~ /\A------\w+/

  pdf_start = bytes.index("%PDF")
  return bytes unless pdf_start

  eof_marker = "%%EOF"
  pdf_end = bytes.rindex(eof_marker)
  # An end marker ahead of the header does not terminate this PDF.
  return bytes unless pdf_end && pdf_end > pdf_start

  bytes[pdf_start...(pdf_end + eof_marker.length)]
end
|
|
682
|
+
|
|
683
|
+
# Walks the page tree rooted at +pages_ref+ depth-first, following /Kids in
# array order, and appends every page reference not already present to
# +page_objects+ (which is mutated in place).
#
# A kid is treated as a page when DictScan.is_page? says so, and as an
# intermediate node when its body declares /Type /Pages - with or without
# whitespace between the two names.
#
# +visited+ records the /Pages nodes already expanded so that a malformed
# file whose /Kids form a cycle cannot recurse without bound. Callers normally
# omit it. The return value is not meaningful.
def collect_pages_from_tree(pages_ref, page_objects, visited = {})
  return if visited[pages_ref]

  visited[pages_ref] = true

  pages_body = @resolver.object_body(pages_ref)
  return unless pages_body

  kids_array = pages_body[%r{/Kids\s*\[(.*?)\]}m, 1]
  return unless kids_array

  # Extract all object references from the Kids array, in order.
  kids_array.scan(/(\d+)\s+(\d+)\s+R/) do |num_str, gen_str|
    kid_ref = [num_str.to_i, gen_str.to_i]
    kid_body = @resolver.object_body(kid_ref)
    next unless kid_body

    if DictScan.is_page?(kid_body)
      page_objects << kid_ref unless page_objects.include?(kid_ref)
    elsif kid_body.match?(%r{/Type\s*/Pages\b})
      # Intermediate node: descend into it.
      collect_pages_from_tree(kid_ref, page_objects, visited)
    end
  end
end
|
|
705
|
+
|
|
706
|
+
# Returns every page reference as [obj_num, gen_num], in document order.
#
# The order is taken from the page tree reachable through the catalog's /Pages
# entry. When that yields nothing, every object that DictScan.is_page? accepts
# is collected instead and the list is ordered by object number.
def find_all_pages
  ordered = []

  root_ref = @resolver.root_ref
  catalog_body = root_ref && @resolver.object_body(root_ref)
  tree_root = catalog_body && %r{/Pages\s+(\d+)\s+(\d+)\s+R}.match(catalog_body)
  collect_pages_from_tree([Integer(tree_root[1]), Integer(tree_root[2])], ordered) if tree_root

  return ordered unless ordered.empty?

  # Fallback: the page tree gave us nothing, so scan all objects.
  @resolver.each_object do |ref, body|
    next unless body && DictScan.is_page?(body)

    ordered << ref unless ordered.include?(ref)
  end
  ordered.sort_by! { |ref| ref[0] }

  ordered
end
|
|
737
|
+
|
|
738
|
+
# Looks up a page reference by its 1-indexed page number.
#
# Returns [obj_num, gen_num]; an out-of-range number falls back to the first
# page. Returns nil only when the document has no pages at all.
def find_page_by_number(page_num)
  pages = find_all_pages
  return nil if pages.empty?

  in_range = page_num.positive? && page_num <= pages.length
  pages[in_range ? page_num - 1 : 0]
end
|
|
748
|
+
|
|
749
|
+
# Returns the 1-indexed page number of +page_ref+ within #find_all_pages, or
# nil when the reference is not one of the document's pages.
def find_page_number_for_ref(page_ref)
  position = find_all_pages.index(page_ref)
  position && (position + 1)
end
|
|
759
|
+
|
|
760
|
+
# Returns an object number one above the highest number used by any object the
# resolver enumerates or by any pending patch (1 when there are none).
def next_fresh_object_number
  used = [0]
  @resolver.each_object { |ref, _body| used << ref[0] }
  @patches.each { |patch| used << patch[:ref][0] }

  used.max + 1
end
|
|
770
|
+
|
|
771
|
+
# Returns the catalog's /AcroForm entry as [obj_num, gen_num].
#
# Returns nil when there is no /Root, or when the catalog body does not hold
# /AcroForm as an indirect reference.
def acroform_ref
  root_ref = @resolver.root_ref
  return nil unless root_ref

  found = %r{/AcroForm\s+(\d+)\s+(\d+)\s+R}.match(@resolver.object_body(root_ref))
  found && found.captures.map { |digits| Integer(digits) }
end
|
|
781
|
+
end
|
|
782
|
+
end
|