acro_that 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +78 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +86 -0
- data/README.md +360 -0
- data/Rakefile +18 -0
- data/acro_that.gemspec +34 -0
- data/docs/README.md +99 -0
- data/docs/dict_scan_explained.md +341 -0
- data/docs/object_streams.md +311 -0
- data/docs/pdf_structure.md +251 -0
- data/lib/acro_that/actions/add_field.rb +278 -0
- data/lib/acro_that/actions/add_signature_appearance.rb +422 -0
- data/lib/acro_that/actions/base.rb +44 -0
- data/lib/acro_that/actions/remove_field.rb +158 -0
- data/lib/acro_that/actions/update_field.rb +301 -0
- data/lib/acro_that/dict_scan.rb +413 -0
- data/lib/acro_that/document.rb +331 -0
- data/lib/acro_that/field.rb +143 -0
- data/lib/acro_that/incremental_writer.rb +244 -0
- data/lib/acro_that/object_resolver.rb +376 -0
- data/lib/acro_that/objstm.rb +75 -0
- data/lib/acro_that/pdf_writer.rb +97 -0
- data/lib/acro_that/version.rb +5 -0
- data/lib/acro_that.rb +24 -0
- metadata +143 -0
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module AcroThat
  # Text-level helpers for locating and editing values inside PDF dictionary
  # source (`<< ... >>`) without building a full object model.
  #
  # All scanners work on plain substrings: delimiters that appear inside a
  # literal string or comment are counted like any other delimiter, except in
  # the literal-string scanner, which skips backslash-escaped characters.
  module DictScan
    module_function

    # Matches one stream body, keyword to keyword. PDF keywords are
    # case-sensitive, so the match is too; this also keeps the placeholder
    # below from being re-matched on a second pass.
    STREAM_BODY = /stream\r?\n.*?endstream/m.freeze
    STREAM_PLACEHOLDER = "stream\nENDSTREAM_STRIPPED\nendstream"

    # Byte-order mark that identifies a UTF-16BE text string.
    UTF16BE_BOM = "\xFE\xFF".b.freeze

    # Backslash escapes that decode to a single control character.
    SIMPLE_UNESCAPES = { "n" => "\n", "r" => "\r", "t" => "\t", "b" => "\b", "f" => "\f" }.freeze
    OCTAL_DIGITS = "01234567"

    # Characters that must be escaped when writing a literal string. CR is
    # escaped because a bare end-of-line in a literal string reads back as LF.
    LITERAL_ESCAPES = {
      "\\" => "\\\\", "(" => "\\(", ")" => "\\)", "\n" => "\\n", "\r" => "\\r"
    }.freeze

    # An indirect reference ("12 0 R") anchored at the start of the subject.
    INDIRECT_REF = /\A\d+\s+\d+\s+R\b/.freeze

    # --- low-level string helpers -------------------------------------------------

    # Replaces every stream body in +pdf+ with a short fixed placeholder so that
    # later dictionary scans do not run over stream data.
    #
    # @param pdf [String] PDF source text
    # @return [String] a copy with each `stream ... endstream` body replaced
    def strip_stream_bodies(pdf)
      # Block form: the replacement is inserted verbatim.
      pdf.gsub(STREAM_BODY) { STREAM_PLACEHOLDER }
    end

    # Yields each outermost `<< ... >>` span found in +str+, left to right.
    # Nested dictionaries are part of the span of their parent. Scanning stops
    # at the first `<<` that has no matching `>>`.
    #
    # @param str [String]
    # @yieldparam dict [String] dictionary source including both delimiters
    # @return [Enumerator, nil] an Enumerator when no block is given
    def each_dictionary(str)
      return enum_for(:each_dictionary, str) unless block_given?

      cursor = 0
      while (open = str.index("<<", cursor))
        close = balanced_dict_end(str, open)
        break unless close

        yield str[open...close]
        cursor = close
      end
    end

    # Decodes the backslash escapes of a PDF literal string body (the text
    # between the outer parentheses).
    #
    # * `\n \r \t \b \f` become the corresponding control character.
    # * `\ddd` (one to three octal digits) becomes one byte; values above 255
    #   keep only their low eight bits.
    # * A backslash followed by an end-of-line (CR, LF or CRLF) is a line
    #   continuation and produces nothing.
    # * A backslash before any other character yields that character.
    # * A trailing lone backslash is dropped.
    #
    # @param s [String] literal string body without the enclosing parentheses
    # @return [String]
    def unescape_literal(s)
      out = +""
      i = 0
      while i < s.length
        ch = s[i]
        i += 1
        if ch != "\\"
          out << ch
          next
        end
        break if i >= s.length # dangling backslash at end of input

        esc = s[i]
        i += 1
        if SIMPLE_UNESCAPES.key?(esc)
          out << SIMPLE_UNESCAPES[esc]
        elsif OCTAL_DIGITS.include?(esc)
          digits = +esc
          2.times do
            following = s[i]
            break unless following && OCTAL_DIGITS.include?(following)

            digits << following
            i += 1
          end
          # Mask to one byte so three-digit values above \377 cannot raise.
          out << (digits.to_i(8) & 0xFF).chr
        elsif esc == "\r"
          i += 1 if s[i] == "\n" # CRLF counts as a single end-of-line
        elsif esc != "\n"
          out << esc
        end
      end
      out
    end

    # Converts a PDF string token to a UTF-8 Ruby string.
    #
    # Literal `( ... )` and hex `< ... >` strings are decoded; a leading
    # FE FF byte-order mark selects UTF-16BE. Without the mark the bytes are
    # transcoded from binary, so bytes outside ASCII are substituted rather
    # than interpreted. Malformed UTF-16 is substituted as well instead of
    # raising. Any other token is returned stripped and otherwise unchanged.
    #
    # @param token [String, nil]
    # @return [String, nil] nil when +token+ is nil
    def decode_pdf_string(token)
      return nil unless token

      t = token.strip
      return decode_string_bytes(unescape_literal(t[1..-2])) if t.start_with?("(") && t.end_with?(")")

      if t.start_with?("<") && t.end_with?(">")
        hex = t[1..-2].gsub(/\s+/, "")
        hex << "0" if hex.length.odd? # a missing final digit is treated as 0
        return decode_string_bytes([hex].pack("H*"))
      end

      # Fallback: names, numbers, references and so on.
      t
    end

    # Turns raw string bytes into UTF-8, honouring a UTF-16BE byte-order mark.
    #
    # @api private
    # @param raw [String]
    # @return [String]
    def decode_string_bytes(raw)
      bytes = raw.b
      if bytes.start_with?(UTF16BE_BOM)
        bytes.byteslice(2, bytes.bytesize - 2)
             .force_encoding(Encoding::UTF_16BE)
             .encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      else
        bytes.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      end
    end

    # Serialises a Ruby value as a PDF token.
    #
    # * true / false -> `true` / `false`
    # * Symbol       -> name (`:Yes` -> `/Yes`)
    # * ASCII String -> literal string with `\`, `(`, `)`, LF and CR escaped
    # * other String -> hex string holding a BOM followed by UTF-16BE
    # * anything else -> `to_s`
    #
    # @param val [Object]
    # @return [String]
    def encode_pdf_string(val)
      case val
      when true then "true"
      when false then "false"
      when Symbol then "/#{val}"
      when String
        if val.ascii_only?
          "(#{val.gsub(/[\\()\n\r]/, LITERAL_ESCAPES)})"
        else
          # Concatenate as binary: a UTF-16BE string cannot be interpolated
          # into a UTF-8 literal.
          payload = UTF16BE_BOM + val.encode(Encoding::UTF_16BE).b
          "<#{payload.unpack1('H*')}>"
        end
      else
        val.to_s
      end
    end

    # Finds the first occurrence of +key+ that is followed by whitespace or by
    # one of `(`, `<`, `[`, `/`, so a key is never matched as the prefix of a
    # longer name.
    #
    # @api private
    # @return [MatchData, nil]
    def locate_key(key, dict_src)
      dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
    end

    # Index of the first non-whitespace character after +key+.
    #
    # @api private
    # @return [Integer, nil] nil when the key is absent or nothing follows it
    def value_start_after(key, dict_src)
      hit = locate_key(key, dict_src)
      hit && dict_src.index(/\S/, hit.end(0))
    end

    # Reads the token that starts at +start+.
    #
    # @api private
    # @return [String, nil] see {#value_token_after} for the token shapes
    def token_at(dict_src, start)
      return nil unless start && start < dict_src.length

      case dict_src[start]
      when "(" then balanced_token_at(dict_src, start, "(", ")", escapes: true)
      when "[" then balanced_token_at(dict_src, start, "[", "]", escapes: false)
      when "<"
        return "<<" if dict_src[start, 2] == "<<"

        close = dict_src.index(">", start)
        close && dict_src[start..close]
      when "/"
        # A name runs until whitespace, a bracket/paren/angle, or the next "/".
        stop = dict_src.index(%r{[\s<>\[\]()/]}, start + 1) || dict_src.length
        dict_src[start...stop]
      else
        dict_src[start..][%r{\A[^\s<>\[\]()/%]+}]
      end
    end

    # Returns the span from +start+ through the delimiter that balances the
    # one at +start+, or nil when it is never balanced.
    #
    # @api private
    # @param escapes [Boolean] skip the character after each backslash
    # @return [String, nil]
    def balanced_token_at(src, start, opener, closer, escapes:)
      depth = 0
      pos = start
      while pos < src.length
        ch = src[pos]
        if escapes && ch == "\\"
          pos += 2
          next
        end

        if ch == opener
          depth += 1
        elsif ch == closer
          depth -= 1
          return src[start..pos] if depth.zero?
        end
        pos += 1
      end
      nil
    end

    # Index just past the `>>` that balances the `<<` at +start+.
    #
    # @api private
    # @return [Integer, nil] nil when the dictionary is never closed
    def balanced_dict_end(str, start)
      depth = 0
      pos = start
      while (pos = str.index(/<<|>>/, pos))
        depth += str[pos] == "<" ? 1 : -1
        pos += 2
        return pos if depth.zero?
      end
      nil
    end

    # Length of the complete value that starts at +start+: a whole inline
    # dictionary, a whole indirect reference ("5 0 R"), or a single token.
    #
    # @api private
    # @return [Integer, nil] nil when the value is unterminated or unreadable
    def value_length_at(dict_src, start)
      if dict_src[start, 2] == "<<"
        close = balanced_dict_end(dict_src, start)
        return close && (close - start)
      end

      ref = dict_src[start..][INDIRECT_REF]
      return ref.length if ref

      token_at(dict_src, start)&.length
    end

    # Returns the first token of the value that follows +key+.
    #
    # * literal string -> the whole `( ... )`, nested parentheses balanced
    # * hex string     -> the whole `< ... >`
    # * array          -> the whole `[ ... ]`, nested brackets balanced
    # * name           -> `/Name`
    # * dictionary     -> only the opening marker `"<<"`
    # * anything else  -> the run up to the next whitespace or delimiter, so
    #   an indirect reference `5 0 R` yields just `"5"`
    #
    # @param key [String] name including the leading slash, e.g. "/V"
    # @param dict_src [String]
    # @return [String, nil] nil when the key is absent or the value is
    #   unterminated
    def value_token_after(key, dict_src)
      start = value_start_after(key, dict_src)
      start && token_at(dict_src, start)
    end

    # Replaces the value of +key+ with +new_token+, leaving every other byte
    # of +dict_src+ untouched. The complete existing value is replaced, whether
    # it is a single token, an indirect reference or an inline dictionary.
    # When the key is absent or its value cannot be delimited, the pair is
    # inserted via {#upsert_key_value} instead.
    #
    # @param dict_src [String] one dictionary's source (`<< ... >>`)
    # @param key [String]
    # @param new_token [String] already-serialised PDF token
    # @return [String] the edited source, or +dict_src+ itself when the edit
    #   would leave no `<<` / `>>` in the result
    def replace_key_value(dict_src, key, new_token)
      start = value_start_after(key, dict_src)
      length = start && value_length_at(dict_src, start)
      return upsert_key_value(dict_src, key, new_token) unless length

      result = "#{dict_src[0...start]}#{new_token}#{dict_src[(start + length)..]}"
      result.include?("<<") && result.include?(">>") ? result : dict_src
    end

    # Inserts `key token` immediately after the first `<<` of +dict_src+.
    # Existing entries are not inspected, so this does not replace a key that
    # is already present.
    #
    # @return [String]
    def upsert_key_value(dict_src, key, token)
      # Block form so backslashes in +token+ are inserted literally.
      dict_src.sub("<<") { "<<#{key} #{token}" }
    end

    # Picks the appearance state name for an on/off value.
    #
    # Detection is purely textual: the source must mention `/AP`, and the
    # chosen name (`/Yes` or `/Off`) must occur somewhere in it.
    #
    # @param new_value [true, false, Symbol, String]
    # @param dict_src [String]
    # @return [String, nil] "/Yes", "/Off", or nil when nothing applies
    def appearance_choice_for(new_value, dict_src)
      return nil unless dict_src.include?("/AP")

      case new_value
      when true, :Yes, "Yes"
        "/Yes" if dict_src.include?("/Yes")
      when false, :Off, "Off"
        "/Off" if dict_src.include?("/Off")
      end
    end

    # Deletes every occurrence of the reference from array source and trims
    # whitespace next to the brackets.
    #
    # @param array_body [String] array source such as "[1 0 R 2 0 R]"
    # @param ref [Array(Integer, Integer)] object number and generation
    # @return [String]
    def remove_ref_from_array(array_body, ref)
      num, gen = ref
      array_body.gsub(/\b#{num}\s+#{gen}\s+R\b/, "").gsub(/\[\s+/, "[").gsub(/\s+\]/, "]")
    end

    # Appends a reference to array source.
    #
    # @param array_body [String] array source such as "[1 0 R]"
    # @param ref [Array(Integer, Integer)] object number and generation
    # @return [String]
    def add_ref_to_array(array_body, ref)
      num, gen = ref
      ref_token = "#{num} #{gen} R"
      trimmed = array_body.strip

      return "[#{ref_token}]" if trimmed == "[]"
      return "#{array_body.rstrip.chomp(']')} #{ref_token}]" if trimmed.end_with?("]")

      # No closing bracket to insert before: just append.
      "#{array_body} #{ref_token}"
    end

    # Rewrites the inline array stored under +key+ with the block's result.
    # The array is replaced at its own position, so an identical array that
    # appears earlier under another key is left alone.
    #
    # @api private
    # @yieldparam array [String] current array source
    # @yieldreturn [String] replacement array source
    # @return [String, nil] nil when +key+ has no inline array value
    def splice_inline_array(dict_body, key)
      return nil unless dict_body.include?(key)

      start = value_start_after(key, dict_body)
      arr_tok = start && token_at(dict_body, start)
      return nil unless arr_tok&.start_with?("[")

      "#{dict_body[0...start]}#{yield(arr_tok)}#{dict_body[(start + arr_tok.length)..]}"
    end

    # Removes +ref+ from the inline array stored under +key+.
    #
    # @return [String, nil] edited dictionary source, or nil when +key+ has no
    #   inline array value
    def remove_ref_from_inline_array(dict_body, key, ref)
      splice_inline_array(dict_body, key) { |arr| remove_ref_from_array(arr, ref) }
    end

    # Appends +ref+ to the inline array stored under +key+.
    #
    # @return [String, nil] edited dictionary source, or nil when +key+ has no
    #   inline array value
    def add_ref_to_inline_array(dict_body, key, ref)
      splice_inline_array(dict_body, key) { |arr| add_ref_to_array(arr, ref) }
    end

    # Whether +body+ contains `/Subtype /Widget` (whitespace optional).
    #
    # @param body [String, nil]
    # @return [Boolean]
    def is_widget?(body)
      return false unless body

      body.match?(%r{/Subtype\s*/Widget})
    end

    # Whether the `/Ff` flags in +dict_body+ have the multiline bit set.
    # The bit tested is 0x1000, i.e. bit position 13 when counted from 1.
    #
    # @param dict_body [String, nil]
    # @return [Boolean] false when there is no readable `/Ff` entry
    def is_multiline_field?(dict_body)
      return false unless dict_body

      ff_tok = value_token_after("/Ff", dict_body)
      return false unless ff_tok

      ff_tok.to_i.anybits?(0x1000)
    end

    # Removes the `/AP` entry (key, complete value and the whitespace that
    # follows it) from dictionary source. The value may be an inline
    # dictionary, an indirect reference or any single token.
    #
    # @param dict_body [String, nil]
    # @return [String, nil] the edited source; +dict_body+ itself when there is
    #   no `/AP` entry, its value cannot be delimited, or the edit would leave
    #   no `<<` / `>>` in the result
    def remove_appearance_stream(dict_body)
      return dict_body unless dict_body&.include?("/AP")

      key_hit = locate_key("/AP", dict_body)
      return dict_body unless key_hit

      value_start = dict_body.index(/\S/, key_hit.end(0))
      length = value_start && value_length_at(dict_body, value_start)
      return dict_body unless length

      # Swallow the whitespace after the value so no gap is left behind.
      resume_at = dict_body.index(/\S/, value_start + length) || dict_body.length
      result = "#{dict_body[0...key_hit.begin(0)]}#{dict_body[resume_at..]}"
      result.include?("<<") && result.include?(">>") ? result : dict_body
    end

    # Internal helpers stay callable from the functions above but are kept out
    # of the module's public surface.
    private_class_method :balanced_dict_end, :balanced_token_at, :decode_string_bytes,
                         :locate_key, :splice_inline_array, :token_at,
                         :value_length_at, :value_start_after
  end
end
|