acro_that 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,413 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AcroThat
4
+ module DictScan
5
+ module_function
6
+
7
+ # --- low-level string helpers -------------------------------------------------
8
+
9
# Returns a copy of +pdf+ in which every `stream ... endstream` section has
# its body replaced by a fixed placeholder. The match is case-insensitive,
# spans newlines, and is non-greedy, so each stream is replaced on its own.
def strip_stream_bodies(pdf)
  stream_section = /stream\r?\n.*?endstream/mi
  # The placeholder contains no backslashes, so passing it as a plain
  # replacement string cannot trigger back-reference expansion.
  placeholder = "stream\nENDSTREAM_STRIPPED\nendstream"
  pdf.gsub(stream_section, placeholder)
end
12
+
13
# Yields the source text of every top-level `<< ... >>` dictionary found in
# +str+, in order. Nested dictionaries are part of the enclosing dictionary's
# text and are not yielded separately.
#
# Hex strings (`<...>`) and literal strings (`(...)`) inside a dictionary are
# skipped as opaque tokens, so that
#   * a hex string directly before the closing delimiter (`<AB>>>`) does not
#     steal the first `>` of `>>`, and
#   * `<<` / `>>` appearing inside a literal string do not change the
#     nesting depth.
# A `<` or `(` that does not start a well-formed string is treated as an
# ordinary character.
#
# Scanning stops at the first dictionary that is never closed.
#
# Returns an Enumerator when no block is given; otherwise returns nil.
def each_dictionary(str)
  return enum_for(:each_dictionary, str) unless block_given?

  hex_body_chars = "0123456789abcdefABCDEF \t\r\n\f\0"
  len = str.length
  cursor = 0
  while (open = str.index("<<", cursor))
    depth = 0
    pos = open
    found = nil
    while pos < len
      pair = str[pos, 2]
      if pair == "<<"
        depth += 1
        pos += 2
      elsif pair == ">>"
        depth -= 1
        pos += 2
        if depth.zero?
          found = str[open...pos]
          break
        end
      elsif pair.start_with?("<")
        # Candidate hex string: only skip it when everything up to the next
        # '>' is hex digits or whitespace.
        scan = pos + 1
        scan += 1 while scan < len && hex_body_chars.include?(str[scan])
        pos = scan < len && str[scan] == ">" ? scan + 1 : pos + 1
      elsif pair.start_with?("(")
        # Candidate literal string: find the matching ')' honouring
        # backslash escapes and nested parentheses.
        nesting = 0
        scan = pos
        close = nil
        while scan < len
          ch = str[scan]
          if ch == "\\"
            scan += 2
            next
          end
          nesting += 1 if ch == "("
          if ch == ")"
            nesting -= 1
            if nesting.zero?
              close = scan
              break
            end
          end
          scan += 1
        end
        pos = close ? close + 1 : pos + 1
      else
        pos += 1
      end
    end
    break unless found

    yield found
    cursor = pos
  end
end
40
+
41
# Resolves the backslash escapes of a PDF literal string body (the text
# between the outer parentheses) and returns the resulting string.
#
# Handled escapes:
#   * \n \r \t \b \f          -> the corresponding control character
#   * \( \) \\                -> the character itself
#   * \d, \dd, \ddd (octal)   -> one byte; only digits 0-7 are consumed and
#                                the value is masked to 8 bits, so an
#                                overflowing escape such as \777 no longer
#                                raises RangeError
#   * backslash + end-of-line -> nothing (line continuation); \r\n, \r and
#                                \n are all recognised
#   * any other \x            -> x (the backslash is dropped)
# A single trailing backslash is dropped.
#
# Note: a byte >= 0x80 produced by an octal escape is appended as a binary
# character, so the result's encoding may become ASCII-8BIT.
def unescape_literal(s)
  simple = { "n" => "\n", "r" => "\r", "t" => "\t", "b" => "\b", "f" => "\f" }
  # Plain string comparison instead of a regexp so that oddly encoded input
  # characters are simply "not a digit".
  octal = ->(c) { !c.nil? && c >= "0" && c <= "7" }

  out = +""
  len = s.length
  i = 0
  while i < len
    ch = s[i]
    if ch != "\\"
      out << ch
      i += 1
      next
    end

    i += 1
    break if i >= len # dangling backslash at the very end

    esc = s[i]
    if simple.key?(esc)
      out << simple[esc]
    elsif octal.call(esc)
      digits = esc.dup
      while digits.length < 3 && octal.call(s[i + 1])
        i += 1
        digits << s[i]
      end
      out << (digits.to_i(8) & 0xFF).chr
    elsif esc == "\r"
      # Line continuation; swallow the "\n" of a CRLF pair as well.
      i += 1 if s[i + 1] == "\n"
    elsif esc != "\n"
      out << esc
    end
    i += 1
  end
  out
end
79
+
80
# Decodes a PDF string token into a UTF-8 Ruby string.
#
# * nil                     -> nil
# * literal string `(...)`  -> escapes resolved via unescape_literal
# * hex string `<...>`      -> whitespace ignored; an odd digit count is
#                              padded with a trailing "0"
# * anything else (names, numbers, references, `<< ... >>` dictionaries)
#                           -> the stripped token, unchanged
#
# Decoded bytes that start with the UTF-16BE byte order mark FE FF are
# transcoded from UTF-16BE; otherwise they are transcoded from binary, which
# keeps ASCII and substitutes the replacement character for every other
# byte. Both paths replace malformed input instead of raising.
def decode_pdf_string(token)
  return nil unless token

  t = token.strip

  raw =
    if t.start_with?("(") && t.end_with?(")")
      unescape_literal(t[1..-2])
    elsif t.start_with?("<") && t.end_with?(">") && !t.start_with?("<<")
      # `<<` opens a dictionary, not a hex string, so it is excluded here.
      hex = t[1..-2].gsub(/\s+/, "")
      hex << "0" if hex.length.odd?
      [hex].pack("H*")
    end
  return t unless raw

  bytes = raw.b
  if bytes.bytesize >= 2 && bytes.getbyte(0) == 0xFE && bytes.getbyte(1) == 0xFF
    bytes.byteslice(2, bytes.bytesize - 2)
         .force_encoding("UTF-16BE")
         .encode("UTF-8", invalid: :replace, undef: :replace)
  else
    bytes.encode("UTF-8", invalid: :replace, undef: :replace)
  end
end
113
+
114
# Serialises a Ruby value as a PDF token.
#
# * true / false     -> "true" / "false"
# * Symbol           -> a name token, "/<symbol>"
# * ASCII-only String -> a literal string `(...)` with backslash, both
#                        parentheses, "\n" and "\r" escaped
# * other String     -> a hex string holding the UTF-16BE bytes preceded by
#                       the FE FF byte order mark
# * anything else    -> val.to_s
#
# The hex form is assembled from binary strings: interpolating a UTF-16BE
# string into a UTF-8 literal raises Encoding::CompatibilityError.
def encode_pdf_string(val)
  case val
  when true then "true"
  when false then "false"
  when Symbol then "/#{val}"
  when String
    if val.ascii_only?
      escapes = {
        "\\" => "\\\\",
        "(" => "\\(",
        ")" => "\\)",
        "\n" => "\\n",
        "\r" => "\\r"
      }
      # A Hash replacement is inserted verbatim (no back-reference parsing).
      "(#{val.gsub(/[\\()\n\r]/, escapes)})"
    else
      bytes = "\xFE\xFF".b + val.encode("UTF-16BE").b
      "<#{bytes.unpack1('H*')}>"
    end
  else
    val.to_s
  end
end
132
+
133
# Returns the source text of the value token that follows the first
# occurrence of +key+ in +dict_src+, or nil when the key is absent or the
# value cannot be delimited.
#
# +key+ only matches when immediately followed by whitespace, "(", "<", "["
# or "/", so "/F" does not match inside "/Ff".
#
# Token shapes returned:
#   * literal string  "(...)"  - nested parentheses and backslash escapes
#                                are honoured
#   * inline dictionary        - only the opener "<<" is returned
#   * hex string      "<...>"
#   * array           "[...]"  - nested brackets are honoured
#   * name            "/Name"
#   * indirect reference "N G R" - returned whole, exactly as written
#   * any other atom (number, true, null, ...)
def value_token_after(key, dict_src)
  match = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  return nil unless match

  len = dict_src.length
  i = match.end(0)
  i += 1 while i < len && dict_src[i] =~ /\s/
  return nil if i >= len

  case dict_src[i]
  when "("
    depth = 0
    j = i
    while j < len
      ch = dict_src[j]
      if ch == "\\"
        j += 2 # skip the escaped character as well
        next
      end
      depth += 1 if ch == "("
      if ch == ")"
        depth -= 1
        return dict_src[i..j] if depth.zero?
      end
      j += 1
    end
    nil
  when "<"
    return "<<" if dict_src[i, 2] == "<<"

    j = dict_src.index(">", i)
    j ? dict_src[i..j] : nil
  when "["
    depth = 0
    j = i
    while j < len
      ch = dict_src[j]
      if ch == "["
        depth += 1
      elsif ch == "]"
        depth -= 1
        return dict_src[i..j] if depth.zero?
      end
      j += 1
    end
    nil
  when "/"
    # A name runs until whitespace, a delimiter, or the next "/".
    j = i + 1
    j += 1 while j < len && dict_src[j] !~ %r{[\s<>\[\]()/]}
    dict_src[i...j]
  else
    rest = dict_src[i..]
    # Prefer a complete indirect reference over its first number so that
    # callers splicing by token length cover the whole value.
    rest[%r{\A\d+\s+\d+\s+R(?![^\s<>\[\]()/%])}] || rest[%r{\A[^\s<>\[\]()/%]+}]
  end
end
206
+
207
# Returns a copy of the dictionary source +dict_src+ (`<< ... >>`) in which
# the value of +key+ is replaced by +new_token+. Everything outside the
# replaced value is preserved exactly.
#
# Falls back to upsert_key_value (insert right after the opening `<<`) when
# the key is not present, has no recognisable value token, or the token
# cannot be located right after the key.
#
# When the existing value is an inline dictionary, value_token_after reports
# only its opener `<<`; the whole balanced `<< ... >>` span is replaced here
# so that the result stays well formed. If that inline dictionary is never
# closed, +dict_src+ is returned unchanged. (The balance scan is naive: it
# does not account for `<<` / `>>` inside strings.)
#
# +dict_src+ is also returned unchanged if the result would no longer
# contain both `<<` and `>>`.
def replace_key_value(dict_src, key, new_token)
  key_match = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  return upsert_key_value(dict_src, key, new_token) unless key_match

  tok = value_token_after(key, dict_src)
  return upsert_key_value(dict_src, key, new_token) unless tok

  len = dict_src.length

  # The value starts at the first non-whitespace character after the key.
  value_start = key_match.end(0)
  value_start += 1 while value_start < len && dict_src[value_start] =~ /\s/

  # Sanity check: the token must sit exactly at that position.
  unless value_start < len && dict_src[value_start, tok.length] == tok
    return upsert_key_value(dict_src, key, new_token)
  end

  value_end = value_start + tok.length
  if tok == "<<"
    value_end = nil
    depth = 0
    pos = value_start
    while pos < len
      pair = dict_src[pos, 2]
      if pair == "<<"
        depth += 1
        pos += 2
      elsif pair == ">>"
        depth -= 1
        pos += 2
        if depth.zero?
          value_end = pos
          break
        end
      else
        pos += 1
      end
    end
    return dict_src unless value_end
  end

  result = "#{dict_src[0...value_start]}#{new_token}#{dict_src[value_end..]}"

  # Guard against handing back something that is no longer a dictionary.
  return dict_src unless result.include?("<<") && result.include?(">>")

  result
end
247
+
248
# Returns a copy of +dict_src+ with "<key> <token>" inserted directly after
# the first `<<`. No check is made for an existing +key+ entry. When
# +dict_src+ contains no `<<`, an unmodified copy is returned.
def upsert_key_value(dict_src, key, token)
  opener_with_entry = "<<#{key} #{token}"
  # Block form: the replacement text is inserted verbatim.
  dict_src.sub("<<") { opener_with_entry }
end
252
+
253
# Picks the appearance state name for a checkbox-like value.
#
# Returns "/Yes" for true, :Yes or "Yes" and "/Off" for false, :Off or
# "Off", but only when +dict_src+ contains "/AP" and also contains that
# state name somewhere in its text (a plain substring check). Returns nil
# in every other case.
def appearance_choice_for(new_value, dict_src)
  return nil unless dict_src.include?("/AP")

  state =
    if [true, :Yes, "Yes"].include?(new_value)
      "/Yes"
    elsif [false, :Off, "Off"].include?(new_value)
      "/Off"
    end

  state if state && dict_src.include?(state)
end
265
+
266
# Returns a copy of the array source +array_body+ with every occurrence of
# the indirect reference +ref+ (a [number, generation] pair) removed.
# Whitespace directly inside the brackets is trimmed afterwards; whitespace
# between the remaining elements is left as it was.
def remove_ref_from_array(array_body, ref)
  num, gen = ref
  reference = /\b#{num}\s+#{gen}\s+R\b/

  without_ref = array_body.gsub(reference, "")
  left_trimmed = without_ref.gsub(/\[\s+/, "[")
  left_trimmed.gsub(/\s+\]/, "]")
end
270
+
271
# Returns the array source +array_body+ with the indirect reference +ref+
# (a [number, generation] pair) appended as "N G R".
#
# * an empty array "[]" (ignoring surrounding whitespace) becomes "[N G R]"
# * text ending in "]" gets " N G R" inserted before that final bracket;
#   trailing whitespace after the bracket is dropped
# * any other text simply gets " N G R" appended
def add_ref_to_array(array_body, ref)
  num, gen = ref
  ref_token = "#{num} #{gen} R"
  trimmed = array_body.strip

  return "[#{ref_token}]" if trimmed == "[]"
  return "#{array_body} #{ref_token}" unless trimmed.end_with?("]")

  head = array_body.rstrip.chomp("]")
  "#{head} #{ref_token}]"
end
291
+
292
# Returns a copy of +dict_body+ in which the indirect reference +ref+ has
# been removed from the inline array stored under +key+. Returns nil when
# +key+ does not occur in +dict_body+ or its value is not an inline array.
#
# The array is replaced at its own position (right after the key) rather
# than by a text search, so a textually identical array stored under a
# different key earlier in the dictionary is left alone.
def remove_ref_from_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  new_arr_tok = remove_ref_from_array(arr_tok, ref)

  # Only whitespace separates the key from its array, so the first "[" after
  # the key is where the token starts.
  key_match = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  start = key_match && dict_body.index("[", key_match.end(0))
  unless start && dict_body[start, arr_tok.length] == arr_tok
    # Could not pin the position down; fall back to a first-match replace.
    return dict_body.sub(arr_tok) { new_arr_tok }
  end

  updated = dict_body.dup
  updated[start, arr_tok.length] = new_arr_tok
  updated
end
301
+
302
# Returns a copy of +dict_body+ in which the indirect reference +ref+ has
# been appended to the inline array stored under +key+. Returns nil when
# +key+ does not occur in +dict_body+ or its value is not an inline array.
#
# The array is replaced at its own position (right after the key) rather
# than by a text search, so a textually identical array stored under a
# different key earlier in the dictionary is left alone.
def add_ref_to_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  new_arr_tok = add_ref_to_array(arr_tok, ref)

  # Only whitespace separates the key from its array, so the first "[" after
  # the key is where the token starts.
  key_match = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  start = key_match && dict_body.index("[", key_match.end(0))
  unless start && dict_body[start, arr_tok.length] == arr_tok
    # Could not pin the position down; fall back to a first-match replace.
    return dict_body.sub(arr_tok) { new_arr_tok }
  end

  updated = dict_body.dup
  updated[start, arr_tok.length] = new_arr_tok
  updated
end
312
+
313
# Tells whether the object source +body+ declares `/Subtype /Widget`
# (optional whitespace between the two names). Always returns true or
# false; a nil/false +body+ yields false.
def is_widget?(body)
  return false unless body

  # The substring checks are a cheap pre-filter that skips the regexp for
  # bodies that cannot match.
  body.include?("/Subtype") && body.include?("/Widget") && body.match?(%r{/Subtype\s*/Widget})
end
318
+
319
# Tells whether the field dictionary +dict_body+ has the 0x1000 bit set in
# its /Ff value. The /Ff token is converted with to_i, so a non-numeric
# token counts as 0. Returns false when +dict_body+ is nil/false or has no
# /Ff entry.
def is_multiline_field?(dict_body)
  ff_tok = dict_body && value_token_after("/Ff", dict_body)
  return false unless ff_tok

  multiline_bit = 0x1000
  (ff_tok.to_i & multiline_bit) != 0
end
330
+
331
# Returns a copy of +dict_body+ with its first /AP entry (key, value and the
# whitespace following the value) removed.
#
# Supported value shapes:
#   * inline dictionary `<< ... >>` - removed up to the balancing `>>`
#   * indirect reference `N G R`    - removed as a whole
#   * anything else                 - the token reported by value_token_after
#
# +dict_body+ is returned unchanged when it is nil, has no /AP entry, the
# value cannot be delimited (for example an inline dictionary that is never
# closed), or the result would no longer contain both `<<` and `>>`.
def remove_appearance_stream(dict_body)
  return dict_body unless dict_body&.include?("/AP")

  ap_match = dict_body.match(%r{/AP(?=[\s(<\[/])})
  return dict_body unless ap_match

  len = dict_body.length
  value_start = ap_match.end(0)
  value_start += 1 while value_start < len && dict_body[value_start] =~ /\s/
  return dict_body if value_start >= len

  value_end =
    if dict_body[value_start, 2] == "<<"
      # Walk to the `>>` that balances the opening `<<`.
      depth = 0
      pos = value_start
      close = nil
      while pos < len
        pair = dict_body[pos, 2]
        if pair == "<<"
          depth += 1
          pos += 2
        elsif pair == ">>"
          depth -= 1
          pos += 2
          if depth.zero?
            close = pos
            break
          end
        else
          pos += 1
        end
      end
      close
    elsif (ref = dict_body[value_start..][%r{\A\d+\s+\d+\s+R(?![^\s<>\[\]()/%])}])
      # Take the complete reference, not just its object number.
      value_start + ref.length
    else
      ap_tok = value_token_after("/AP", dict_body)
      ap_tok && value_start + ap_tok.length
    end
  return dict_body unless value_end

  # Swallow the whitespace after the value so no gap is left behind.
  value_end += 1 while value_end < len && dict_body[value_end] =~ /\s/

  result = "#{dict_body[0...ap_match.begin(0)]}#{dict_body[value_end..]}"

  # Guard against handing back something that is no longer a dictionary.
  return dict_body unless result.include?("<<") && result.include?(">>")

  result
end
412
+ end
413
+ end