corp_pdf 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,523 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CorpPdf
4
+ module DictScan
5
+ module_function
6
+
7
+ # Configure I18n for transliteration (disable locale enforcement)
8
+ I18n.config.enforce_available_locales = false
9
+
10
# Produce an ASCII approximation of +str+.
# Example given by the original author: "María Valentina" -> "Maria Valentina".
#
# Anything that is not a String is handed back unchanged. Strings are first
# transcoded to UTF-8 (replacing invalid/undefined sequences) and then passed to
# I18n.transliterate with the :en locale and an empty replacement string.
# If that call raises any StandardError, the UTF-8 text is instead transcoded
# directly to ASCII with the encoder's replacement character.
#
# @param str [Object] value to transliterate
# @return [Object] the transliterated String, or +str+ itself when it is not a String
def transliterate_to_ascii(str)
  if str.is_a?(String)
    text = str.encode("UTF-8", invalid: :replace, undef: :replace)
    begin
      I18n.transliterate(text, locale: :en, replacement: "")
    rescue StandardError
      # Best-effort fallback when transliteration is unavailable or fails.
      text.encode("ASCII", invalid: :replace, undef: :replace)
    end
  else
    str
  end
end
26
+
27
+ # --- low-level string helpers -------------------------------------------------
28
+
29
# Return a copy of +pdf+ in which every span from "stream" + newline up to the
# nearest "endstream" is replaced by a fixed placeholder stream.
# The match is case-insensitive and may span multiple lines.
#
# @param pdf [String] PDF source text
# @return [String] the text with stream bodies replaced
def strip_stream_bodies(pdf)
  placeholder = "stream\nENDSTREAM_STRIPPED\nendstream"
  pdf.gsub(/stream\r?\n.*?endstream/mi, placeholder)
end
32
+
33
# Yield each outermost `<< ... >>` dictionary found in +str+, in order.
#
# Nesting is tracked by counting `<<` and `>>` markers, so a yielded string
# contains any dictionaries nested inside it; those are not yielded again because
# scanning resumes right after the enclosing dictionary. Iteration stops at the
# first dictionary that is never closed. Markers are matched textually: no
# attempt is made to skip markers that appear inside strings or streams.
#
# @param str [String] text to scan
# @yieldparam dict [String] source text of one balanced dictionary
# @return [Enumerator, nil] an Enumerator when no block is given, otherwise nil
def each_dictionary(str)
  return enum_for(__method__, str) unless block_given?

  cursor = 0
  while (start = str.index("<<", cursor))
    depth = 0
    pos = start
    finish = nil
    loop do
      # Jump straight to the next marker instead of walking character by character.
      next_open = str.index("<<", pos)
      next_close = str.index(">>", pos)
      break unless next_close # nothing left that could close the dictionary

      if next_open && next_open < next_close
        depth += 1
        pos = next_open + 2
      else
        depth -= 1
        pos = next_close + 2
        if depth.zero?
          finish = pos
          break
        end
      end
    end
    break unless finish

    yield str[start...finish]
    cursor = finish
  end
  nil
end
60
+
61
# Resolve the backslash escapes of a PDF literal string body (the text between
# the outer parentheses).
#
# Handled escapes:
# * \n \r \t \b \f       -> the corresponding control character
# * \\ \( \) and others  -> the character following the backslash
# * \d, \dd, \ddd        -> the byte with that octal value (digits 0-7 only;
#                           only the low 8 bits are kept, so \777 does not raise)
# * backslash + EOL      -> line continuation, both characters are dropped
# A backslash that is the very last character is dropped.
#
# @param s [String] raw literal body
# @return [String] the unescaped text
def unescape_literal(s)
  simple = { "n" => "\n", "r" => "\r", "t" => "\t", "b" => "\b", "f" => "\f" }
  out = +""
  len = s.length
  i = 0
  while i < len
    ch = s[i]
    i += 1
    if ch != "\\"
      out << ch
      next
    end
    break if i >= len # dangling backslash at end of input

    esc = s[i]
    i += 1
    if simple.key?(esc)
      out << simple[esc]
    elsif esc.between?("0", "7")
      digits = esc
      # Up to three octal digits in total.
      while digits.length < 3 && i < len && s[i].between?("0", "7")
        digits += s[i]
        i += 1
      end
      out << (digits.to_i(8) & 0xFF).chr
    elsif esc == "\r"
      # Line continuation: swallow CR and an optional following LF.
      i += 1 if s[i] == "\n"
    elsif esc != "\n"
      out << esc
    end
  end
  out
end
99
+
100
# Decode a PDF string token into a UTF-8 Ruby String.
#
# * `( ... )` literal strings are unescaped with unescape_literal.
# * `< ... >` hex strings are converted to bytes (whitespace ignored, an odd
#   final digit is padded with 0). A `<< ...` dictionary token is NOT a hex
#   string and is returned unchanged.
# * If the resulting bytes start with the UTF-16BE byte order mark FE FF, the
#   remainder is decoded as UTF-16BE; otherwise the bytes are transcoded from
#   binary, so bytes above 0x7F come out as the replacement character.
# * Any other token (name, number, reference, ...) is returned stripped.
#
# Malformed input is replaced rather than raising.
#
# @param token [String, nil] raw token text
# @return [String, nil] decoded text, or nil when +token+ is nil
def decode_pdf_string(token)
  return nil unless token

  t = token.strip
  raw =
    if t.start_with?("(") && t.end_with?(")")
      unescape_literal(t[1..-2]).b
    elsif t.start_with?("<") && t.end_with?(">") && !t.start_with?("<<")
      hex = t[1..-2].gsub(/\s+/, "")
      hex << "0" if hex.length.odd?
      [hex].pack("H*")
    end
  return t unless raw

  if raw.start_with?("\xFE\xFF".b)
    raw.byteslice(2..).force_encoding("UTF-16BE").encode("UTF-8", invalid: :replace, undef: :replace)
  else
    raw.encode("UTF-8", invalid: :replace, undef: :replace)
  end
end
133
+
134
# Serialize a Ruby value as a PDF token.
#
# * true / false -> "true" / "false"
# * Symbol       -> "/<symbol>" (written verbatim, no name escaping)
# * String       -> transliterated with transliterate_to_ascii, then written as
#                   a literal string `( ... )` with backslash, parentheses, LF
#                   and CR escaped. If the transliterated text still contains
#                   non-ASCII characters it is written as a hex string holding
#                   a UTF-16BE byte order mark followed by the UTF-16BE bytes.
# * anything else -> its to_s
#
# @param val [Object] value to encode
# @return [String] PDF token text
def encode_pdf_string(val)
  case val
  when true then "true"
  when false then "false"
  when Symbol then "/#{val}"
  when String
    text = transliterate_to_ascii(val)
    if text.ascii_only?
      escaped = text.gsub(/[\\()]/) { |ch| "\\#{ch}" }
                    .gsub("\n") { "\\n" }
                    .gsub("\r") { "\\r" }
      "(#{escaped})"
    else
      utf16 = text.encode("UTF-8", invalid: :replace, undef: :replace)
                  .encode("UTF-16BE", invalid: :replace, undef: :replace)
      # Join as binary: mixing a UTF-8 literal with UTF-16BE text raises
      # Encoding::CompatibilityError.
      payload = "\xFE\xFF".b + utf16.b
      "<#{payload.unpack1('H*')}>"
    end
  else
    val.to_s
  end
end
157
+
158
# Build a PDF name token from +name+.
#
# A single leading "/" is dropped (and re-added at the end), the text is run
# through transliterate_to_ascii, and every byte that is one of
# # ( ) < > [ ] { } / %, a control byte (0x00-0x1F, 0x7F) or a byte >= 0x80 is
# written as "#" plus two uppercase hex digits. All other bytes are kept.
# Example: "(Two Hr) Priority 2" becomes "/#28Two Hr#29 Priority 2"
#
# NOTE(review): the space character (0x20) is emitted unescaped, as the example
# shows. PDF treats whitespace as a token delimiter, so confirm this is intended.
#
# @param name [#to_s] name with or without the leading slash
# @return [String] the encoded name, starting with "/"
def encode_pdf_name(name)
  bare = name.to_s.delete_prefix("/")
  pieces = transliterate_to_ascii(bare).bytes.map do |byte|
    needs_hex = byte < 0x20 || byte >= 0x7F || "#()<>[]{}/%".include?(byte.chr)
    needs_hex ? format("#%02X", byte) : byte.chr
  end
  "/#{pieces.join}"
end
186
+
187
# Turn a metadata key into a PDF dictionary key by making sure its string form
# starts with "/". The rest of the key is left untouched (no escaping).
#
# @param key [#to_s]
# @return [String]
def format_pdf_key(key)
  text = key.to_s
  return text if text.start_with?("/")

  "/#{text}"
end
192
+
193
# Render a metadata value as PDF source text.
#
# * nil            -> "null" (an empty string would leave the key without a value)
# * Integer, Float -> to_s
# * String         -> used verbatim when it already starts with "(", "<" or "/"
#                     (pre-encoded string, hex string/dictionary or name);
#                     otherwise encoded with encode_pdf_string
# * Array          -> "[item item ...]" with each item formatted recursively
# * Hash           -> "<<", one indented "/Key value" line per pair, ">>"
# * anything else  -> to_s
#
# @param value [Object]
# @return [String]
def format_pdf_value(value)
  case value
  when nil then "null"
  when Integer, Float then value.to_s
  when String
    value.start_with?("(", "<", "/") ? value : encode_pdf_string(value)
  when Array
    "[#{value.map { |item| format_pdf_value(item) }.join(' ')}]"
  when Hash
    entries = value.map { |k, v| "  #{format_pdf_key(k)} #{format_pdf_value(v)}" }
    "<<\n#{entries.join("\n")}\n>>"
  else
    value.to_s
  end
end
222
+
223
# Return the source text of the value token that follows +key+ in +dict_src+.
#
# The key must be followed by whitespace or one of ( < [ / to count as a match;
# the first such occurrence is used. Depending on the first character of the
# value the result is:
# * "("  -> the balanced literal string, honouring backslash escapes
# * "<<" -> just the two characters "<<" (the dictionary body is not returned)
# * "<"  -> the hex string up to and including the next ">"
# * "["  -> the balanced array (brackets are counted textually)
# * "/"  -> the name, up to whitespace, a delimiter or the next "/"
# * else -> the run of characters up to whitespace or a delimiter; for an
#           indirect reference such as "12 0 R" this is only "12"
#
# @param key [String] dictionary key including its leading slash
# @param dict_src [String] dictionary source text
# @return [String, nil] the token, or nil when the key or a complete value is missing
def value_token_after(key, dict_src)
  key_hit = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  return nil unless key_hit

  start = key_hit.end(0)
  start += 1 while start < dict_src.length && dict_src[start] =~ /\s/
  return nil if start >= dict_src.length

  # Shared scanner for "(...)" and "[...]": returns the balanced span beginning
  # at +start+, or nil when the closer is never reached.
  balanced = lambda do |opener, closer, skip_escapes|
    depth = 0
    pos = start
    while pos < dict_src.length
      ch = dict_src[pos]
      if skip_escapes && ch == "\\"
        pos += 2 # skip the backslash and the character it escapes
        next
      end
      if ch == opener
        depth += 1
      elsif ch == closer
        depth -= 1
        return dict_src[start..pos] if depth.zero?
      end
      pos += 1
    end
    nil
  end

  case dict_src[start]
  when "(" then balanced.call("(", ")", true)
  when "[" then balanced.call("[", "]", false)
  when "<"
    if dict_src[start, 2] == "<<"
      "<<"
    else
      closing = dict_src.index(">", start)
      closing && dict_src[start..closing]
    end
  when "/" then dict_src[start..][%r{\A/[^\s<>\[\]()/]*}]
  else dict_src[start..][%r{\A[^\s<>\[\]()/%]+}]
  end
end
296
+
297
+ def replace_key_value(dict_src, key, new_token)
298
+ # Replace existing key's value token in a single dictionary source string (<<...>>)
299
+ # Use precise position-based replacement to avoid any regex issues
300
+
301
+ # Find the key position using pattern matching
302
+ key_pattern = %r{#{Regexp.escape(key)}(?=[\s(<\[/])}
303
+ key_match = dict_src.match(key_pattern)
304
+ return upsert_key_value(dict_src, key, new_token) unless key_match
305
+
306
+ # Get the existing value token
307
+ tok = value_token_after(key, dict_src)
308
+ return upsert_key_value(dict_src, key, new_token) unless tok
309
+
310
+ # Find exact positions
311
+ key_match.begin(0)
312
+ key_end = key_match.end(0)
313
+
314
+ # Skip whitespace after key
315
+ value_start = key_end
316
+ value_start += 1 while value_start < dict_src.length && dict_src[value_start] =~ /\s/
317
+
318
+ # Verify the token matches at this position
319
+ unless value_start < dict_src.length && dict_src[value_start, tok.length] == tok
320
+ # Token doesn't match - fallback to upsert
321
+ return upsert_key_value(dict_src, key, new_token)
322
+ end
323
+
324
+ # Replace using precise string slicing - this preserves everything exactly
325
+ before = dict_src[0...value_start]
326
+ after = dict_src[(value_start + tok.length)..]
327
+ result = "#{before}#{new_token}#{after}"
328
+
329
+ # Verify the result still has valid dictionary structure
330
+ unless result.include?("<<") && result.include?(">>")
331
+ # Dictionary corrupted - return original
332
+ return dict_src
333
+ end
334
+
335
+ result
336
+ end
337
+
338
# Insert "key token" (separated by one space) directly after the first "<<" in
# +dict_src+. A string without "<<" comes back as an unchanged copy. No check is
# made for an existing entry with the same key.
#
# @param dict_src [String] dictionary source
# @param key [String] key including leading slash
# @param token [String] value token
# @return [String] new dictionary source
def upsert_key_value(dict_src, key, token)
  entry = "<<#{key} #{token}"
  # Block form keeps backslashes in the token from being read as back-references.
  dict_src.sub(/<</) { entry }
end
342
+
343
# Pick the appearance state name ("/Yes" or "/Off") that matches +new_value+.
#
# Detection is purely textual: the dictionary must mention "/AP", and the chosen
# state name must occur somewhere in +dict_src+.
# true, :Yes and "Yes" select "/Yes"; false, :Off and "Off" select "/Off".
#
# @param new_value [Object] requested field value
# @param dict_src [String] dictionary source
# @return [String, nil] the state name, or nil when nothing applies
def appearance_choice_for(new_value, dict_src)
  return nil unless dict_src.include?("/AP")

  state =
    if [true, :Yes, "Yes"].include?(new_value)
      "/Yes"
    elsif [false, :Off, "Off"].include?(new_value)
      "/Off"
    end
  state if state && dict_src.include?(state)
end
355
+
356
# Delete every "num gen R" reference matching +ref+ from an array's source text,
# then trim whitespace just inside the brackets. Whitespace left between the
# remaining elements is not collapsed.
#
# @param array_body [String] array source, e.g. "[1 0 R 2 0 R]"
# @param ref [Array(Integer, Integer)] object number and generation
# @return [String] array source without the reference
def remove_ref_from_array(array_body, ref)
  obj_num, gen_num = ref
  ref_pattern = /\b#{obj_num}\s+#{gen_num}\s+R\b/
  without_ref = array_body.gsub(ref_pattern, "")
  without_ref.gsub(/\[\s+/, "[").gsub(/\s+\]/, "]")
end
360
+
361
# Append the reference "num gen R" to an array's source text.
#
# * "[]" (ignoring surrounding whitespace) becomes "[num gen R]"
# * text ending in "]" gets " num gen R" inserted before that bracket; trailing
#   whitespace after the bracket is dropped
# * anything else simply gets " num gen R" appended
# Existing elements are not checked for duplicates.
#
# @param array_body [String] array source
# @param ref [Array(Integer, Integer)] object number and generation
# @return [String] array source including the reference
def add_ref_to_array(array_body, ref)
  obj_num, gen_num = ref
  token = "#{obj_num} #{gen_num} R"
  trimmed = array_body.strip

  if trimmed == "[]"
    "[#{token}]"
  elsif trimmed.end_with?("]")
    "#{array_body.rstrip.chomp(']')} #{token}]"
  else
    "#{array_body} #{token}"
  end
end
381
+
382
# Remove a reference from the inline array stored under +key+.
#
# The array token is obtained with value_token_after and rewritten with
# remove_ref_from_array. The rewrite is applied at the token's position after
# the key, so an identical array under a different key is left alone.
#
# @param dict_body [String] dictionary source
# @param key [String] key including leading slash
# @param ref [Array(Integer, Integer)] object number and generation
# @return [String, nil] updated source, or nil when the key is absent or its
#   value is not an inline array
def remove_ref_from_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  key_match = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  tok_start = (key_match && dict_body.index(arr_tok, key_match.end(0))) || dict_body.index(arr_tok)
  return dict_body.dup unless tok_start

  updated = dict_body.dup
  updated[tok_start, arr_tok.length] = remove_ref_from_array(arr_tok, ref)
  updated
end
391
+
392
# Append a reference to the inline array stored under +key+.
#
# The array token is obtained with value_token_after and rebuilt with
# add_ref_to_array. The new array is written at the token's position after the
# key, so an identical array under a different key is left alone.
#
# @param dict_body [String] dictionary source
# @param key [String] key including leading slash
# @param ref [Array(Integer, Integer)] object number and generation
# @return [String, nil] updated source, or nil when the key is absent or its
#   value is not an inline array
def add_ref_to_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  key_match = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  tok_start = (key_match && dict_body.index(arr_tok, key_match.end(0))) || dict_body.index(arr_tok)
  return dict_body.dup unless tok_start

  updated = dict_body.dup
  updated[tok_start, arr_tok.length] = add_ref_to_array(arr_tok, ref)
  updated
end
402
+
403
# Report whether an object body declares `/Subtype /Widget` (with optional
# whitespace between the key and the name). The check is textual.
#
# @param body [String, nil] object source text
# @return [Boolean] always true or false; false for a nil body
def is_widget?(body)
  return false unless body

  body.match?(%r{/Subtype\s*/Widget})
end
408
+
409
# Report whether an object body declares `/Type /Page`, i.e. a single page
# object and not a `/Type /Pages` tree node. The check is textual: the name must
# be exactly "Page" (optional whitespace after /Type, word boundary after Page).
#
# @param body [String, nil] object source text
# @return [Boolean] always true or false; false for a nil body
def is_page?(body)
  return false unless body

  # A plain include?("/Type /Page") would also accept "/Type /Pages".
  body.match?(%r{/Type\s*/Page(?!s)\b})
end
415
+
416
# Report whether the /Ff flags value found in +dict_body+ has the 0x1000 bit set
# (the multiline text field flag).
#
# The flags token is read with value_token_after and converted with to_i, so a
# non-numeric token counts as 0. Only a /Ff entry present in this text is seen.
#
# @param dict_body [String, nil] dictionary source
# @return [Boolean] false when the body is nil or has no /Ff entry
def is_multiline_field?(dict_body)
  flags_token = dict_body && value_token_after("/Ff", dict_body)
  return false unless flags_token

  (flags_token.to_i & 0x1000) != 0
end
427
+
428
# Parse a box array (MediaBox, CropBox, ArtBox, BleedBox, TrimBox, etc.) that is
# written inline as `/<box_type> [llx lly urx ury]`.
#
# The array may span several lines. +box_type+ is interpolated into the search
# pattern as given (without the leading slash).
#
# @param body [String, nil] object source text
# @param box_type [String, Symbol] box name without the leading slash
# @return [Hash{Symbol=>Float}, nil] {llx:, lly:, urx:, ury:}, or nil when the
#   body is nil, the box is not found, or it does not hold exactly four numbers
def parse_box(body, box_type)
  return nil unless body

  # [^\]]* instead of .*? so that a line break inside the array still matches.
  found = body.match(%r{/#{box_type}\s*\[([^\]]*)\]})
  return nil unless found

  coords = found[1].scan(/[-+]?\d*\.?\d+/).map(&:to_f)
  return nil unless coords.length == 4

  llx, lly, urx, ury = coords
  { llx: llx, lly: lly, urx: urx, ury: ury }
end
440
+
441
# Remove the /AP (appearance) entry, key and value, from a dictionary source.
#
# The value is located by position after the first `/AP` that is followed by
# whitespace or a delimiter:
# * an inline dictionary `<< ... >>` is removed up to its matching `>>`
# * any other value is sized with value_token_after; when that reports only the
#   leading number of an indirect reference "N G R", the whole reference is removed
# Whitespace following the value is removed as well; text before `/AP` is kept.
#
# The input is returned unchanged when it is nil, has no /AP entry, the value is
# missing or cannot be delimited (e.g. an unclosed inline dictionary), or the
# result would no longer contain both "<<" and ">>".
#
# @param dict_body [String, nil] dictionary source
# @return [String, nil] dictionary source without the /AP entry
def remove_appearance_stream(dict_body)
  return dict_body unless dict_body&.include?("/AP")

  ap_match = dict_body.match(%r{/AP(?=[\s(<\[/])})
  return dict_body unless ap_match

  value_start = ap_match.end(0)
  value_start += 1 while value_start < dict_body.length && dict_body[value_start].match?(/\s/)
  return dict_body if value_start >= dict_body.length

  value_end =
    if dict_body[value_start, 2] == "<<"
      # Inline dictionary: find the ">>" that balances the opening "<<".
      depth = 0
      pos = value_start
      closing = nil
      loop do
        next_open = dict_body.index("<<", pos)
        next_close = dict_body.index(">>", pos)
        break unless next_close

        if next_open && next_open < next_close
          depth += 1
          pos = next_open + 2
        else
          depth -= 1
          pos = next_close + 2
          if depth.zero?
            closing = pos
            break
          end
        end
      end
      closing
    else
      ap_tok = value_token_after("/AP", dict_body)
      if ap_tok
        finish = value_start + ap_tok.length
        if ap_tok.match?(/\A\d+\z/)
          # "N" followed by "G R" is an indirect reference; take all of it.
          ref_tail = dict_body[finish..][/\A\s+\d+\s+R(?=[\s<>\[\]()\/%]|\z)/]
          finish += ref_tail.length if ref_tail
        end
        finish
      end
    end
  return dict_body unless value_end

  # Drop the whitespace that separated the value from whatever follows.
  value_end += 1 while value_end < dict_body.length && dict_body[value_end].match?(/\s/)

  result = "#{dict_body[0...ap_match.begin(0)]}#{dict_body[value_end..]}"
  result.include?("<<") && result.include?(">>") ? result : dict_body
end
522
+ end
523
+ end