RubyGems - corp_pdf - Versions diffs - 1.0.5 - Mend

corp_pdf 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +7 -0
data/.gitignore +13 -0
data/.rubocop.yml +78 -0
data/CHANGELOG.md +122 -0
data/Gemfile +5 -0
data/Gemfile.lock +90 -0
data/README.md +518 -0
data/Rakefile +18 -0
data/corp_pdf.gemspec +35 -0
data/docs/README.md +111 -0
data/docs/clear_fields.md +202 -0
data/docs/dict_scan_explained.md +341 -0
data/docs/object_streams.md +311 -0
data/docs/pdf_structure.md +251 -0
data/issues/README.md +59 -0
data/issues/memory-benchmark-results.md +551 -0
data/issues/memory-improvements.md +388 -0
data/issues/memory-optimization-summary.md +204 -0
data/issues/refactoring-opportunities.md +259 -0
data/lib/corp_pdf/actions/add_field.rb +73 -0
data/lib/corp_pdf/actions/base.rb +48 -0
data/lib/corp_pdf/actions/remove_field.rb +154 -0
data/lib/corp_pdf/actions/update_field.rb +663 -0
data/lib/corp_pdf/dict_scan.rb +523 -0
data/lib/corp_pdf/document.rb +782 -0
data/lib/corp_pdf/field.rb +145 -0
data/lib/corp_pdf/fields/base.rb +384 -0
data/lib/corp_pdf/fields/checkbox.rb +164 -0
data/lib/corp_pdf/fields/radio.rb +220 -0
data/lib/corp_pdf/fields/signature.rb +393 -0
data/lib/corp_pdf/fields/text.rb +31 -0
data/lib/corp_pdf/incremental_writer.rb +245 -0
data/lib/corp_pdf/object_resolver.rb +381 -0
data/lib/corp_pdf/objstm.rb +75 -0
data/lib/corp_pdf/page.rb +90 -0
data/lib/corp_pdf/pdf_writer.rb +133 -0
data/lib/corp_pdf/version.rb +5 -0
data/lib/corp_pdf.rb +35 -0
data/publish +183 -0
metadata +169 -0

data/lib/corp_pdf/dict_scan.rb ADDED Viewed

@@ -0,0 +1,523 @@
+# frozen_string_literal: true
+module CorpPdf
+  module DictScan
+    module_function
+    # Configure I18n for transliteration (disable locale enforcement)
+    I18n.config.enforce_available_locales = false
+    # Transliterate a string to ASCII, converting special characters to their ASCII equivalents
+    # Example: "María Valentina" -> "Maria Valentina"
+    def transliterate_to_ascii(str)
+      return str unless str.is_a?(String)
+      # Ensure the string is in UTF-8 encoding
+      utf8_str = str.encode("UTF-8", invalid: :replace, undef: :replace)
+      # Use I18n transliteration to convert to ASCII
+      begin
+        I18n.transliterate(utf8_str, locale: :en, replacement: "")
+      rescue StandardError
+        # Fallback: if transliteration fails, try to encode to ASCII with replacements
+        utf8_str.encode("ASCII", invalid: :replace, undef: :replace)
+      end
+    end
+    # --- low-level string helpers -------------------------------------------------
+    def strip_stream_bodies(pdf)
+      pdf.gsub(/stream\r?\n.*?endstream/mi) { "stream\nENDSTREAM_STRIPPED\nendstream" }
+    end
+    def each_dictionary(str)
+      i = 0
+      while (open = str.index("<<", i))
+        depth = 0
+        j = open
+        found = nil
+        while j < str.length
+          if str[j, 2] == "<<"
+            depth += 1
+            j += 2
+          elsif str[j, 2] == ">>"
+            depth -= 1
+            j += 2
+            if depth.zero?
+              found = str[open...j]
+              break
+            end
+          else
+            j += 1
+          end
+        end
+        break unless found
+        yield found
+        i = j
+      end
+    end
+    def unescape_literal(s)
+      out = +""
+      i = 0
+      while i < s.length
+        ch = s[i]
+        if ch == "\\"
+          i += 1
+          break if i >= s.length
+          esc = s[i]
+          case esc
+          when "n" then out << "\n"
+          when "r" then out << "\r"
+          when "t" then out << "\t"
+          when "b" then out << "\b"
+          when "f" then out << "\f"
+          when "\\", "(", ")" then out << esc
+          when /\d/
+            oct = esc
+            if i + 1 < s.length && s[i + 1] =~ /\d/
+              i += 1
+              oct << s[i]
+              if i + 1 < s.length && s[i + 1] =~ /\d/
+                i += 1
+                oct << s[i]
+              end
+            end
+            out << oct.to_i(8).chr
+          else
+            out << esc
+          end
+        else
+          out << ch
+        end
+        i += 1
+      end
+      out
+    end
+    def decode_pdf_string(token)
+      return nil unless token
+      t = token.strip
+      # Literal string: ( ... ) with PDF escapes and optional UTF-16BE BOM
+      if t.start_with?("(") && t.end_with?(")")
+        inner = t[1..-2]
+        s = unescape_literal(inner)
+        if s.bytesize >= 2 && s.getbyte(0) == 0xFE && s.getbyte(1) == 0xFF
+          return s.byteslice(2, s.bytesize - 2).force_encoding("UTF-16BE").encode("UTF-8")
+        else
+          return s.b
+                  .force_encoding("binary")
+                  .encode("UTF-8", invalid: :replace, undef: :replace)
+        end
+      end
+      # Hex string: < ... > with optional UTF-16BE BOM
+      if t.start_with?("<") && t.end_with?(">")
+        hex = t[1..-2].gsub(/\s+/, "")
+        hex << "0" if hex.length.odd?
+        bytes = [hex].pack("H*")
+        if bytes.bytesize >= 2 && bytes.getbyte(0) == 0xFE && bytes.getbyte(1) == 0xFF
+          return bytes.byteslice(2, bytes.bytesize - 2).force_encoding("UTF-16BE").encode("UTF-8")
+        else
+          return bytes.force_encoding("binary").encode("UTF-8", invalid: :replace, undef: :replace)
+        end
+      end
+      # Fallback: return token as-is (names, numbers, refs, etc.)
+      t
+    end
+    def encode_pdf_string(val)
+      case val
+      when true then "true"
+      when false then "false"
+      when Symbol
+        "/#{val}"
+      when String
+        # Transliterate special characters to ASCII to avoid encoding issues
+        ascii_val = transliterate_to_ascii(val)
+        if ascii_val.ascii_only?
+          "(#{ascii_val.gsub(/([\\()])/, '\\\\\\1').gsub("\n", '\\n')})"
+        else
+          # Ensure string is in UTF-8 before encoding to UTF-16BE
+          utf8_str = ascii_val.encode("UTF-8", invalid: :replace, undef: :replace)
+          utf16 = utf8_str.encode("UTF-16BE")
+          bytes = "\xFE\xFF#{utf16}"
+          "<#{bytes.unpack1('H*')}>"
+        end
+      else
+        val.to_s
+      end
+    end
+    # Encode a string as a PDF name, escaping special characters with hex encoding
+    # PDF names must escape: # ( ) < > [ ] { } / % and control characters
+    # Example: "(Two Hr) Priority 2" becomes "/#28Two Hr#29 Priority 2"
+    def encode_pdf_name(name)
+      name_str = name.to_s
+      # Remove leading / if present (we'll add it back)
+      name_str = name_str[1..] if name_str.start_with?("/")
+      # Transliterate special characters to ASCII to avoid encoding issues
+      ascii_name = transliterate_to_ascii(name_str)
+      # Encode special characters as hex
+      encoded = ascii_name.each_byte.map do |byte|
+        char = byte.chr
+        # PDF name special characters that need hex encoding: # ( ) < > [ ] { } / %
+        # Also encode control characters (0x00-0x1F, 0x7F) and non-ASCII (0x80-0xFF)
+        if ["#", "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"].include?(char) ||
+           byte.between?(0x00, 0x1F) || byte == 0x7F || byte.between?(0x80, 0xFF)
+          # Hex encode: # followed by 2-digit hex
+          "##{byte.to_s(16).upcase.rjust(2, '0')}"
+        else
+          # Regular printable ASCII: use as-is
+          char
+        end
+      end.join
+      "/#{encoded}"
+    end
+    # Format a metadata key as a PDF dictionary key (ensure it starts with /)
+    def format_pdf_key(key)
+      key_str = key.to_s
+      key_str.start_with?("/") ? key_str : "/#{key_str}"
+    end
+    # Format a metadata value appropriately for PDF
+    def format_pdf_value(value)
+      case value
+      when Integer, Float
+        value.to_s
+      when String
+        # If it looks like a PDF string (starts with parenthesis or angle bracket), use as-is
+        if value.start_with?("(") || value.start_with?("<") || value.start_with?("/")
+          value
+        else
+          # Otherwise encode as a PDF string
+          encode_pdf_string(value)
+        end
+      when Array
+        # Array format: [item1 item2 item3]
+        items = value.map { |v| format_pdf_value(v) }.join(" ")
+        "[#{items}]"
+      when Hash
+        # Dictionary format: << /Key1 value1 /Key2 value2 >>
+        dict = value.map do |k, v|
+          pdf_key = format_pdf_key(k)
+          pdf_val = format_pdf_value(v)
+          "  #{pdf_key} #{pdf_val}"
+        end.join("\n")
+        "<<\n#{dict}\n>>"
+      else
+        value.to_s
+      end
+    end
+    def value_token_after(key, dict_src)
+      # Find key followed by delimiter (whitespace, (, <, [, /)
+      # Use regex to ensure key is a complete token
+      match = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
+      return nil unless match
+      i = match.end(0)
+      i += 1 while i < dict_src.length && dict_src[i] =~ /\s/
+      return nil if i >= dict_src.length
+      case dict_src[i]
+      when "("
+        depth = 0
+        j = i
+        while j < dict_src.length
+          ch = dict_src[j]
+          if ch == "\\"
+            j += 2
+            next
+          end
+          depth += 1 if ch == "("
+          if ch == ")"
+            depth -= 1
+            if depth.zero?
+              j += 1
+              return dict_src[i...j]
+            end
+          end
+          j += 1
+        end
+        nil
+      when "<"
+        if dict_src[i, 2] == "<<"
+          "<<"
+        else
+          j = dict_src.index(">", i)
+          j ? dict_src[i..j] : nil
+        end
+      when "["
+        # Array token - find matching closing bracket
+        depth = 0
+        j = i
+        while j < dict_src.length
+          ch = dict_src[j]
+          if ch == "["
+            depth += 1
+          elsif ch == "]"
+            depth -= 1
+            if depth.zero?
+              j += 1
+              return dict_src[i...j]
+            end
+          end
+          j += 1
+        end
+        nil
+      when "/"
+        # PDF name token - extract until whitespace or delimiter
+        j = i
+        while j < dict_src.length
+          ch = dict_src[j]
+          # PDF names can contain most characters except NUL, whitespace, and delimiters
+          break if ch =~ /[\s<>\[\]()]/ || (ch == "/" && j > i)
+          j += 1
+        end
+        j > i ? dict_src[i...j] : nil
+      else
+        # atom
+        m = %r{\A([^\s<>\[\]()/%]+)}.match(dict_src[i..])
+        m ? m[1] : nil
+      end
+    end
+    def replace_key_value(dict_src, key, new_token)
+      # Replace existing key's value token in a single dictionary source string (<<...>>)
+      # Use precise position-based replacement to avoid any regex issues
+      # Find the key position using pattern matching
+      key_pattern = %r{#{Regexp.escape(key)}(?=[\s(<\[/])}
+      key_match = dict_src.match(key_pattern)
+      return upsert_key_value(dict_src, key, new_token) unless key_match
+      # Get the existing value token
+      tok = value_token_after(key, dict_src)
+      return upsert_key_value(dict_src, key, new_token) unless tok
+      # Find exact positions
+      key_match.begin(0)
+      key_end = key_match.end(0)
+      # Skip whitespace after key
+      value_start = key_end
+      value_start += 1 while value_start < dict_src.length && dict_src[value_start] =~ /\s/
+      # Verify the token matches at this position
+      unless value_start < dict_src.length && dict_src[value_start, tok.length] == tok
+        # Token doesn't match - fallback to upsert
+        return upsert_key_value(dict_src, key, new_token)
+      end
+      # Replace using precise string slicing - this preserves everything exactly
+      before = dict_src[0...value_start]
+      after = dict_src[(value_start + tok.length)..]
+      result = "#{before}#{new_token}#{after}"
+      # Verify the result still has valid dictionary structure
+      unless result.include?("<<") && result.include?(">>")
+        # Dictionary corrupted - return original
+        return dict_src
+      end
+      result
+    end
+    def upsert_key_value(dict_src, key, token)
+      # Insert right after '<<' with a space between key and value
+      dict_src.sub("<<") { |_| "<<#{key} #{token}" }
+    end
+    def appearance_choice_for(new_value, dict_src)
+      # If /AP << /N << /Yes ... /Off ... >> >> exists, return /Yes or /Off
+      return nil unless dict_src.include?("/AP")
+      # Simplistic detection
+      yes = dict_src.include?("/Yes")
+      off = dict_src.include?("/Off")
+      case new_value
+      when true, :Yes, "Yes" then yes ? "/Yes" : nil
+      when false, :Off, "Off" then off ? "/Off" : nil
+      end
+    end
+    def remove_ref_from_array(array_body, ref)
+      num, gen = ref
+      array_body.gsub(/\b#{num}\s+#{gen}\s+R\b/, "").gsub(/\[\s+/, "[").gsub(/\s+\]/, "]")
+    end
+    def add_ref_to_array(array_body, ref)
+      num, gen = ref
+      ref_token = "#{num} #{gen} R"
+      # Handle empty array
+      if array_body.strip == "[]"
+        return "[#{ref_token}]"
+      end
+      # Add before the closing bracket, with proper spacing
+      # Find the last ']' and insert before it
+      if array_body.strip.end_with?("]")
+        # Remove trailing ] and add ref, then add ] back
+        without_closing = array_body.rstrip.chomp("]")
+        return "#{without_closing} #{ref_token}]"
+      end
+      # Fallback: just append
+      "#{array_body} #{ref_token}"
+    end
+    def remove_ref_from_inline_array(dict_body, key, ref)
+      return nil unless dict_body.include?(key)
+      # Extract the inline array token after key, then rebuild
+      arr_tok = value_token_after(key, dict_body)
+      return nil unless arr_tok && arr_tok.start_with?("[")
+      dict_body.sub(arr_tok) { |t| remove_ref_from_array(t, ref) }
+    end
+    def add_ref_to_inline_array(dict_body, key, ref)
+      return nil unless dict_body.include?(key)
+      # Extract the inline array token after key, then rebuild
+      arr_tok = value_token_after(key, dict_body)
+      return nil unless arr_tok && arr_tok.start_with?("[")
+      new_arr_tok = add_ref_to_array(arr_tok, ref)
+      dict_body.sub(arr_tok) { |_| new_arr_tok }
+    end
+    def is_widget?(body)
+      return false unless body
+      body.include?("/Subtype") && body.include?("/Widget") && body =~ %r{/Subtype\s*/Widget}
+    end
+    # Check if a body represents a page object (not /Type/Pages)
+    def is_page?(body)
+      return false unless body
+      body.include?("/Type /Page") || body =~ %r{/Type\s*/Page(?!s)\b}
+    end
+    # Check if a field is multiline by checking /Ff flag bit 12 (0x1000)
+    def is_multiline_field?(dict_body)
+      return false unless dict_body
+      ff_tok = value_token_after("/Ff", dict_body)
+      return false unless ff_tok
+      ff_value = ff_tok.to_i
+      # Bit 12 (0x1000) indicates multiline text field
+      ff_value.anybits?(0x1000)
+    end
+    # Parse a box array (MediaBox, CropBox, ArtBox, BleedBox, TrimBox, etc.)
+    # Returns a hash with keys :llx, :lly, :urx, :ury, or nil if not found/invalid
+    def parse_box(body, box_type)
+      pattern = %r{/#{box_type}\s*\[(.*?)\]}
+      return nil unless body =~ pattern
+      box_values = ::Regexp.last_match(1).scan(/[-+]?\d*\.?\d+/).map(&:to_f)
+      return nil unless box_values.length == 4
+      llx, lly, urx, ury = box_values
+      { llx: llx, lly: lly, urx: urx, ury: ury }
+    end
+    # Remove /AP (appearance stream) entry from a dictionary
+    def remove_appearance_stream(dict_body)
+      return dict_body unless dict_body&.include?("/AP")
+      # Find /AP entry using pattern matching
+      ap_key_pattern = %r{/AP(?=[\s(<\[/])}
+      ap_match = dict_body.match(ap_key_pattern)
+      return dict_body unless ap_match
+      key_end = ap_match.end(0)
+      value_start = key_end
+      value_start += 1 while value_start < dict_body.length && dict_body[value_start] =~ /\s/
+      return dict_body if value_start >= dict_body.length
+      # Determine what type of value we have
+      first_char = dict_body[value_start]
+      value_end = value_start
+      if first_char == "<" && value_start + 1 < dict_body.length && dict_body[value_start + 1] == "<"
+        # Inline dictionary: /AP << ... >>
+        # Need to find matching closing >>
+        depth = 0
+        i = value_start
+        while i < dict_body.length
+          if dict_body[i, 2] == "<<"
+            depth += 1
+            i += 2
+          elsif dict_body[i, 2] == ">>"
+            depth -= 1
+            i += 2
+            if depth.zero?
+              value_end = i
+              break
+            end
+          else
+            i += 1
+          end
+        end
+      elsif ["(", "<", "["].include?(first_char)
+        # Use value_token_after to get the complete token
+        ap_tok = value_token_after("/AP", dict_body)
+        return dict_body unless ap_tok
+        value_end = value_start + ap_tok.length
+      else
+        # Reference or other simple token
+        ap_tok = value_token_after("/AP", dict_body)
+        return dict_body unless ap_tok
+        value_end = value_start + ap_tok.length
+      end
+      # Skip trailing whitespace after the value
+      value_end += 1 while value_end < dict_body.length && dict_body[value_end] =~ /\s/
+      # Find the start of /AP (may need to remove preceding space/newline)
+      removal_start = ap_match.begin(0)
+      # Try to remove preceding whitespace/newline if it's on its own line
+      if removal_start.positive? && dict_body[removal_start - 1] == "\n"
+        # Check if there's whitespace before the newline we should remove too
+        line_start = removal_start - 1
+        line_start -= 1 while line_start.positive? && dict_body[line_start - 1] =~ /\s/
+        # Only remove the line if it starts with whitespace (indentation)
+        if line_start.positive? && dict_body[line_start - 1] == "\n"
+          removal_start = line_start
+        end
+      end
+      # Build result without /AP entry
+      before = dict_body[0...removal_start]
+      after = dict_body[value_end..]
+      result = "#{before}#{after}"
+      # Verify the result still has valid dictionary structure
+      unless result.include?("<<") && result.include?(">>")
+        return dict_body # Return original if corrupted
+      end
+      result
+    end
+  end
+end