corp_pdf 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,245 @@
1
+ # frozen_string_literal: true
2
+
3
module CorpPdf
  # Appends an incremental update containing the given patches.
  # Each patch is {ref:[num,gen], body:String}
  #
  # The update is written as a compressed object stream (/ObjStm) indexed by a
  # cross-reference stream when ObjStm.create succeeds, otherwise as plain
  # indirect objects with a classic xref table.
  class IncrementalWriter
    # original_bytes: full bytes of the existing PDF (should be binary-encoded).
    # patches: array of {ref: [num, gen], body: String}.
    def initialize(original_bytes, patches)
      @orig = original_bytes
      @patches = patches
    end

    # Returns the original bytes with the incremental update appended.
    # Returns @orig unchanged when there is nothing to patch.
    def render
      return @orig if @patches.empty?

      startxref_prev = find_startxref(@orig) or raise "startxref not found"
      max_obj = scan_max_obj_number(@orig)

      # Ensure we end with a newline before appending.
      # Avoid dup by concatenating instead of modifying in place.
      base = @orig.end_with?("\n") ? @orig + "".b : @orig + "\n".b

      # Write patches into an object stream for efficiency; fall back to
      # individual objects if ObjStm.create declines (e.g. no patches).
      objstm_data = CorpPdf::ObjStm.create(@patches, compress: true)
      if objstm_data
        render_with_objstm(base, startxref_prev, max_obj, objstm_data)
      else
        render_with_classic_xref(base, startxref_prev, max_obj)
      end
    end

    private

    # Modern path: one /ObjStm holding all patch bodies, a cross-reference
    # stream (type 1/2 entries) indexing it, and a classic trailer carrying
    # /XRefStm so table-only readers still find the entries.
    def render_with_objstm(base, startxref_prev, max_obj, objstm_data)
      # Binary buffer: deflated stream bytes are appended below, and mixing
      # them into a UTF-8 string would raise Encoding::CompatibilityError.
      buf = +"".b

      # Next free object number for the object stream itself.
      objstm_num = [max_obj + 1, @patches.map { |p| p[:ref][0] }.max.to_i + 1].max

      # Write the object stream object.
      objstm_offset = base.bytesize + buf.bytesize
      buf << "#{objstm_num} 0 obj\n".b
      buf << objstm_data[:dictionary]
      buf << "\nstream\n".b
      buf << objstm_data[:stream_body]
      buf << "\nendstream\n".b
      buf << "endobj\n".b

      sorted_patches = objstm_data[:patches]
      xrefstm_num = objstm_num + 1

      # Collect xref entries as [obj_num, gen, type, f1, f2]:
      #   type 1: f1 is the file offset (f2 unused here)
      #   type 2: f1 is the containing objstm number, f2 the index within it
      entries = [[objstm_num, 0, 1, objstm_offset, 0]]
      sorted_patches.each_with_index do |patch, index|
        num, gen = patch[:ref]
        next if num == objstm_num # Skip the object stream itself

        entries << [num, gen, 2, objstm_num, index]
      end
      entries.sort_by! { |num, gen, _type, _f1, _f2| [num, gen] }

      # Use a single contiguous /Index range [first_obj, count] for simplicity;
      # gaps inside the range become type-0 (free) records.
      obj_nums = entries.map { |num, _gen, _type, _f1, _f2| num }
      first_obj = obj_nums.min
      last_obj = obj_nums.max
      index_count = last_obj - first_obj + 1

      # /W [1 4 2]: type (1 byte), offset/f1 (4 bytes, big-endian),
      # index/f2 (2 bytes, big-endian).
      w = [1, 4, 2]
      entry_map = {}
      entries.each { |num, _gen, type, f1, f2| entry_map[num] = [type, f1, f2] }

      records = Array.new(index_count) do |k|
        type, f1, f2 = entry_map[first_obj + k] || [0, 0, 0] # type 0 = free
        [type, f1, f2].pack("C N n")
      end
      xref_compressed = Zlib::Deflate.deflate(records.join)

      # /Size is one past the highest object number; the xref stream object
      # itself is the highest we emit.
      size = [last_obj + 1, objstm_num + 1, xrefstm_num + 1].max

      # Write the xref stream object.
      xrefstm_offset = base.bytesize + buf.bytesize
      root_ref = extract_root_from_trailer(@orig)
      xrefstm_dict = "<<\n/Type /XRef\n/W [#{w.join(' ')}]\n/Size #{size}\n/Index [#{first_obj} #{index_count}]\n/Prev #{startxref_prev}\n".b
      xrefstm_dict << " /Root #{root_ref}".b if root_ref
      xrefstm_dict << "\n/Filter /FlateDecode\n/Length #{xref_compressed.bytesize}\n>>\n".b

      buf << "#{xrefstm_num} 0 obj\n".b
      buf << xrefstm_dict
      buf << "stream\n".b
      buf << xref_compressed
      buf << "\nendstream\n".b
      buf << "endobj\n".b

      # Classic trailer pointing at the xref stream via /XRefStm so the
      # xref stream itself stays reachable for legacy readers.
      # Reuse `size` here: the previous code recomputed it without the +1 for
      # the xref stream object, yielding an off-by-one /Size.
      trailer = "trailer\n<< /Size #{size} /Prev #{startxref_prev}".b
      trailer << " /Root #{root_ref}".b if root_ref
      trailer << " /XRefStm #{xrefstm_offset} >>\n".b
      trailer << "startxref\n#{xrefstm_offset}\n%%EOF\n".b

      base + buf + trailer
    end

    # Fallback path: each patch becomes a regular indirect object, indexed by
    # a classic xref table and trailer.
    def render_with_classic_xref(base, startxref_prev, max_obj)
      buf = +"".b
      offsets = []

      @patches.each do |p|
        num, gen = p[:ref]
        offsets << [num, gen, base.bytesize + buf.bytesize]

        buf << "#{num} #{gen} obj\n".b
        buf << p[:body].b
        buf << "\nendobj\n".b
      end

      # Build the xref table, grouping consecutive object numbers (with equal
      # generation) into one subsection header "first count".
      sorted = offsets.sort_by { |n, g, _| [n, g] }
      xref = +"xref\n".b

      i = 0
      while i < sorted.length
        first_num = sorted[i][0]
        run = 1
        run += 1 while (i + run) < sorted.length &&
                       sorted[i + run][0] == first_num + run &&
                       sorted[i + run][1] == sorted[i][1]
        xref << "#{first_num} #{run}\n".b
        run.times do |r|
          # 20-byte records: 10-digit offset, 5-digit generation, "n", EOL.
          xref << format("%010d %05d n \n", sorted[i + r][2], sorted[i + r][1]).b
        end
        i += run
      end

      new_size = [max_obj + 1, @patches.map { |p| p[:ref][0] }.max.to_i + 1].max
      xref_offset = base.bytesize + buf.bytesize

      root_ref = extract_root_from_trailer(@orig)
      root_entry = root_ref ? " /Root #{root_ref}" : ""
      trailer = "trailer\n<< /Size #{new_size} /Prev #{startxref_prev}#{root_entry} >>\nstartxref\n#{xref_offset}\n%%EOF\n".b

      base + buf + xref + trailer
    end

    # Returns the integer offset named by the final "startxref" keyword,
    # or nil when none can be located.
    def find_startxref(bytes)
      if bytes =~ /startxref\s+(\d+)\s*%%EOF\s*\z/m
        return Integer(::Regexp.last_match(1))
      end

      # Tolerate trailing junk after %%EOF: take the last "startxref" anywhere.
      m = bytes.rindex("startxref")
      return nil unless m

      tail = bytes[m, bytes.length - m]
      tail[/startxref\s+(\d+)/m, 1]&.to_i
    end

    # Highest object number appearing in any "N G obj" header, 0 if none.
    def scan_max_obj_number(bytes)
      max = 0
      bytes.scan(/(^|\s)(\d+)\s+(\d+)\s+obj\b/) { max = [::Regexp.last_match(2).to_i, max].max }
      max
    end

    # Returns the "/Root N G R" token from the last cross-reference section
    # (xref stream dictionary or classic trailer), or nil if not found.
    def extract_root_from_trailer(bytes)
      # For xref streams, find the last xref stream object dictionary
      startxref_match = bytes.match(/startxref\s+(\d+)\s*%%EOF\s*\z/m)
      if startxref_match
        xref_offset = startxref_match[1].to_i

        # Check if it's an xref stream (starts with object header)
        if bytes[xref_offset, 50] =~ /(\d+\s+\d+\s+obj)/
          # Find the dictionary in the xref stream object
          dict_start = bytes.index("<<", xref_offset)
          if dict_start
            trailer_section = bytes[dict_start, 500]
            if trailer_section =~ %r{/Root\s+(\d+\s+\d+\s+R)}
              return ::Regexp.last_match(1)
            end
          end
        end
      end

      # Fallback: look for classic trailer
      trailer_idx = bytes.rindex("trailer")
      if trailer_idx
        dict_start = bytes.index("<<", trailer_idx)
        if dict_start
          trailer_section = bytes[dict_start, 500]
          if trailer_section =~ %r{/Root\s+(\d+\s+\d+\s+R)}
            return ::Regexp.last_match(1)
          end
        end
      end

      nil
    end
  end
end
@@ -0,0 +1,381 @@
1
+ # frozen_string_literal: true
2
+
3
module CorpPdf
  # Parses xref (tables and streams) and exposes object bodies uniformly,
  # including objects embedded in /ObjStm. Also gives you the trailer and /Root.
  #
  # NOTE(review): relies on a DictScan helper defined elsewhere;
  # DictScan.value_token_after presumably returns the raw token following a
  # dictionary key ("/Prev", "/W", ...) — confirm against its definition.
  class ObjectResolver
    # One cross-reference entry.
    #   type         - :in_file (direct object) or :in_objstm (inside an object stream)
    #   offset       - byte offset of the object header (for :in_file)
    #   objstm_num   - containing object stream number (for :in_objstm)
    #   objstm_index - index of the object within that stream (for :in_objstm)
    Entry = Struct.new(:type, :offset, :objstm_num, :objstm_index, keyword_init: true)

    # bytes: the full PDF file contents (assumes a binary String — TODO confirm).
    def initialize(bytes)
      @bytes = bytes
      @entries = {}       # {[num, gen] => Entry}
      @objstm_cache = {}  # {[num, 0] => parsed object-stream entry list}
      parse_cross_reference
    end

    # Returns the document catalog reference from the trailer as [num, gen],
    # or nil when no /Root key is present.
    def root_ref
      tr = trailer_dict
      return nil unless tr =~ %r{/Root\s+(\d+)\s+(\d+)\s+R}

      [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
    end

    # Returns the trailer dictionary source text (memoized).
    def trailer_dict
      # Priority order:
      # 1. Explicit trailer from classic xref (incremental updates)
      # 2. Xref stream dictionary (original PDFs)
      # 3. Search for trailer (fallback)
      @trailer_dict ||= if @trailer_explicit
                          @trailer_explicit
                        elsif @last_xref_stream_dict
                          @last_xref_stream_dict
                        else
                          # Find last 'trailer << ... >>' before last startxref
                          start = find_startxref(@bytes) || 0
                          head = @bytes[0...start]
                          idx = head.rindex("trailer")
                          raise "trailer not found" unless idx

                          # naive grab following dict
                          m = head.index("<<", idx)
                          n = balanced_from(head, m)
                          head[m...n]
                        end
    end

    # Yields every known [num, gen] reference together with its body string.
    def each_object
      @entries.each_key do |ref|
        yield(ref, object_body(ref))
      end
    end

    # Clear the object stream cache to free memory
    def clear_cache
      @objstm_cache.clear
    end

    # Returns the raw body for ref ([num, gen]): the bytes between "obj" and
    # "endobj" for direct objects, or the slice carved out of the containing
    # /ObjStm for compressed objects. Returns nil for unknown refs.
    def object_body(ref)
      case (e = @entries[ref])&.type
      when :in_file
        i = e.offset
        # Find "obj" start near offset (handle any preceding whitespace)
        hdr = /\bobj\b/m.match(@bytes, i) or return nil
        after = hdr.end(0)
        # Skip optional whitespace and one line break if present
        after += 1 while (ch = @bytes.getbyte(after)) && ch <= 0x20
        j = @bytes.index(/\bendobj\b/m, after) or return nil
        @bytes[after...j]
      when :in_objstm
        load_objstm([e.objstm_num, 0])
        @objstm_cache[[e.objstm_num, 0]][e.objstm_index][:body]
      end
    end

    # --- internals -----------------------------------------------------------

    # Entry point: locate the last startxref and walk the xref chain from there.
    def parse_cross_reference
      start = find_startxref(@bytes) or raise "startxref not found"
      parse_xref_at_offset(start)
    end

    # Dispatches on what lives at `offset` — a classic "xref" table or an xref
    # stream object — and recursively follows /Prev (and /XRefStm) links so the
    # whole incremental-update chain is merged into @entries.
    def parse_xref_at_offset(offset)
      # 1) If 'xref' is literally at that offset => classic table
      if @bytes[offset, 4] == "xref"
        tr = parse_classic_xref(offset)

        # 2) Classic trailers may include /XRefStm <offset> to an xref stream with compressed entries
        xrefstm_tok = DictScan.value_token_after("/XRefStm", tr) if tr
        if xrefstm_tok && (ofs = xrefstm_tok.to_i).positive?
          parse_xref_stream_at(ofs) # merge entries from xref stream (type 0/1/2)
        end

        # 3) Follow /Prev pointer if present
        prev_tok = DictScan.value_token_after("/Prev", tr) if tr
        if prev_tok && (prev_ofs = prev_tok.to_i).positive?
          parse_xref_at_offset(prev_ofs)
        end
      else
        # Direct xref stream case (offset points to the xref stream obj header)
        dict_src = parse_xref_stream_at(offset)

        # Follow /Prev in the xref stream's dictionary
        if dict_src
          prev_tok = DictScan.value_token_after("/Prev", dict_src)
          if prev_tok && (prev_ofs = prev_tok.to_i).positive?
            parse_xref_at_offset(prev_ofs)
          end
        end
      end
    end

    # Parses a classic xref table starting at `start`, recording every "n"
    # (in-use) record. Returns the trailer dictionary source, or nil when the
    # section has no trailer (possible mid-chain).
    def parse_classic_xref(start)
      pos = @bytes.rindex("xref", start) or raise "xref not found"
      i = pos + 4

      loop do
        # Subsection header: "first count"
        m = /\s*(\d+)\s+(\d+)/m.match(@bytes, i) or break
        first = m[1].to_i
        count = m[2].to_i
        i = m.end(0)

        count.times do |k|
          # Skip whitespace/newlines before reading the 20-byte record
          i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D, 0x20].include?(ch)

          rec = @bytes[i, 20]
          raise "bad xref record" unless rec && rec.bytesize == 20

          off = rec[0, 10].to_i
          gen = rec[11, 5].to_i
          typ = rec[17, 1]
          i += 20
          # consume line ending(s)
          i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D].include?(ch)

          ref = [first + k, gen]
          # ||= keeps the first (newest) definition: newer updates are parsed
          # before their /Prev predecessors.
          @entries[ref] ||= Entry.new(type: :in_file, offset: off) if typ == "n"
          # (ignore 'f' free entries)
        end

        break if @bytes[i, 7] == "trailer"
      end

      tpos = @bytes.index("trailer", i)
      if tpos
        dpos = @bytes.index("<<", tpos)
        if dpos
          dend = balanced_from(@bytes, dpos)
          # An explicit classic trailer outranks any cached xref-stream dict
          # (see trailer_dict priority), so drop the cached one.
          @last_xref_stream_dict = nil
          @trailer_explicit = @bytes[dpos...dend]
          return @trailer_explicit
        end
      end

      # No trailer found (might be at an intermediate xref in the chain)
      nil
    end

    # Parses the xref stream object whose header sits at (or just after)
    # `header_ofs`, merging its records into @entries. Returns the stream's
    # dictionary source for /Prev chasing, or nil when /W is missing.
    def parse_xref_stream_at(header_ofs)
      # Expect "<num> <gen> obj" at header_ofs
      m = /\A(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 50])
      unless m
        # Sometimes header_ofs might land on whitespace; search forward a bit
        win = @bytes[header_ofs, 256]
        m2 = /(\d+)\s+(\d+)\s+obj\b/m.match(win) or raise "xref stream header not found"
        header_ofs += m2.begin(0)
        m = m2
      end
      obj_ref = [m[1].to_i, m[2].to_i]

      dpos = @bytes.index("<<", header_ofs + m[0].length) or raise "xref stream dict missing"
      dend = balanced_from(@bytes, dpos)
      dict_src = @bytes[dpos...dend]
      @last_xref_stream_dict ||= dict_src # Keep first one for trailer_dict

      spos = @bytes.index(/\bstream\r?\n/m, dend) or raise "xref stream body missing"
      epos = @bytes.index(/\bendstream\b/m, spos) or raise "xref stream end missing"
      data = @bytes[spos..epos]
      raw = decode_stream_data(dict_src, data)

      # W is mandatory in xref streams; if missing, bail (don't crash)
      w_tok = DictScan.value_token_after("/W", dict_src)
      return nil unless w_tok

      w = JSON_like_array(w_tok)
      idx_tok = DictScan.value_token_after("/Index", dict_src)
      # Default /Index is [0 Size] per the PDF spec.
      index = idx_tok ? JSON_like_array(idx_tok) : [0, DictScan.value_token_after("/Size", dict_src).to_i]

      parse_xref_stream_records(raw, w, index)

      # Ensure the xref stream object itself is registered (type 1 entry usually exists,
      # but if not, add it so object_body can find the stream if needed)
      unless @entries.key?(obj_ref)
        # Approximate offset at header_ofs
        @entries[obj_ref] = Entry.new(type: :in_file, offset: header_ofs)
      end

      dict_src # Return dict for /Prev checking
    end

    # Decodes the fixed-width binary records of an xref stream according to the
    # /W field widths and the /Index (start, count) pairs, merging into @entries.
    def parse_xref_stream_records(raw, w, index)
      w0, w1, w2 = w
      s = StringScanner.new(raw)
      (0...(index.length / 2)).each do |i|
        obj = index[2 * i].to_i
        count = index[(2 * i) + 1].to_i
        count.times do |k|
          t = read_int(s, w0)
          f1 = read_int(s, w1)
          f2 = read_int(s, w2)
          # Generation is recorded as 0; xref streams carry no gen for type 2
          # and this resolver keys compressed objects at gen 0.
          ref = [obj + k, 0]
          case t
          when 0 then next # free
          when 1 then @entries[ref] ||= Entry.new(type: :in_file, offset: f1)
          when 2 then @entries[ref] ||= Entry.new(type: :in_objstm, objstm_num: f1, objstm_index: f2)
          end
        end
      end
    end

    # Reads a big-endian unsigned integer of `width` bytes from the scanner.
    # A zero width returns 0 (a /W width of 0 means the field is absent).
    def read_int(scanner, width)
      # Ensure width is an integer
      w = width.is_a?(Integer) ? width : width.to_i
      return 0 if w.zero?

      bytes = scanner.peek(w)
      return 0 unless bytes && bytes.bytesize == w

      scanner.pos += w
      val = 0
      bytes.each_byte { |b| val = (val << 8) | b }
      val
    end

    # Parses a PDF array token like "[1 4 2]" into Ruby integers (non-numeric
    # tokens pass through as strings).
    # NOTE(review): method name is not snake_case; kept as-is for compatibility.
    def JSON_like_array(tok)
      inner = tok[1..-2]
      inner.split(/\s+/).map { |t| t =~ /\A\d+\z/ ? t.to_i : t }
    end

    # Extracts a stream body from `stream_chunk` (the "stream ... endstream"
    # slice), inflates FlateDecode data, and undoes a declared PNG predictor.
    def decode_stream_data(dict_src, stream_chunk)
      s_match = /\bstream\r?\n/.match(stream_chunk) or raise "stream keyword missing"
      body = stream_chunk[s_match.end(0)..]
      body = body.sub(/\bendstream\b.*/m, "")

      # Decompress if FlateDecode (handle both "/Filter /FlateDecode" and "/Filter/FlateDecode")
      data = if dict_src =~ %r{/Filter\s*/FlateDecode}
               Zlib::Inflate.inflate(body)
             else
               body
             end

      # Apply PNG predictor if present
      if dict_src =~ %r{/DecodeParms\s*<<[^>]*/Predictor\s+(\d+)}
        predictor = ::Regexp.last_match(1).to_i
        if predictor.between?(10, 15) # PNG predictors
          columns = dict_src =~ %r{/Columns\s+(\d+)} ? ::Regexp.last_match(1).to_i : 1
          data = apply_png_predictor(data, columns)
        end
      end

      data
    end

    # Reverses PNG row filters (None/Sub/Up/Average/Paeth) over `data`,
    # treating each row as 1 filter byte + `columns` data bytes.
    def apply_png_predictor(data, columns)
      # PNG predictor: each row starts with a filter byte, followed by 'columns' data bytes
      row_size = columns + 1 # 1 byte for predictor + columns bytes of data
      num_rows = data.bytesize / row_size
      result = []
      prev_row = [0] * columns

      num_rows.times do |i|
        row_start = i * row_size
        filter_type = data.getbyte(row_start)
        row_bytes = (1..columns).map { |j| data.getbyte(row_start + j) }

        decoded_row = case filter_type
                      when 0 # None
                        row_bytes
                      when 1 # Sub
                        out = []
                        columns.times do |j|
                          left = j.positive? ? out[j - 1] : 0
                          out << ((row_bytes[j] + left) & 0xFF)
                        end
                        out
                      when 2 # Up
                        row_bytes.map.with_index { |b, j| (b + prev_row[j]) & 0xFF }
                      when 3 # Average
                        out = []
                        columns.times do |j|
                          left = j.positive? ? out[j - 1] : 0
                          up = prev_row[j]
                          out << ((row_bytes[j] + ((left + up) / 2)) & 0xFF)
                        end
                        out
                      when 4 # Paeth
                        out = []
                        columns.times do |j|
                          left = j.positive? ? out[j - 1] : 0
                          up = prev_row[j]
                          up_left = j.positive? ? prev_row[j - 1] : 0
                          out << ((row_bytes[j] + paeth_predictor(left, up, up_left)) & 0xFF)
                        end
                        out
                      else
                        row_bytes # Unknown filter, pass through
                      end

        result.concat(decoded_row)
        prev_row = decoded_row
      end

      result.pack("C*")
    end

    # Standard PNG Paeth predictor: pick whichever of left/up/up-left is
    # closest to the linear estimate a + b - c.
    def paeth_predictor(a, b, c)
      # a = left, b = up, c = up-left
      p = a + b - c
      pa = (p - a).abs
      pb = (p - b).abs
      pc = (p - c).abs
      if pa <= pb && pa <= pc
        a
      elsif pb <= pc
        b
      else
        c
      end
    end

    # Returns the index just past the ">>" matching the "<<" at start_idx,
    # tracking nested dictionaries. Raises on an unterminated dictionary.
    def balanced_from(str, start_idx)
      depth = 0
      j = start_idx
      while j < str.length
        if str[j, 2] == "<<"
          depth += 1
          j += 2
        elsif str[j, 2] == ">>"
          depth -= 1
          j += 2
          return j if depth.zero?
        else
          j += 1
        end
      end
      raise "unterminated dict"
    end

    # Finds the offset recorded by the final "startxref" keyword, or nil.
    # Prefers the well-formed "startxref N %%EOF" at end-of-file; otherwise
    # falls back to the last "startxref" occurrence anywhere.
    def find_startxref(bytes)
      return nil if bytes.nil? || bytes.empty?

      if bytes =~ /startxref\s+(\d+)\s*%%EOF\s*\z/m
        return Integer(::Regexp.last_match(1))
      end

      m = bytes.rindex("startxref")
      return nil unless m

      tail = bytes[m, bytes.length - m]
      tail[/startxref\s+(\d+)/m, 1]&.to_i
    end

    # Loads and caches the parsed contents of the /ObjStm `container_ref`
    # ([num, 0]) so object_body can serve type-2 entries from it.
    def load_objstm(container_ref)
      return if @objstm_cache.key?(container_ref)

      body = object_body(container_ref)
      raise "Object stream #{container_ref.inspect} not found in xref table" unless body

      dict_start = body.index("<<") || 0
      dict_end = balanced_from(body, dict_start)
      dict_src = body[dict_start...dict_end]
      s_pos = body.index(/\bstream\r?\n/m, dict_end) or raise "objstm stream missing"
      e_pos = body.index(/\bendstream\b/m, s_pos) or raise "objstm end missing"
      data = body[s_pos..e_pos]
      raw = decode_stream_data(dict_src, data)
      n = DictScan.value_token_after("/N", dict_src).to_i
      first = DictScan.value_token_after("/First", dict_src).to_i
      parsed = CorpPdf::ObjStm.parse(raw, n: n, first: first)
      @objstm_cache[container_ref] = parsed
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
module CorpPdf
  # Helpers for PDF object streams (/ObjStm): packing loose objects into a
  # single stream body and unpacking them again.
  class ObjStm
    class << self
      # Parse an object stream body given N and First.
      #
      # bytes - the (already decoded) stream payload
      # n:    - number of objects listed in the header
      # first:- byte offset where the object data section starts
      #
      # Returns an array like [{ ref: [obj_num, 0], body: String }, ...] in
      # the order the stream header lists them.
      def parse(bytes, n:, first:)
        # Header is a flat list of "obj_num offset" pairs before `first`.
        pairs = bytes[0...first].strip.split(/\s+/).map!(&:to_i)

        (0...n).map do |idx|
          num = pairs[2 * idx]
          start = pairs[(2 * idx) + 1]
          # Each body runs up to the next object's offset; the last one runs
          # to the end of the payload.
          stop = idx + 1 < n ? pairs[(2 * (idx + 1)) + 1] : bytes.bytesize - first
          { ref: [num, 0], body: bytes[first + start, stop - start] }
        end
      end

      # Create an object stream from patches (array of {ref: [num, gen], body: String}).
      #
      # compress: - deflate the payload and advertise /Filter /FlateDecode
      #
      # Returns nil for an empty patch list; otherwise a hash with
      #   dictionary:   the /ObjStm dictionary source (binary String)
      #   stream_body:  the (possibly deflated) payload (binary String)
      #   object_count: number of objects packed
      #   patches:      the patches sorted by object number (the packing order)
      def create(patches, compress: true)
        return nil if patches.empty?

        # Pack in ascending object-number order for deterministic output.
        ordered = patches.sort_by { |p| p[:ref][0] }

        header_pairs = []
        bodies = []
        cursor = 0
        ordered.each do |patch|
          num, = patch[:ref]
          text = patch[:body].to_s
          # Terminate each body with a newline so boundaries parse cleanly.
          text += "\n" unless text.end_with?("\n")

          header_pairs << "#{num} #{cursor}"
          bodies << text
          cursor += text.bytesize
        end

        # Header: "obj_num offset obj_num offset ...\n"; offsets are relative
        # to the start of the data section (i.e. to /First).
        header = "#{header_pairs.join(' ')}\n"
        first = header.bytesize
        payload = header + bodies.join

        stream = compress ? Zlib::Deflate.deflate(payload) : payload

        dict = "<<\n/Type /ObjStm\n/N #{ordered.length}\n/First #{first}".b
        dict << "\n/Filter /FlateDecode".b if compress
        dict << "\n/Length #{stream.bytesize}\n>>".b

        {
          dictionary: dict,
          stream_body: stream.b,
          object_count: ordered.length,
          patches: ordered
        }
      end
    end
  end
end