corp_pdf 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,245 @@
1
+ # frozen_string_literal: true
2
+
3
module CorpPdf
  # Appends an incremental update containing the given patches.
  # Each patch is {ref:[num,gen], body:String}
  #
  # The update is written as a compressed object stream (/ObjStm) indexed by a
  # cross-reference stream when ObjStm.create succeeds, otherwise as plain
  # indirect objects with a classic xref table.
  class IncrementalWriter
    # original_bytes: full bytes of the existing PDF (should be binary-encoded).
    # patches: array of {ref: [num, gen], body: String}.
    def initialize(original_bytes, patches)
      @orig = original_bytes
      @patches = patches
    end

    # Returns the original bytes with the incremental update appended.
    # Returns @orig unchanged when there is nothing to patch.
    def render
      return @orig if @patches.empty?

      startxref_prev = find_startxref(@orig) or raise "startxref not found"
      max_obj = scan_max_obj_number(@orig)

      # Ensure we end with a newline before appending.
      # Avoid dup by concatenating instead of modifying in place.
      base = @orig.end_with?("\n") ? @orig + "".b : @orig + "\n".b

      # Write patches into an object stream for efficiency; fall back to
      # individual objects if ObjStm.create declines (e.g. no patches).
      objstm_data = CorpPdf::ObjStm.create(@patches, compress: true)
      if objstm_data
        render_with_objstm(base, startxref_prev, max_obj, objstm_data)
      else
        render_with_classic_xref(base, startxref_prev, max_obj)
      end
    end

    private

    # Modern path: one /ObjStm holding all patch bodies, a cross-reference
    # stream (type 1/2 entries) indexing it, and a classic trailer carrying
    # /XRefStm so table-only readers still find the entries.
    def render_with_objstm(base, startxref_prev, max_obj, objstm_data)
      # Binary buffer: deflated stream bytes are appended below, and mixing
      # them into a UTF-8 string would raise Encoding::CompatibilityError.
      buf = +"".b

      # Next free object number for the object stream itself.
      objstm_num = [max_obj + 1, @patches.map { |p| p[:ref][0] }.max.to_i + 1].max

      # Write the object stream object.
      objstm_offset = base.bytesize + buf.bytesize
      buf << "#{objstm_num} 0 obj\n".b
      buf << objstm_data[:dictionary]
      buf << "\nstream\n".b
      buf << objstm_data[:stream_body]
      buf << "\nendstream\n".b
      buf << "endobj\n".b

      sorted_patches = objstm_data[:patches]
      xrefstm_num = objstm_num + 1

      # Collect xref entries as [obj_num, gen, type, f1, f2]:
      #   type 1: f1 is the file offset (f2 unused here)
      #   type 2: f1 is the containing objstm number, f2 the index within it
      entries = [[objstm_num, 0, 1, objstm_offset, 0]]
      sorted_patches.each_with_index do |patch, index|
        num, gen = patch[:ref]
        next if num == objstm_num # Skip the object stream itself

        entries << [num, gen, 2, objstm_num, index]
      end
      entries.sort_by! { |num, gen, _type, _f1, _f2| [num, gen] }

      # Use a single contiguous /Index range [first_obj, count] for simplicity;
      # gaps inside the range become type-0 (free) records.
      obj_nums = entries.map { |num, _gen, _type, _f1, _f2| num }
      first_obj = obj_nums.min
      last_obj = obj_nums.max
      index_count = last_obj - first_obj + 1

      # /W [1 4 2]: type (1 byte), offset/f1 (4 bytes, big-endian),
      # index/f2 (2 bytes, big-endian).
      w = [1, 4, 2]
      entry_map = {}
      entries.each { |num, _gen, type, f1, f2| entry_map[num] = [type, f1, f2] }

      records = Array.new(index_count) do |k|
        type, f1, f2 = entry_map[first_obj + k] || [0, 0, 0] # type 0 = free
        [type, f1, f2].pack("C N n")
      end
      xref_compressed = Zlib::Deflate.deflate(records.join)

      # /Size is one past the highest object number; the xref stream object
      # itself is the highest we emit.
      size = [last_obj + 1, objstm_num + 1, xrefstm_num + 1].max

      # Write the xref stream object.
      xrefstm_offset = base.bytesize + buf.bytesize
      root_ref = extract_root_from_trailer(@orig)
      xrefstm_dict = "<<\n/Type /XRef\n/W [#{w.join(' ')}]\n/Size #{size}\n/Index [#{first_obj} #{index_count}]\n/Prev #{startxref_prev}\n".b
      xrefstm_dict << " /Root #{root_ref}".b if root_ref
      xrefstm_dict << "\n/Filter /FlateDecode\n/Length #{xref_compressed.bytesize}\n>>\n".b

      buf << "#{xrefstm_num} 0 obj\n".b
      buf << xrefstm_dict
      buf << "stream\n".b
      buf << xref_compressed
      buf << "\nendstream\n".b
      buf << "endobj\n".b

      # Classic trailer pointing at the xref stream via /XRefStm so the
      # xref stream itself stays reachable for legacy readers.
      # Reuse `size` here: the previous code recomputed it without the +1 for
      # the xref stream object, yielding an off-by-one /Size.
      trailer = "trailer\n<< /Size #{size} /Prev #{startxref_prev}".b
      trailer << " /Root #{root_ref}".b if root_ref
      trailer << " /XRefStm #{xrefstm_offset} >>\n".b
      trailer << "startxref\n#{xrefstm_offset}\n%%EOF\n".b

      base + buf + trailer
    end

    # Fallback path: each patch becomes a regular indirect object, indexed by
    # a classic xref table and trailer.
    def render_with_classic_xref(base, startxref_prev, max_obj)
      buf = +"".b
      offsets = []

      @patches.each do |p|
        num, gen = p[:ref]
        offsets << [num, gen, base.bytesize + buf.bytesize]

        buf << "#{num} #{gen} obj\n".b
        buf << p[:body].b
        buf << "\nendobj\n".b
      end

      # Build the xref table, grouping consecutive object numbers (with equal
      # generation) into one subsection header "first count".
      sorted = offsets.sort_by { |n, g, _| [n, g] }
      xref = +"xref\n".b

      i = 0
      while i < sorted.length
        first_num = sorted[i][0]
        run = 1
        run += 1 while (i + run) < sorted.length &&
                       sorted[i + run][0] == first_num + run &&
                       sorted[i + run][1] == sorted[i][1]
        xref << "#{first_num} #{run}\n".b
        run.times do |r|
          # 20-byte records: 10-digit offset, 5-digit generation, "n", EOL.
          xref << format("%010d %05d n \n", sorted[i + r][2], sorted[i + r][1]).b
        end
        i += run
      end

      new_size = [max_obj + 1, @patches.map { |p| p[:ref][0] }.max.to_i + 1].max
      xref_offset = base.bytesize + buf.bytesize

      root_ref = extract_root_from_trailer(@orig)
      root_entry = root_ref ? " /Root #{root_ref}" : ""
      trailer = "trailer\n<< /Size #{new_size} /Prev #{startxref_prev}#{root_entry} >>\nstartxref\n#{xref_offset}\n%%EOF\n".b

      base + buf + xref + trailer
    end

    # Returns the integer offset named by the final "startxref" keyword,
    # or nil when none can be located.
    def find_startxref(bytes)
      if bytes =~ /startxref\s+(\d+)\s*%%EOF\s*\z/m
        return Integer(::Regexp.last_match(1))
      end

      # Tolerate trailing junk after %%EOF: take the last "startxref" anywhere.
      m = bytes.rindex("startxref")
      return nil unless m

      tail = bytes[m, bytes.length - m]
      tail[/startxref\s+(\d+)/m, 1]&.to_i
    end

    # Highest object number appearing in any "N G obj" header, 0 if none.
    def scan_max_obj_number(bytes)
      max = 0
      bytes.scan(/(^|\s)(\d+)\s+(\d+)\s+obj\b/) { max = [::Regexp.last_match(2).to_i, max].max }
      max
    end

    # Returns the "/Root N G R" token from the last cross-reference section
    # (xref stream dictionary or classic trailer), or nil if not found.
    def extract_root_from_trailer(bytes)
      # For xref streams, find the last xref stream object dictionary
      startxref_match = bytes.match(/startxref\s+(\d+)\s*%%EOF\s*\z/m)
      if startxref_match
        xref_offset = startxref_match[1].to_i

        # Check if it's an xref stream (starts with object header)
        if bytes[xref_offset, 50] =~ /(\d+\s+\d+\s+obj)/
          # Find the dictionary in the xref stream object
          dict_start = bytes.index("<<", xref_offset)
          if dict_start
            trailer_section = bytes[dict_start, 500]
            if trailer_section =~ %r{/Root\s+(\d+\s+\d+\s+R)}
              return ::Regexp.last_match(1)
            end
          end
        end
      end

      # Fallback: look for classic trailer
      trailer_idx = bytes.rindex("trailer")
      if trailer_idx
        dict_start = bytes.index("<<", trailer_idx)
        if dict_start
          trailer_section = bytes[dict_start, 500]
          if trailer_section =~ %r{/Root\s+(\d+\s+\d+\s+R)}
            return ::Regexp.last_match(1)
          end
        end
      end

      nil
    end
  end
end
@@ -0,0 +1,381 @@
1
+ # frozen_string_literal: true
2
+
3
module CorpPdf
  # Parses xref (tables and streams) and exposes object bodies uniformly,
  # including objects embedded in /ObjStm. Also gives you the trailer and /Root.
  #
  # NOTE(review): relies on a DictScan helper defined elsewhere;
  # DictScan.value_token_after presumably returns the raw token following a
  # dictionary key ("/Prev", "/W", ...) — confirm against its definition.
  class ObjectResolver
    # One cross-reference entry.
    #   type         - :in_file (direct object) or :in_objstm (inside an object stream)
    #   offset       - byte offset of the object header (for :in_file)
    #   objstm_num   - containing object stream number (for :in_objstm)
    #   objstm_index - index of the object within that stream (for :in_objstm)
    Entry = Struct.new(:type, :offset, :objstm_num, :objstm_index, keyword_init: true)

    # bytes: the full PDF file contents (assumes a binary String — TODO confirm).
    def initialize(bytes)
      @bytes = bytes
      @entries = {}       # {[num, gen] => Entry}
      @objstm_cache = {}  # {[num, 0] => parsed object-stream entry list}
      parse_cross_reference
    end

    # Returns the document catalog reference from the trailer as [num, gen],
    # or nil when no /Root key is present.
    def root_ref
      tr = trailer_dict
      return nil unless tr =~ %r{/Root\s+(\d+)\s+(\d+)\s+R}

      [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
    end

    # Returns the trailer dictionary source text (memoized).
    def trailer_dict
      # Priority order:
      # 1. Explicit trailer from classic xref (incremental updates)
      # 2. Xref stream dictionary (original PDFs)
      # 3. Search for trailer (fallback)
      @trailer_dict ||= if @trailer_explicit
                          @trailer_explicit
                        elsif @last_xref_stream_dict
                          @last_xref_stream_dict
                        else
                          # Find last 'trailer << ... >>' before last startxref
                          start = find_startxref(@bytes) || 0
                          head = @bytes[0...start]
                          idx = head.rindex("trailer")
                          raise "trailer not found" unless idx

                          # naive grab following dict
                          m = head.index("<<", idx)
                          n = balanced_from(head, m)
                          head[m...n]
                        end
    end

    # Yields every known [num, gen] reference together with its body string.
    def each_object
      @entries.each_key do |ref|
        yield(ref, object_body(ref))
      end
    end

    # Clear the object stream cache to free memory
    def clear_cache
      @objstm_cache.clear
    end

    # Returns the raw body for ref ([num, gen]): the bytes between "obj" and
    # "endobj" for direct objects, or the slice carved out of the containing
    # /ObjStm for compressed objects. Returns nil for unknown refs.
    def object_body(ref)
      case (e = @entries[ref])&.type
      when :in_file
        i = e.offset
        # Find "obj" start near offset (handle any preceding whitespace)
        hdr = /\bobj\b/m.match(@bytes, i) or return nil
        after = hdr.end(0)
        # Skip optional whitespace and one line break if present
        after += 1 while (ch = @bytes.getbyte(after)) && ch <= 0x20
        j = @bytes.index(/\bendobj\b/m, after) or return nil
        @bytes[after...j]
      when :in_objstm
        load_objstm([e.objstm_num, 0])
        @objstm_cache[[e.objstm_num, 0]][e.objstm_index][:body]
      end
    end

    # --- internals -----------------------------------------------------------

    # Entry point: locate the last startxref and walk the xref chain from there.
    def parse_cross_reference
      start = find_startxref(@bytes) or raise "startxref not found"
      parse_xref_at_offset(start)
    end

    # Dispatches on what lives at `offset` — a classic "xref" table or an xref
    # stream object — and recursively follows /Prev (and /XRefStm) links so the
    # whole incremental-update chain is merged into @entries.
    def parse_xref_at_offset(offset)
      # 1) If 'xref' is literally at that offset => classic table
      if @bytes[offset, 4] == "xref"
        tr = parse_classic_xref(offset)

        # 2) Classic trailers may include /XRefStm <offset> to an xref stream with compressed entries
        xrefstm_tok = DictScan.value_token_after("/XRefStm", tr) if tr
        if xrefstm_tok && (ofs = xrefstm_tok.to_i).positive?
          parse_xref_stream_at(ofs) # merge entries from xref stream (type 0/1/2)
        end

        # 3) Follow /Prev pointer if present
        prev_tok = DictScan.value_token_after("/Prev", tr) if tr
        if prev_tok && (prev_ofs = prev_tok.to_i).positive?
          parse_xref_at_offset(prev_ofs)
        end
      else
        # Direct xref stream case (offset points to the xref stream obj header)
        dict_src = parse_xref_stream_at(offset)

        # Follow /Prev in the xref stream's dictionary
        if dict_src
          prev_tok = DictScan.value_token_after("/Prev", dict_src)
          if prev_tok && (prev_ofs = prev_tok.to_i).positive?
            parse_xref_at_offset(prev_ofs)
          end
        end
      end
    end

    # Parses a classic xref table starting at `start`, recording every "n"
    # (in-use) record. Returns the trailer dictionary source, or nil when the
    # section has no trailer (possible mid-chain).
    def parse_classic_xref(start)
      pos = @bytes.rindex("xref", start) or raise "xref not found"
      i = pos + 4

      loop do
        # Subsection header: "first count"
        m = /\s*(\d+)\s+(\d+)/m.match(@bytes, i) or break
        first = m[1].to_i
        count = m[2].to_i
        i = m.end(0)

        count.times do |k|
          # Skip whitespace/newlines before reading the 20-byte record
          i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D, 0x20].include?(ch)

          rec = @bytes[i, 20]
          raise "bad xref record" unless rec && rec.bytesize == 20

          off = rec[0, 10].to_i
          gen = rec[11, 5].to_i
          typ = rec[17, 1]
          i += 20
          # consume line ending(s)
          i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D].include?(ch)

          ref = [first + k, gen]
          # ||= keeps the first (newest) definition: newer updates are parsed
          # before their /Prev predecessors.
          @entries[ref] ||= Entry.new(type: :in_file, offset: off) if typ == "n"
          # (ignore 'f' free entries)
        end

        break if @bytes[i, 7] == "trailer"
      end

      tpos = @bytes.index("trailer", i)
      if tpos
        dpos = @bytes.index("<<", tpos)
        if dpos
          dend = balanced_from(@bytes, dpos)
          # An explicit classic trailer outranks any cached xref-stream dict
          # (see trailer_dict priority), so drop the cached one.
          @last_xref_stream_dict = nil
          @trailer_explicit = @bytes[dpos...dend]
          return @trailer_explicit
        end
      end

      # No trailer found (might be at an intermediate xref in the chain)
      nil
    end

    # Parses the xref stream object whose header sits at (or just after)
    # `header_ofs`, merging its records into @entries. Returns the stream's
    # dictionary source for /Prev chasing, or nil when /W is missing.
    def parse_xref_stream_at(header_ofs)
      # Expect "<num> <gen> obj" at header_ofs
      m = /\A(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 50])
      unless m
        # Sometimes header_ofs might land on whitespace; search forward a bit
        win = @bytes[header_ofs, 256]
        m2 = /(\d+)\s+(\d+)\s+obj\b/m.match(win) or raise "xref stream header not found"
        header_ofs += m2.begin(0)
        m = m2
      end
      obj_ref = [m[1].to_i, m[2].to_i]

      dpos = @bytes.index("<<", header_ofs + m[0].length) or raise "xref stream dict missing"
      dend = balanced_from(@bytes, dpos)
      dict_src = @bytes[dpos...dend]
      @last_xref_stream_dict ||= dict_src # Keep first one for trailer_dict

      spos = @bytes.index(/\bstream\r?\n/m, dend) or raise "xref stream body missing"
      epos = @bytes.index(/\bendstream\b/m, spos) or raise "xref stream end missing"
      data = @bytes[spos..epos]
      raw = decode_stream_data(dict_src, data)

      # W is mandatory in xref streams; if missing, bail (don't crash)
      w_tok = DictScan.value_token_after("/W", dict_src)
      return nil unless w_tok

      w = JSON_like_array(w_tok)
      idx_tok = DictScan.value_token_after("/Index", dict_src)
      # Default /Index is [0 Size] per the PDF spec.
      index = idx_tok ? JSON_like_array(idx_tok) : [0, DictScan.value_token_after("/Size", dict_src).to_i]

      parse_xref_stream_records(raw, w, index)

      # Ensure the xref stream object itself is registered (type 1 entry usually exists,
      # but if not, add it so object_body can find the stream if needed)
      unless @entries.key?(obj_ref)
        # Approximate offset at header_ofs
        @entries[obj_ref] = Entry.new(type: :in_file, offset: header_ofs)
      end

      dict_src # Return dict for /Prev checking
    end

    # Decodes the fixed-width binary records of an xref stream according to the
    # /W field widths and the /Index (start, count) pairs, merging into @entries.
    def parse_xref_stream_records(raw, w, index)
      w0, w1, w2 = w
      s = StringScanner.new(raw)
      (0...(index.length / 2)).each do |i|
        obj = index[2 * i].to_i
        count = index[(2 * i) + 1].to_i
        count.times do |k|
          t = read_int(s, w0)
          f1 = read_int(s, w1)
          f2 = read_int(s, w2)
          # Generation is recorded as 0; xref streams carry no gen for type 2
          # and this resolver keys compressed objects at gen 0.
          ref = [obj + k, 0]
          case t
          when 0 then next # free
          when 1 then @entries[ref] ||= Entry.new(type: :in_file, offset: f1)
          when 2 then @entries[ref] ||= Entry.new(type: :in_objstm, objstm_num: f1, objstm_index: f2)
          end
        end
      end
    end

    # Reads a big-endian unsigned integer of `width` bytes from the scanner.
    # A zero width returns 0 (a /W width of 0 means the field is absent).
    def read_int(scanner, width)
      # Ensure width is an integer
      w = width.is_a?(Integer) ? width : width.to_i
      return 0 if w.zero?

      bytes = scanner.peek(w)
      return 0 unless bytes && bytes.bytesize == w

      scanner.pos += w
      val = 0
      bytes.each_byte { |b| val = (val << 8) | b }
      val
    end

    # Parses a PDF array token like "[1 4 2]" into Ruby integers (non-numeric
    # tokens pass through as strings).
    # NOTE(review): method name is not snake_case; kept as-is for compatibility.
    def JSON_like_array(tok)
      inner = tok[1..-2]
      inner.split(/\s+/).map { |t| t =~ /\A\d+\z/ ? t.to_i : t }
    end

    # Extracts a stream body from `stream_chunk` (the "stream ... endstream"
    # slice), inflates FlateDecode data, and undoes a declared PNG predictor.
    def decode_stream_data(dict_src, stream_chunk)
      s_match = /\bstream\r?\n/.match(stream_chunk) or raise "stream keyword missing"
      body = stream_chunk[s_match.end(0)..]
      body = body.sub(/\bendstream\b.*/m, "")

      # Decompress if FlateDecode (handle both "/Filter /FlateDecode" and "/Filter/FlateDecode")
      data = if dict_src =~ %r{/Filter\s*/FlateDecode}
               Zlib::Inflate.inflate(body)
             else
               body
             end

      # Apply PNG predictor if present
      if dict_src =~ %r{/DecodeParms\s*<<[^>]*/Predictor\s+(\d+)}
        predictor = ::Regexp.last_match(1).to_i
        if predictor.between?(10, 15) # PNG predictors
          columns = dict_src =~ %r{/Columns\s+(\d+)} ? ::Regexp.last_match(1).to_i : 1
          data = apply_png_predictor(data, columns)
        end
      end

      data
    end

    # Reverses PNG row filters (None/Sub/Up/Average/Paeth) over `data`,
    # treating each row as 1 filter byte + `columns` data bytes.
    def apply_png_predictor(data, columns)
      # PNG predictor: each row starts with a filter byte, followed by 'columns' data bytes
      row_size = columns + 1 # 1 byte for predictor + columns bytes of data
      num_rows = data.bytesize / row_size
      result = []
      prev_row = [0] * columns

      num_rows.times do |i|
        row_start = i * row_size
        filter_type = data.getbyte(row_start)
        row_bytes = (1..columns).map { |j| data.getbyte(row_start + j) }

        decoded_row = case filter_type
                      when 0 # None
                        row_bytes
                      when 1 # Sub
                        out = []
                        columns.times do |j|
                          left = j.positive? ? out[j - 1] : 0
                          out << ((row_bytes[j] + left) & 0xFF)
                        end
                        out
                      when 2 # Up
                        row_bytes.map.with_index { |b, j| (b + prev_row[j]) & 0xFF }
                      when 3 # Average
                        out = []
                        columns.times do |j|
                          left = j.positive? ? out[j - 1] : 0
                          up = prev_row[j]
                          out << ((row_bytes[j] + ((left + up) / 2)) & 0xFF)
                        end
                        out
                      when 4 # Paeth
                        out = []
                        columns.times do |j|
                          left = j.positive? ? out[j - 1] : 0
                          up = prev_row[j]
                          up_left = j.positive? ? prev_row[j - 1] : 0
                          out << ((row_bytes[j] + paeth_predictor(left, up, up_left)) & 0xFF)
                        end
                        out
                      else
                        row_bytes # Unknown filter, pass through
                      end

        result.concat(decoded_row)
        prev_row = decoded_row
      end

      result.pack("C*")
    end

    # Standard PNG Paeth predictor: pick whichever of left/up/up-left is
    # closest to the linear estimate a + b - c.
    def paeth_predictor(a, b, c)
      # a = left, b = up, c = up-left
      p = a + b - c
      pa = (p - a).abs
      pb = (p - b).abs
      pc = (p - c).abs
      if pa <= pb && pa <= pc
        a
      elsif pb <= pc
        b
      else
        c
      end
    end

    # Returns the index just past the ">>" matching the "<<" at start_idx,
    # tracking nested dictionaries. Raises on an unterminated dictionary.
    def balanced_from(str, start_idx)
      depth = 0
      j = start_idx
      while j < str.length
        if str[j, 2] == "<<"
          depth += 1
          j += 2
        elsif str[j, 2] == ">>"
          depth -= 1
          j += 2
          return j if depth.zero?
        else
          j += 1
        end
      end
      raise "unterminated dict"
    end

    # Finds the offset recorded by the final "startxref" keyword, or nil.
    # Prefers the well-formed "startxref N %%EOF" at end-of-file; otherwise
    # falls back to the last "startxref" occurrence anywhere.
    def find_startxref(bytes)
      return nil if bytes.nil? || bytes.empty?

      if bytes =~ /startxref\s+(\d+)\s*%%EOF\s*\z/m
        return Integer(::Regexp.last_match(1))
      end

      m = bytes.rindex("startxref")
      return nil unless m

      tail = bytes[m, bytes.length - m]
      tail[/startxref\s+(\d+)/m, 1]&.to_i
    end

    # Loads and caches the parsed contents of the /ObjStm `container_ref`
    # ([num, 0]) so object_body can serve type-2 entries from it.
    def load_objstm(container_ref)
      return if @objstm_cache.key?(container_ref)

      body = object_body(container_ref)
      raise "Object stream #{container_ref.inspect} not found in xref table" unless body

      dict_start = body.index("<<") || 0
      dict_end = balanced_from(body, dict_start)
      dict_src = body[dict_start...dict_end]
      s_pos = body.index(/\bstream\r?\n/m, dict_end) or raise "objstm stream missing"
      e_pos = body.index(/\bendstream\b/m, s_pos) or raise "objstm end missing"
      data = body[s_pos..e_pos]
      raw = decode_stream_data(dict_src, data)
      n = DictScan.value_token_after("/N", dict_src).to_i
      first = DictScan.value_token_after("/First", dict_src).to_i
      parsed = CorpPdf::ObjStm.parse(raw, n: n, first: first)
      @objstm_cache[container_ref] = parsed
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
module CorpPdf
  # Helpers for PDF object streams (/ObjStm): packing loose objects into a
  # single stream body and unpacking them again.
  class ObjStm
    class << self
      # Parse an object stream body given N and First.
      #
      # bytes - the (already decoded) stream payload
      # n:    - number of objects listed in the header
      # first:- byte offset where the object data section starts
      #
      # Returns an array like [{ ref: [obj_num, 0], body: String }, ...] in
      # the order the stream header lists them.
      def parse(bytes, n:, first:)
        # Header is a flat list of "obj_num offset" pairs before `first`.
        pairs = bytes[0...first].strip.split(/\s+/).map!(&:to_i)

        (0...n).map do |idx|
          num = pairs[2 * idx]
          start = pairs[(2 * idx) + 1]
          # Each body runs up to the next object's offset; the last one runs
          # to the end of the payload.
          stop = idx + 1 < n ? pairs[(2 * (idx + 1)) + 1] : bytes.bytesize - first
          { ref: [num, 0], body: bytes[first + start, stop - start] }
        end
      end

      # Create an object stream from patches (array of {ref: [num, gen], body: String}).
      #
      # compress: - deflate the payload and advertise /Filter /FlateDecode
      #
      # Returns nil for an empty patch list; otherwise a hash with
      #   dictionary:   the /ObjStm dictionary source (binary String)
      #   stream_body:  the (possibly deflated) payload (binary String)
      #   object_count: number of objects packed
      #   patches:      the patches sorted by object number (the packing order)
      def create(patches, compress: true)
        return nil if patches.empty?

        # Pack in ascending object-number order for deterministic output.
        ordered = patches.sort_by { |p| p[:ref][0] }

        header_pairs = []
        bodies = []
        cursor = 0
        ordered.each do |patch|
          num, = patch[:ref]
          text = patch[:body].to_s
          # Terminate each body with a newline so boundaries parse cleanly.
          text += "\n" unless text.end_with?("\n")

          header_pairs << "#{num} #{cursor}"
          bodies << text
          cursor += text.bytesize
        end

        # Header: "obj_num offset obj_num offset ...\n"; offsets are relative
        # to the start of the data section (i.e. to /First).
        header = "#{header_pairs.join(' ')}\n"
        first = header.bytesize
        payload = header + bodies.join

        stream = compress ? Zlib::Deflate.deflate(payload) : payload

        dict = "<<\n/Type /ObjStm\n/N #{ordered.length}\n/First #{first}".b
        dict << "\n/Filter /FlateDecode".b if compress
        dict << "\n/Length #{stream.bytesize}\n>>".b

        {
          dictionary: dict,
          stream_body: stream.b,
          object_count: ordered.length,
          patches: ordered
        }
      end
    end
  end
end