acro_that 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,244 @@
+ # frozen_string_literal: true
+
+ require "zlib"
+
+ module AcroThat
+   # Appends an incremental update containing the given patches.
+   # Each patch is { ref: [num, gen], body: String }.
+   class IncrementalWriter
+     def initialize(original_bytes, patches)
+       @orig = original_bytes
+       @patches = patches
+     end
+
+     def render
+       return @orig if @patches.empty?
+
+       startxref_prev = find_startxref(@orig) or raise "startxref not found"
+       max_obj = scan_max_obj_number(@orig)
+
+       # Ensure the original ends with a newline before appending
+       original_with_newline = @orig.dup
+       original_with_newline << "\n" unless @orig.end_with?("\n")
+
+       buf = +""
+       offsets = []
+
+       # Write the patches into an object stream for efficiency
+       objstm_data = AcroThat::ObjStm.create(@patches, compress: true)
+       if objstm_data
+         # Pick the next free object number for the object stream itself
+         objstm_num = [max_obj + 1, @patches.map { |p| p[:ref][0] }.max.to_i + 1].max
+
+         # Write the object stream object
+         objstm_offset = original_with_newline.bytesize + buf.bytesize
+         offsets << [objstm_num, 0, objstm_offset]
+
+         buf << "#{objstm_num} 0 obj\n".b
+         buf << objstm_data[:dictionary]
+         buf << "\nstream\n".b
+         buf << objstm_data[:stream_body]
+         buf << "\nendstream\n".b
+         buf << "endobj\n".b
+
+         # Build an xref stream (supports type 2 entries for objects in object streams)
+         sorted_patches = objstm_data[:patches]
+         xrefstm_num = objstm_num + 1
+
+         # Collect all entries: the object stream itself (type 1) + patches (type 2).
+         # Format: [obj_num, gen, type, f1, f2]
+         # For type 1: f1 is the byte offset, f2 is the generation (unused here)
+         # For type 2: f1 is objstm_num, f2 is the index within the stream
+         entries = []
+         # The object stream itself - type 1 entry
+         entries << [objstm_num, 0, 1, objstm_offset, 0]
+         # Patches inside the object stream - type 2 entries
+         sorted_patches.each_with_index do |patch, index|
+           num, gen = patch[:ref]
+           next if num == objstm_num # Skip the object stream itself
+
+           entries << [num, gen, 2, objstm_num, index]
+         end
+
+         # Sort entries by object number
+         entries.sort_by! { |num, gen, _type, _f1, _f2| [num, gen] }
+
+         # Build the Index array - a single range for simplicity.
+         # Index format: [first_obj, count]
+         obj_nums = entries.map { |num, _gen, _type, _f1, _f2| num }
+         min_obj = obj_nums.min
+         max_obj = obj_nums.max
+
+         # The Index covers consecutive entries from min_obj to max_obj,
+         # so the count is (max_obj - min_obj + 1)
+         index_count = max_obj - min_obj + 1
+         index_array = [min_obj, index_count]
+
+         # Build the xref stream data in the proper order.
+         # W = [1, 4, 2] means: type (1 byte), offset/f1 (4 bytes), index/f2 (2 bytes)
+         w = [1, 4, 2]
+
+         # Create a map for quick lookup by object number
+         entry_map = {}
+         entries.each { |num, _gen, type, f1, f2| entry_map[num] = [type, f1, f2] }
+
+         # Emit xref records in the order dictated by the Index range:
+         # entries start at min_obj and there are index_count of them
+         xref_records = []
+         index_count.times do |k|
+           obj_num = min_obj + k
+           if entry_map[obj_num]
+             type, f1, f2 = entry_map[obj_num]
+             xref_records << [type, f1, f2].pack("C N n")
+           else
+             # Type 0 (free) entry for objects missing from the range
+             xref_records << [0, 0, 0].pack("C N n")
+           end
+         end
+
+         xref_bytes = xref_records.join
+
+         # Compress the xref stream
+         xref_compressed = Zlib::Deflate.deflate(xref_bytes)
+
+         # Size is the highest object number + 1 (must cover xrefstm_num itself)
+         size = [max_obj + 1, objstm_num + 1, xrefstm_num + 1].max
+
+         # Write the xref stream object
+         xrefstm_offset = original_with_newline.bytesize + buf.bytesize
+         root_ref = extract_root_from_trailer(@orig)
+         xrefstm_dict = "<<\n/Type /XRef\n/W [#{w.join(' ')}]\n/Size #{size}\n/Index [#{index_array.join(' ')}]\n/Prev #{startxref_prev}".b
+         xrefstm_dict << "\n/Root #{root_ref}".b if root_ref
+         xrefstm_dict << "\n/Filter /FlateDecode\n/Length #{xref_compressed.bytesize}\n>>\n".b
+
+         buf << "#{xrefstm_num} 0 obj\n".b
+         buf << xrefstm_dict
+         buf << "stream\n".b
+         buf << xref_compressed
+         buf << "\nendstream\n".b
+         buf << "endobj\n".b
+
+         # Build the trailer. The xref stream itself must stay reachable, so we
+         # append a classic trailer whose /XRefStm and startxref point at it.
+         new_size = [max_obj + 1, xrefstm_num + 1, @patches.map { |p| p[:ref][0] }.max.to_i + 1].max
+         xref_offset = xrefstm_offset
+
+         trailer = "trailer\n<< /Size #{new_size} /Prev #{startxref_prev}".b
+         trailer << " /Root #{root_ref}".b if root_ref
+         trailer << " /XRefStm #{xrefstm_offset} >>\n".b
+         trailer << "startxref\n#{xref_offset}\n%%EOF\n".b
+
+         result = original_with_newline + buf + trailer
+       else
+         # Fall back to individual objects if ObjStm.create fails
+         @patches.each do |p|
+           num, gen = p[:ref]
+           offset = original_with_newline.bytesize + buf.bytesize
+           offsets << [num, gen, offset]
+
+           # Write the object with proper delimiters
+           buf << "#{num} #{gen} obj\n".b
+           buf << p[:body]
+           buf << "\nendobj\n".b
+         end
+
+         # Build a classic xref table
+         sorted = offsets.sort_by { |n, g, _| [n, g] }
+         xref = +"xref\n"
+
+         i = 0
+         while i < sorted.length
+           first_num = sorted[i][0]
+           run = 1
+           while (i + run) < sorted.length && sorted[i + run][0] == first_num + run && sorted[i + run][1] == sorted[i][1]
+             run += 1
+           end
+           xref << "#{first_num} #{run}\n"
+           run.times do |r|
+             abs = sorted[i + r][2]
+             gen = sorted[i + r][1]
+             xref << format("%010d %05d n \n", abs, gen)
+           end
+           i += run
+         end
+
+         # Sanity check: the table must contain at least one subsection
+         raise "xref table is empty! Offsets: #{offsets.inspect}" if xref == "xref\n"
+
+         # Build the trailer with a /Root reference
+         new_size = [max_obj + 1, @patches.map { |p| p[:ref][0] }.max.to_i + 1].max
+         xref_offset = original_with_newline.bytesize + buf.bytesize
+
+         # Extract /Root from the original trailer
+         root_ref = extract_root_from_trailer(@orig)
+         root_entry = root_ref ? " /Root #{root_ref}" : ""
+
+         trailer = "trailer\n<< /Size #{new_size} /Prev #{startxref_prev}#{root_entry} >>\nstartxref\n#{xref_offset}\n%%EOF\n"
+
+         result = original_with_newline + buf + xref + trailer
+       end
+       result
+     end
+
+     private
+
+     def find_startxref(bytes)
+       if bytes =~ /startxref\s+(\d+)\s*%%EOF\s*\z/m
+         return Integer(::Regexp.last_match(1))
+       end
+
+       m = bytes.rindex("startxref")
+       return nil unless m
+
+       tail = bytes[m..]
+       tail[/startxref\s+(\d+)/m, 1]&.to_i
+     end
+
+     def scan_max_obj_number(bytes)
+       max = 0
+       bytes.scan(/(^|\s)(\d+)\s+(\d+)\s+obj\b/) { max = [::Regexp.last_match(2).to_i, max].max }
+       max
+     end
+
+     def extract_root_from_trailer(bytes)
+       # For xref streams, inspect the dictionary of the last xref stream object
+       startxref_match = bytes.match(/startxref\s+(\d+)\s*%%EOF\s*\z/m)
+       if startxref_match
+         xref_offset = startxref_match[1].to_i
+
+         # Check whether it is an xref stream (starts with an object header)
+         if bytes[xref_offset, 50] =~ /(\d+\s+\d+\s+obj)/
+           # Find the dictionary in the xref stream object
+           dict_start = bytes.index("<<", xref_offset)
+           if dict_start
+             trailer_section = bytes[dict_start, 500]
+             if trailer_section =~ %r{/Root\s+(\d+\s+\d+\s+R)}
+               return ::Regexp.last_match(1)
+             end
+           end
+         end
+       end
+
+       # Fallback: look for a classic trailer
+       trailer_idx = bytes.rindex("trailer")
+       if trailer_idx
+         dict_start = bytes.index("<<", trailer_idx)
+         if dict_start
+           trailer_section = bytes[dict_start, 500]
+           if trailer_section =~ %r{/Root\s+(\d+\s+\d+\s+R)}
+             return ::Regexp.last_match(1)
+           end
+         end
+       end
+
+       nil
+     end
+   end
+ end
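
For orientation: each binary record in the cross-reference stream above is fixed-width, as declared by /W [1 4 2] - a 1-byte entry type, a 4-byte big-endian first field, and a 2-byte big-endian second field, which is exactly what pack("C N n") produces. A minimal standalone sketch of the round-trip (the sample numbers are arbitrary, not taken from the gem):

    # Type 2 entry: the object lives at index 0 of object stream 12
    # (the object's own number is implied by its position in /Index)
    record = [2, 12, 0].pack("C N n")      # 7 bytes: "\x02\x00\x00\x00\x0C\x00\x00"
    type, f1, f2 = record.unpack("C N n")  # => [2, 12, 0]

The writer then appends a classic-style trailer that repeats /Root, /Prev, and /XRefStm, with startxref pointing at the xref stream object itself, so the new revision stays discoverable from the end of the file.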
@@ -0,0 +1,376 @@
+ # frozen_string_literal: true
+
+ require "strscan"
+ require "zlib"
+
+ module AcroThat
+   # Parses xref (tables and streams) and exposes object bodies uniformly,
+   # including objects embedded in /ObjStm. Also gives you the trailer and /Root.
+   class ObjectResolver
+     Entry = Struct.new(:type, :offset, :objstm_num, :objstm_index, keyword_init: true)
+
+     def initialize(bytes)
+       @bytes = bytes
+       @entries = {}
+       @objstm_cache = {}
+       parse_cross_reference
+     end
+
+     def root_ref
+       tr = trailer_dict
+       return nil unless tr =~ %r{/Root\s+(\d+)\s+(\d+)\s+R}
+
+       [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+     end
+
+     def trailer_dict
+       # Priority order:
+       # 1. Explicit trailer from a classic xref (incremental updates)
+       # 2. Xref stream dictionary (original PDFs)
+       # 3. Search for a trailer keyword (fallback)
+       @trailer_dict ||= if @trailer_explicit
+                           @trailer_explicit
+                         elsif @last_xref_stream_dict
+                           @last_xref_stream_dict
+                         else
+                           # Find the last 'trailer << ... >>' before the last startxref
+                           start = find_startxref(@bytes) || 0
+                           head = @bytes[0...start]
+                           idx = head.rindex("trailer")
+                           raise "trailer not found" unless idx
+
+                           # naive grab of the dict that follows
+                           m = head.index("<<", idx)
+                           n = balanced_from(head, m)
+                           head[m...n]
+                         end
+     end
+
+     def each_object
+       return enum_for(:each_object) unless block_given?
+
+       @entries.each_key do |ref|
+         yield(ref, object_body(ref))
+       end
+     end
+
+     def object_body(ref)
+       case (e = @entries[ref])&.type
+       when :in_file
+         i = e.offset
+         # Find the "obj" keyword at/after the offset (skipping the "num gen" header)
+         hdr = /\bobj\b/m.match(@bytes, i) or return nil
+         after = hdr.end(0)
+         # Skip any whitespace between the obj keyword and the body
+         after += 1 while (ch = @bytes.getbyte(after)) && ch <= 0x20
+         j = @bytes.index(/\bendobj\b/m, after) or return nil
+         @bytes[after...j]
+       when :in_objstm
+         load_objstm([e.objstm_num, 0])
+         @objstm_cache[[e.objstm_num, 0]][e.objstm_index][:body]
+       end
+     end
+
+     # --- internals -----------------------------------------------------------
+
+     def parse_cross_reference
+       start = find_startxref(@bytes) or raise "startxref not found"
+       parse_xref_at_offset(start)
+     end
+
+     def parse_xref_at_offset(offset)
+       # 1) If 'xref' is literally at that offset => classic table
+       if @bytes[offset, 4] == "xref"
+         tr = parse_classic_xref(offset)
+
+         # 2) Classic trailers may carry /XRefStm <offset>, pointing to an xref
+         #    stream that holds the compressed entries
+         xrefstm_tok = DictScan.value_token_after("/XRefStm", tr) if tr
+         if xrefstm_tok && (ofs = xrefstm_tok.to_i).positive?
+           parse_xref_stream_at(ofs) # merge entries from the xref stream (types 0/1/2)
+         end
+
+         # 3) Follow the /Prev pointer if present
+         prev_tok = DictScan.value_token_after("/Prev", tr) if tr
+         if prev_tok && (prev_ofs = prev_tok.to_i).positive?
+           parse_xref_at_offset(prev_ofs)
+         end
+       else
+         # Direct xref stream case (offset points at the xref stream object header)
+         dict_src = parse_xref_stream_at(offset)
+
+         # Follow /Prev in the xref stream's dictionary
+         if dict_src
+           prev_tok = DictScan.value_token_after("/Prev", dict_src)
+           if prev_tok && (prev_ofs = prev_tok.to_i).positive?
+             parse_xref_at_offset(prev_ofs)
+           end
+         end
+       end
+     end
+
+     def parse_classic_xref(start)
+       pos = @bytes.rindex("xref", start) or raise "xref not found"
+       i = pos + 4
+
+       loop do
+         # \G anchors the match at i, so we cannot skip past "trailer" and
+         # misread digits inside the trailer dictionary as a subsection header
+         m = /\G\s*(\d+)\s+(\d+)/m.match(@bytes, i) or break
+         first = m[1].to_i
+         count = m[2].to_i
+         i = m.end(0)
+
+         count.times do |k|
+           # Skip whitespace/newlines before reading the 20-byte record
+           i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D, 0x20].include?(ch)
+
+           rec = @bytes[i, 20]
+           raise "bad xref record" unless rec && rec.bytesize == 20
+
+           off = rec[0, 10].to_i
+           gen = rec[11, 5].to_i
+           typ = rec[17, 1]
+           i += 20
+           # consume line ending(s)
+           i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D].include?(ch)
+
+           ref = [first + k, gen]
+           @entries[ref] ||= Entry.new(type: :in_file, offset: off) if typ == "n"
+           # (ignore 'f' free entries)
+         end
+
+         break if @bytes[i, 7] == "trailer"
+       end
+
+       tpos = @bytes.index("trailer", i)
+       if tpos
+         dpos = @bytes.index("<<", tpos)
+         if dpos
+           dend = balanced_from(@bytes, dpos)
+           dict = @bytes[dpos...dend]
+           # ||= keeps the newest trailer; /Prev recursion visits older sections later
+           @trailer_explicit ||= dict
+           return dict
+         end
+       end
+
+       # No trailer found (this may be an intermediate xref section in the chain)
+       nil
+     end
+
+     def parse_xref_stream_at(header_ofs)
+       # Expect "<num> <gen> obj" at header_ofs
+       m = /\A(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 50])
+       unless m
+         # header_ofs may land on whitespace; search forward a little
+         win = @bytes[header_ofs, 256]
+         m2 = /(\d+)\s+(\d+)\s+obj\b/m.match(win) or raise "xref stream header not found"
+         header_ofs += m2.begin(0)
+         m = m2
+       end
+       obj_ref = [m[1].to_i, m[2].to_i]
+
+       dpos = @bytes.index("<<", header_ofs + m[0].length) or raise "xref stream dict missing"
+       dend = balanced_from(@bytes, dpos)
+       dict_src = @bytes[dpos...dend]
+       @last_xref_stream_dict ||= dict_src # keep the first (newest) one for trailer_dict
+
+       spos = @bytes.index(/\bstream\r?\n/m, dend) or raise "xref stream body missing"
+       epos = @bytes.index(/\bendstream\b/m, spos) or raise "xref stream end missing"
+       data = @bytes[spos...epos]
+       raw = decode_stream_data(dict_src, data)
+
+       # W is mandatory in xref streams; if it is missing, bail out (don't crash)
+       w_tok = DictScan.value_token_after("/W", dict_src)
+       return nil unless w_tok
+
+       w = parse_int_array(w_tok)
+       idx_tok = DictScan.value_token_after("/Index", dict_src)
+       index = idx_tok ? parse_int_array(idx_tok) : [0, DictScan.value_token_after("/Size", dict_src).to_i]
+
+       parse_xref_stream_records(raw, w, index)
+
+       # Ensure the xref stream object itself is registered (a type 1 entry usually
+       # exists, but if not, add one so object_body can still find the stream)
+       unless @entries.key?(obj_ref)
+         # Approximate offset at header_ofs
+         @entries[obj_ref] = Entry.new(type: :in_file, offset: header_ofs)
+       end
+
+       dict_src # return the dict so the caller can follow /Prev
+     end
+
+     def parse_xref_stream_records(raw, w, index)
+       w0, w1, w2 = w
+       s = StringScanner.new(raw)
+       (0...(index.length / 2)).each do |i|
+         obj = index[2 * i].to_i
+         count = index[(2 * i) + 1].to_i
+         count.times do |k|
+           t = read_int(s, w0)
+           f1 = read_int(s, w1)
+           f2 = read_int(s, w2)
+           ref = [obj + k, 0]
+           case t
+           when 0 then next # free entry
+           when 1 then @entries[ref] ||= Entry.new(type: :in_file, offset: f1)
+           when 2 then @entries[ref] ||= Entry.new(type: :in_objstm, objstm_num: f1, objstm_index: f2)
+           end
+         end
+       end
+     end
+
+     def read_int(scanner, width)
+       # Ensure the width is an Integer
+       w = width.is_a?(Integer) ? width : width.to_i
+       return 0 if w.zero?
+
+       bytes = scanner.peek(w)
+       return 0 unless bytes && bytes.bytesize == w
+
+       scanner.pos += w
+       val = 0
+       bytes.each_byte { |b| val = (val << 8) | b }
+       val
+     end
+
+     def parse_int_array(tok)
+       # Turn a PDF array token such as "[1 4 2]" into [1, 4, 2]
+       inner = tok[1..-2]
+       inner.strip.split(/\s+/).map { |t| t =~ /\A\d+\z/ ? t.to_i : t }
+     end
+
+     def decode_stream_data(dict_src, stream_chunk)
+       s_match = /\bstream\r?\n/.match(stream_chunk) or raise "stream keyword missing"
+       body = stream_chunk[s_match.end(0)..]
+       body = body.sub(/\bendstream\b.*/m, "")
+       # The EOL that precedes endstream is a delimiter, not stream data
+       body = body.sub(/(\r\n|\r|\n)\z/, "")
+
+       # Decompress if FlateDecode (handles both "/Filter /FlateDecode" and "/Filter/FlateDecode")
+       data = if dict_src =~ %r{/Filter\s*/FlateDecode}
+                Zlib::Inflate.inflate(body)
+              else
+                body
+              end
+
+       # Apply the PNG predictor if present
+       if dict_src =~ %r{/DecodeParms\s*<<[^>]*/Predictor\s+(\d+)}
+         predictor = ::Regexp.last_match(1).to_i
+         if predictor.between?(10, 15) # PNG predictors
+           columns = dict_src =~ %r{/Columns\s+(\d+)} ? ::Regexp.last_match(1).to_i : 1
+           data = apply_png_predictor(data, columns)
+         end
+       end
+
+       data
+     end
+
+     def apply_png_predictor(data, columns)
+       # PNG predictor: each row starts with a filter byte, followed by 'columns' data bytes
+       row_size = columns + 1 # 1 filter byte + columns bytes of data
+       num_rows = data.bytesize / row_size
+       result = []
+       prev_row = [0] * columns
+
+       num_rows.times do |i|
+         row_start = i * row_size
+         filter_type = data.getbyte(row_start)
+         row_bytes = (1..columns).map { |j| data.getbyte(row_start + j) }
+
+         decoded_row = case filter_type
+                       when 0 # None
+                         row_bytes
+                       when 1 # Sub
+                         out = []
+                         columns.times do |j|
+                           left = j.positive? ? out[j - 1] : 0
+                           out << ((row_bytes[j] + left) & 0xFF)
+                         end
+                         out
+                       when 2 # Up
+                         row_bytes.map.with_index { |b, j| (b + prev_row[j]) & 0xFF }
+                       when 3 # Average
+                         out = []
+                         columns.times do |j|
+                           left = j.positive? ? out[j - 1] : 0
+                           up = prev_row[j]
+                           out << ((row_bytes[j] + ((left + up) / 2)) & 0xFF)
+                         end
+                         out
+                       when 4 # Paeth
+                         out = []
+                         columns.times do |j|
+                           left = j.positive? ? out[j - 1] : 0
+                           up = prev_row[j]
+                           up_left = j.positive? ? prev_row[j - 1] : 0
+                           out << ((row_bytes[j] + paeth_predictor(left, up, up_left)) & 0xFF)
+                         end
+                         out
+                       else
+                         row_bytes # Unknown filter, pass through
+                       end
+
+         result.concat(decoded_row)
+         prev_row = decoded_row
+       end
+
+       result.pack("C*")
+     end
+
+     def paeth_predictor(a, b, c)
+       # a = left, b = up, c = up-left
+       p = a + b - c
+       pa = (p - a).abs
+       pb = (p - b).abs
+       pc = (p - c).abs
+       if pa <= pb && pa <= pc
+         a
+       elsif pb <= pc
+         b
+       else
+         c
+       end
+     end
+
+     def balanced_from(str, start_idx)
+       depth = 0
+       j = start_idx
+       while j < str.length
+         if str[j, 2] == "<<"
+           depth += 1
+           j += 2
+         elsif str[j, 2] == ">>"
+           depth -= 1
+           j += 2
+           return j if depth.zero?
+         else
+           j += 1
+         end
+       end
+       raise "unterminated dict"
+     end
+
+     def find_startxref(bytes)
+       return nil if bytes.nil? || bytes.empty?
+
+       if bytes =~ /startxref\s+(\d+)\s*%%EOF\s*\z/m
+         return Integer(::Regexp.last_match(1))
+       end
+
+       m = bytes.rindex("startxref")
+       return nil unless m
+
+       tail = bytes[m..]
+       tail[/startxref\s+(\d+)/m, 1]&.to_i
+     end
+
+     def load_objstm(container_ref)
+       return if @objstm_cache.key?(container_ref)
+
+       body = object_body(container_ref)
+       raise "Object stream #{container_ref.inspect} not found in xref table" unless body
+
+       dict_start = body.index("<<") || 0
+       dict_end = balanced_from(body, dict_start)
+       dict_src = body[dict_start...dict_end]
+       s_pos = body.index(/\bstream\r?\n/m, dict_end) or raise "objstm stream missing"
+       e_pos = body.index(/\bendstream\b/m, s_pos) or raise "objstm end missing"
+       data = body[s_pos...e_pos]
+       raw = decode_stream_data(dict_src, data)
+       n = DictScan.value_token_after("/N", dict_src).to_i
+       first = DictScan.value_token_after("/First", dict_src).to_i
+       parsed = AcroThat::ObjStm.parse(raw, n: n, first: first)
+       @objstm_cache[container_ref] = parsed
+     end
+   end
+ end
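
Taken together, ObjectResolver walks the entire cross-reference chain (classic tables, /XRefStm hybrids, and xref streams, following /Prev at each step) and then serves any object body by reference. A usage sketch, assuming the gem is loaded and "sample.pdf" stands in for any PDF on disk:

    bytes = File.binread("sample.pdf")  # placeholder path
    resolver = AcroThat::ObjectResolver.new(bytes)
    resolver.root_ref                   # e.g. [1, 0] for a "1 0 R" catalog reference
    resolver.each_object do |(num, gen), body|
      puts "#{num} #{gen} obj: #{body ? body.bytesize : 0} bytes"
    end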
@@ -0,0 +1,75 @@
+ # frozen_string_literal: true
+
+ require "zlib"
+
+ module AcroThat
+   class ObjStm
+     # Parse an object stream body given N and First.
+     # Returns an array like [{ ref: [obj_num, 0], body: String }, ...] in header order.
+     def self.parse(bytes, n:, first:)
+       head = bytes[0...first]
+       entries = head.strip.split(/\s+/).map!(&:to_i)
+       refs = []
+       n.times do |i|
+         obj = entries[2 * i]
+         off = entries[(2 * i) + 1]
+         next_off = i + 1 < n ? entries[(2 * (i + 1)) + 1] : (bytes.bytesize - first)
+         body = bytes[first + off, next_off - off]
+         refs << { ref: [obj, 0], body: body }
+       end
+       refs
+     end
+
+     # Create an object stream from patches (an array of { ref: [num, gen], body: String }).
+     # Returns { dictionary: String, stream_body: String, object_count: Integer, patches: Array }.
+     # The dictionary includes /Type /ObjStm, /N (count), /First (header size) and,
+     # when compressing, /Filter /FlateDecode.
+     def self.create(patches, compress: true)
+       return nil if patches.empty?
+
+       # Sort the patches by object number for consistency
+       sorted_patches = patches.sort_by { |p| p[:ref][0] }
+
+       # Build the header: "obj_num offset obj_num offset ...".
+       # Offsets are relative to the start of the object data section (after the header)
+       header_parts = []
+       body_parts = []
+       current_offset = 0
+
+       sorted_patches.each do |patch|
+         obj_num, = patch[:ref]
+         body = patch[:body].to_s.b
+         # Ensure the body ends with a newline for proper parsing
+         body += "\n" unless body.end_with?("\n")
+
+         header_parts << obj_num.to_s
+         header_parts << current_offset.to_s
+         body_parts << body
+         current_offset += body.bytesize
+       end
+
+       header = "#{header_parts.join(' ')}\n".b
+       first = header.bytesize
+       object_bodies = body_parts.join
+
+       # Combine the header and the object bodies
+       raw_data = header + object_bodies
+
+       # Compress if requested
+       stream_body = if compress
+                       Zlib::Deflate.deflate(raw_data)
+                     else
+                       raw_data
+                     end
+
+       # Build the dictionary
+       dict = "<<\n/Type /ObjStm\n/N #{sorted_patches.length}\n/First #{first}".b
+       dict << "\n/Filter /FlateDecode".b if compress
+       dict << "\n/Length #{stream_body.bytesize}\n>>".b
+
+       {
+         dictionary: dict,
+         stream_body: stream_body.b,
+         object_count: sorted_patches.length,
+         patches: sorted_patches
+       }
+     end
+   end
+ end
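
Because create lays out exactly the format parse consumes (a header of "object number, offset" pairs followed by the concatenated bodies), the two methods round-trip. A small sketch with arbitrary object numbers:

    require "zlib"

    patches = [
      { ref: [5, 0], body: "<< /Type /Annot >>" },
      { ref: [3, 0], body: "<< /Type /Page >>" }
    ]
    objstm = AcroThat::ObjStm.create(patches, compress: true)
    raw = Zlib::Inflate.inflate(objstm[:stream_body])
    first = raw[/\A[^\n]*\n/].bytesize  # the header line written by create
    AcroThat::ObjStm.parse(raw, n: objstm[:object_count], first: first)
    # => [{ ref: [3, 0], body: "<< /Type /Page >>\n" },
    #     { ref: [5, 0], body: "<< /Type /Annot >>\n" }]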