@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,818 @@
1
+ /**
2
+ * PDF document parser.
3
+ *
4
+ * Handles the high-level PDF file structure:
5
+ * - Locating startxref
6
+ * - Parsing cross-reference tables (traditional and stream-based)
7
+ * - Reading trailer dictionaries
8
+ * - Resolving indirect object references
9
+ * - Handling incremental updates
10
+ *
11
+ * @see PDF Reference 1.7, §3.4 - File Structure
12
+ */
13
+ import { PdfTokenizer, TokenType } from "./pdf-tokenizer.js";
14
+ import { parseObject, isPdfDict, isPdfStream, isPdfRef, isPdfArray, dictGetNumber, dictGetRef, dictGetArray, dictGetName, decodePdfStringBytes } from "./pdf-parser.js";
15
+ import { decodeStreamFilters } from "./stream-filters.js";
16
+ import { PdfStructureError } from "../errors.js";
17
+ // =============================================================================
18
+ // Module-level cached TextEncoder
19
+ // =============================================================================
20
+ /** Module-wide TextEncoder, created once and reused to turn ASCII PDF keywords (e.g. "startxref", "trailer") into byte sequences for searching. */
21
+ const _encoder = new TextEncoder();
22
+ // =============================================================================
23
+ // PDF Document
24
+ // =============================================================================
25
/**
 * Parsed PDF document with lazy object resolution.
 *
 * The constructor locates and parses the cross-reference data and trailer
 * (falling back to a whole-file scan when that fails). Individual objects are
 * parsed only when requested and are cached afterwards.
 */
export class PdfDocument {
    /**
     * @param data - Raw PDF bytes; handed to PdfTokenizer.
     * @throws {PdfStructureError} When neither the xref chain nor the fallback
     *   scan yields any objects.
     */
    constructor(data) {
        /** Object number → `{ offset, gen, type }` (type 1 = at byte offset, type 2 = inside an object stream) */
        this.xref = new Map();
        /** Object numbers whose newest xref entry marks them as free (deleted) */
        this.freeObjects = new Set();
        /** Resolved objects keyed by `objNum:gen`, plus parsed object streams keyed by `objstm:N` */
        this.cache = new Map();
        /** Encryption handler (set externally after decryption is initialized) */
        this.decryptFn = null;
        this.tokenizer = new PdfTokenizer(data);
        this.trailer = this.parseFileStructure();
    }
    /** Get the underlying raw data */
    get data() {
        return this.tokenizer.bytes;
    }
    // ===========================================================================
    // File Structure Parsing
    // ===========================================================================
    /**
     * Build the xref map and return the trailer dictionary.
     * Any failure in the regular startxref/xref path triggers a full-file
     * reconstruction instead of surfacing the original error.
     */
    parseFileStructure() {
        try {
            const startxrefOffset = this.findStartxref();
            return this.parseXrefChain(startxrefOffset);
        }
        catch {
            // Deliberate best-effort: damaged xref data is common, so rebuild by scanning
            return this.reconstructXref();
        }
    }
    /**
     * Find the startxref offset by scanning backward from EOF.
     *
     * @returns The decimal offset written after the last `startxref` keyword
     * @throws {PdfStructureError} When the keyword or its number is missing
     */
    findStartxref() {
        const data = this.tokenizer.bytes;
        const keyword = _encoder.encode("startxref");
        const keywordPos = this.tokenizer.findSequenceBackward(keyword);
        if (keywordPos < 0) {
            throw new PdfStructureError("Could not find startxref keyword");
        }
        // Position after "startxref"
        this.tokenizer.pos = keywordPos + keyword.length;
        this.tokenizer.skipWhitespaceAndComments();
        // Collect the run of ASCII digits that forms the offset
        let digits = "";
        while (this.tokenizer.pos < data.length) {
            const b = data[this.tokenizer.pos];
            if (b < 0x30 || b > 0x39) {
                break;
            }
            digits += String.fromCharCode(b);
            this.tokenizer.pos++;
        }
        const offset = Number.parseInt(digits, 10);
        if (Number.isNaN(offset)) {
            throw new PdfStructureError("Invalid startxref offset");
        }
        return offset;
    }
    /**
     * Parse the xref chain starting at the given offset.
     * Follows /Prev links for incremental updates (newest section first).
     *
     * @param startOffset - Byte offset of the newest xref section
     * @returns The merged trailer dictionary
     * @throws {PdfStructureError} When a section is neither an xref table nor an xref stream
     */
    parseXrefChain(startOffset) {
        let trailerDict = null;
        let offset = startOffset;
        const visited = new Set();
        while (offset !== null) {
            if (visited.has(offset)) {
                break; // Prevent infinite loops on cyclic /Prev chains
            }
            visited.add(offset);
            this.tokenizer.pos = offset;
            this.tokenizer.skipWhitespaceAndComments();
            // Decide between a traditional xref table and an xref stream
            const peekStart = this.tokenizer.pos;
            const firstToken = this.tokenizer.next();
            let trailer;
            if (firstToken.type === TokenType.Keyword && firstToken.strValue === "xref") {
                trailer = this.parseTraditionalXref();
            }
            else if (firstToken.type === TokenType.Number) {
                // Xref stream (PDF 1.5+): starts with `N gen obj`
                this.tokenizer.pos = peekStart;
                trailer = this.parseXrefStream(offset);
            }
            else {
                throw new PdfStructureError(`Invalid xref at offset ${offset}: expected 'xref' keyword or xref stream`);
            }
            if (!trailerDict) {
                trailerDict = trailer;
            }
            else {
                // Merge: the newest trailer wins for every key it already has
                this.mergeTrailer(trailerDict, trailer);
            }
            offset = dictGetNumber(trailer, "Prev") ?? null;
        }
        if (!trailerDict) {
            throw new PdfStructureError("No trailer found");
        }
        return trailerDict;
    }
    /**
     * Parse a traditional xref table and its trailer.
     * Expects the "xref" keyword to have been consumed already.
     *
     * The first entry seen for an object number wins, and that includes free
     * entries: because newer sections are parsed first, an object deleted by an
     * incremental update must not be revived by an older section.
     *
     * @returns The trailer dictionary following the table
     * @throws {PdfStructureError} On a malformed subsection header or missing trailer dictionary
     */
    parseTraditionalXref() {
        const totalBytes = this.tokenizer.bytes.length;
        while (true) {
            this.tokenizer.skipWhitespaceAndComments();
            // Check if we've hit the trailer
            const peekPos = this.tokenizer.pos;
            const token = this.tokenizer.next();
            if (token.type === TokenType.Keyword && token.strValue === "trailer") {
                break;
            }
            // Subsection header: startObj count
            if (token.type !== TokenType.Number) {
                // End of xref sections
                this.tokenizer.pos = peekPos;
                break;
            }
            const startObj = token.numValue;
            const countToken = this.tokenizer.next();
            if (countToken.type !== TokenType.Number) {
                throw new PdfStructureError("Invalid xref subsection header");
            }
            const count = countToken.numValue;
            for (let i = 0; i < count; i++) {
                const objNum = startObj + i;
                this.tokenizer.skipWhitespaceAndComments();
                if (this.tokenizer.pos >= totalBytes) {
                    break; // A bogus count must not keep us looping past EOF
                }
                // Each entry is "OOOOOOOOOO GGGGG n" or "OOOOOOOOOO GGGGG f"
                const parts = this.tokenizer.readLine().trim().split(/\s+/);
                if (parts.length < 3) {
                    continue;
                }
                if (this.xref.has(objNum) || this.freeObjects.has(objNum)) {
                    continue; // A newer section already decided this object
                }
                const entryOffset = Number.parseInt(parts[0], 10);
                const gen = Number.parseInt(parts[1], 10);
                if (Number.isNaN(entryOffset) || Number.isNaN(gen)) {
                    continue; // Malformed entry
                }
                if (parts[2] === "n") {
                    this.xref.set(objNum, { offset: entryOffset, gen, type: 1 });
                }
                else if (parts[2] === "f") {
                    this.freeObjects.add(objNum);
                }
            }
        }
        // Parse the trailer dictionary
        this.tokenizer.skipWhitespaceAndComments();
        const trailerObj = parseObject(this.tokenizer);
        if (!isPdfDict(trailerObj)) {
            throw new PdfStructureError("Expected dictionary after 'trailer' keyword");
        }
        return trailerObj;
    }
    /**
     * Parse a cross-reference stream (PDF 1.5+).
     *
     * @param offset - Byte offset of the `N gen obj` header of the xref stream
     * @returns The stream dictionary, which doubles as the trailer
     * @throws {PdfStructureError} When the object is not a /Type /XRef stream or /W is unusable
     */
    parseXrefStream(offset) {
        this.tokenizer.pos = offset;
        const obj = parseObject(this.tokenizer);
        if (!isPdfStream(obj)) {
            throw new PdfStructureError("Expected xref stream object");
        }
        const dict = obj.dict;
        const type = dictGetName(dict, "Type");
        if (type !== "XRef") {
            throw new PdfStructureError(`Expected /Type /XRef, got /Type /${type}`);
        }
        // Decode the stream data (no decryptFn involved here)
        const streamData = decodeStreamFilters(obj.data, dict);
        // Parse W array: [fieldSizeType, fieldSizeOffset, fieldSizeGen]
        const wArray = dictGetArray(dict, "W");
        if (!wArray || wArray.length < 3) {
            throw new PdfStructureError("Invalid /W array in xref stream");
        }
        const w0 = wArray[0];
        const w1 = wArray[1];
        const w2 = wArray[2];
        const widthsAreValid = [w0, w1, w2].every(w => Number.isInteger(w) && w >= 0);
        const entrySize = w0 + w1 + w2;
        // A zero or non-numeric entry size would defeat the bounds check below
        if (!widthsAreValid || entrySize === 0) {
            throw new PdfStructureError("Invalid /W array in xref stream");
        }
        // Parse Index array (default: [0 Size])
        const size = dictGetNumber(dict, "Size") ?? 0;
        const indexArray = dictGetArray(dict, "Index") ?? [0, size];
        let dataOffset = 0;
        for (let i = 0; i < indexArray.length; i += 2) {
            const startObj = indexArray[i];
            const count = indexArray[i + 1];
            for (let j = 0; j < count; j++) {
                if (dataOffset + entrySize > streamData.length) {
                    break;
                }
                const objNum = startObj + j;
                // A zero-width type field defaults to type 1
                const fieldType = w0 > 0 ? readIntBE(streamData, dataOffset, w0) : 1;
                const field2 = readIntBE(streamData, dataOffset + w0, w1);
                const field3 = w2 > 0 ? readIntBE(streamData, dataOffset + w0 + w1, w2) : 0;
                dataOffset += entrySize;
                if (this.xref.has(objNum) || this.freeObjects.has(objNum)) {
                    continue; // First (newest) entry wins, free entries included
                }
                if (fieldType === 0) {
                    // Free object: remember it so older sections cannot revive it
                    this.freeObjects.add(objNum);
                }
                else if (fieldType === 1) {
                    // Uncompressed object: field2 = byte offset, field3 = generation
                    this.xref.set(objNum, { offset: field2, gen: field3, type: 1 });
                }
                else if (fieldType === 2) {
                    // Compressed object: field2 = object stream number, field3 = index in it
                    this.xref.set(objNum, { offset: field2, gen: field3, type: 2 });
                }
            }
        }
        return dict;
    }
    /**
     * Reconstruct the xref table by scanning the entire file for `N G obj` patterns.
     * This is a fallback for corrupted or broken PDFs where the normal xref parsing fails.
     *
     * When an object number is defined more than once, the definition that
     * appears last in the file is kept: incremental updates append newer
     * revisions after older ones.
     *
     * @returns The last trailer dictionary in the file if one parses, otherwise
     *   a synthetic one with /Size and (when a catalog is found) /Root
     * @throws {PdfStructureError} When no object definitions are found
     */
    reconstructXref() {
        const data = this.tokenizer.bytes;
        this.xref.clear();
        this.freeObjects.clear();
        const isDigit = (b) => b >= 0x30 && b <= 0x39;
        let maxObjNum = -1;
        // Every candidate start position is examined; the scan always advances by one byte
        for (let start = 0; start < data.length - 5; start++) {
            if (!isDigit(data[start])) {
                continue;
            }
            // Must begin the file or follow LF, CR or a space
            if (start > 0) {
                const before = data[start - 1];
                if (before !== 0x0a && before !== 0x0d && before !== 0x20) {
                    continue;
                }
            }
            // Object number, then exactly one space
            let cursor = start;
            let objNum = 0;
            while (cursor < data.length && isDigit(data[cursor])) {
                objNum = objNum * 10 + (data[cursor] - 0x30);
                cursor++;
            }
            if (cursor >= data.length || data[cursor] !== 0x20) {
                continue;
            }
            cursor++;
            // Generation number, then exactly one space
            const genStart = cursor;
            let gen = 0;
            while (cursor < data.length && isDigit(data[cursor])) {
                gen = gen * 10 + (data[cursor] - 0x30);
                cursor++;
            }
            if (cursor === genStart || cursor >= data.length || data[cursor] !== 0x20) {
                continue;
            }
            cursor++;
            // Literal "obj"
            if (cursor + 3 > data.length ||
                data[cursor] !== 0x6f ||
                data[cursor + 1] !== 0x62 ||
                data[cursor + 2] !== 0x6a) {
                continue;
            }
            // "obj" must end the file or be followed by whitespace or '<'
            const afterObj = cursor + 3;
            if (afterObj < data.length) {
                const next = data[afterObj];
                if (next !== 0x20 && next !== 0x0a && next !== 0x0d && next !== 0x09 && next !== 0x3c) {
                    continue;
                }
            }
            // Later definitions overwrite earlier ones
            this.xref.set(objNum, { offset: start, gen, type: 1 });
            if (objNum > maxObjNum) {
                maxObjNum = objNum;
            }
        }
        if (this.xref.size === 0) {
            throw new PdfStructureError("Could not reconstruct xref: no objects found");
        }
        // Prefer a real trailer dictionary: look for the last "trailer" keyword
        const trailerKeyword = _encoder.encode("trailer");
        const trailerPos = this.tokenizer.findSequenceBackward(trailerKeyword);
        if (trailerPos >= 0) {
            this.tokenizer.pos = trailerPos + trailerKeyword.length;
            this.tokenizer.skipWhitespaceAndComments();
            try {
                const trailerObj = parseObject(this.tokenizer);
                if (isPdfDict(trailerObj)) {
                    return trailerObj;
                }
            }
            catch {
                // Fall through to synthetic trailer
            }
        }
        // Build a synthetic trailer by finding the Root catalog
        const syntheticTrailer = new Map();
        // /Size is one greater than the highest object number
        syntheticTrailer.set("Size", maxObjNum + 1);
        for (const [objNum, entry] of this.xref) {
            if (entry.type !== 1) {
                continue;
            }
            try {
                this.tokenizer.pos = entry.offset;
                const obj = parseObject(this.tokenizer);
                let dict = null;
                if (isPdfDict(obj)) {
                    dict = obj;
                }
                else if (isPdfStream(obj)) {
                    dict = obj.dict;
                }
                if (dict !== null && dictGetName(dict, "Type") === "Catalog") {
                    syntheticTrailer.set("Root", { type: "ref", objNum, gen: entry.gen });
                    break;
                }
            }
            catch {
                // Skip unparseable objects
            }
        }
        return syntheticTrailer;
    }
    /**
     * Merge trailer entries from an older trailer into the current one.
     * Only adds keys that don't already exist; `current` is modified in place.
     */
    mergeTrailer(current, older) {
        for (const [key, value] of older) {
            if (!current.has(key)) {
                current.set(key, value);
            }
        }
    }
    // ===========================================================================
    // Object Resolution
    // ===========================================================================
    /**
     * Resolve a PDF object by its object number and generation.
     *
     * @param objNum - The object number to resolve
     * @param gen - The generation number (default 0); only used for the cache key
     * @returns The parsed object, or null if it doesn't exist or cannot be parsed
     */
    resolve(objNum, gen = 0) {
        const cacheKey = `${objNum}:${gen}`;
        if (this.cache.has(cacheKey)) {
            return this.cache.get(cacheKey);
        }
        const entry = this.xref.get(objNum);
        if (!entry) {
            return null;
        }
        let obj = null;
        if (entry.type === 1) {
            // Uncompressed object: parse directly at offset
            obj = this.parseObjectAt(entry.offset, objNum, entry.gen);
            // Strings of directly stored objects are encrypted per object
            if (obj !== null && this.decryptFn) {
                obj = this.decryptObjectStrings(obj, objNum, entry.gen);
            }
        }
        else if (entry.type === 2) {
            // Compressed object: for type 2 entries `offset` holds the object
            // stream number and `gen` holds the index inside that stream.
            // Its strings were already decrypted together with the object
            // stream, so they must not be decrypted a second time.
            obj = this.parseCompressedObject(entry.offset, entry.gen);
        }
        if (obj !== null) {
            this.cache.set(cacheKey, obj);
        }
        return obj;
    }
    /**
     * Resolve a PDF object and return it along with its object/generation numbers.
     * Useful for tracking which object a value came from (for decryption).
     *
     * @param objNum - The object number to resolve
     * @param gen - The generation number (default 0)
     * @returns `{ obj, objNum, gen }` where `obj` is the result of {@link resolve}
     */
    resolveWithObjNum(objNum, gen = 0) {
        return { obj: this.resolve(objNum, gen), objNum, gen };
    }
    /**
     * Dereference a PdfRef to its actual object value.
     * If the input is not a PdfRef, returns it as-is (null/undefined become null).
     */
    deref(obj) {
        if (obj === null || obj === undefined) {
            return null;
        }
        return isPdfRef(obj) ? this.resolve(obj.objNum, obj.gen) : obj;
    }
    /**
     * Dereference a value and return it as a dictionary.
     * A stream yields its dictionary; anything else yields null.
     */
    derefDict(obj) {
        const resolved = this.deref(obj);
        if (resolved === null) {
            return null;
        }
        if (isPdfDict(resolved)) {
            return resolved;
        }
        if (isPdfStream(resolved)) {
            return resolved.dict;
        }
        return null;
    }
    /**
     * Dereference a value and return it only if it is a stream, otherwise null.
     * The object/generation numbers are not returned; use
     * {@link derefStreamWithObjNum} when they are needed for decryption.
     */
    derefStream(obj) {
        const resolved = this.deref(obj);
        return resolved !== null && isPdfStream(resolved) ? resolved : null;
    }
    /**
     * Dereference a PdfRef and get the stream with its object number and generation.
     * Returns null if the object is not a stream.
     * The objNum/gen are taken from the reference (0/0 when `obj` is not a
     * reference) and are needed for correct per-object decryption (V1-V4).
     */
    derefStreamWithObjNum(obj) {
        if (obj === null || obj === undefined) {
            return null;
        }
        let objNum = 0;
        let gen = 0;
        if (isPdfRef(obj)) {
            objNum = obj.objNum;
            gen = obj.gen;
        }
        const resolved = this.deref(obj);
        if (resolved === null || !isPdfStream(resolved)) {
            return null;
        }
        return { stream: resolved, objNum, gen };
    }
    /**
     * Get decoded stream data from a stream object.
     * Decrypts first (when a decryptFn is set), then applies the filter chain.
     *
     * When objNum/gen are not provided (default 0), decryption may not
     * produce correct results. Use {@link derefStreamWithObjNum} to obtain
     * the objNum/gen of the stream's containing object.
     */
    getStreamData(stream, objNum = 0, gen = 0) {
        const raw = this.decryptFn ? this.decryptFn(stream.data, objNum, gen) : stream.data;
        return decodeStreamFilters(raw, stream.dict);
    }
    /**
     * Decrypt a string value (bytes) if encryption is active; otherwise return it unchanged.
     */
    decryptString(bytes, objNum, gen) {
        return this.decryptFn ? this.decryptFn(bytes, objNum, gen) : bytes;
    }
    /**
     * Decode a PDF string to a JS string, with optional decryption.
     */
    decodeString(bytes, objNum = 0, gen = 0) {
        return decodePdfStringBytes(this.decryptString(bytes, objNum, gen));
    }
    /**
     * Recursively decrypt all string values (Uint8Array) within a parsed PDF object,
     * using the objNum/gen of the containing indirect object.
     * Dictionaries, arrays and stream wrappers are copied, not modified in place.
     * Stream data is NOT decrypted here; that happens in getStreamData().
     * Requires `decryptFn` to be set.
     */
    decryptObjectStrings(obj, objNum, gen) {
        if (obj === null || typeof obj !== "object") {
            return obj;
        }
        // Strings are represented as Uint8Array
        if (obj instanceof Uint8Array) {
            return this.decryptFn(obj, objNum, gen);
        }
        // Recurse into dictionaries
        if (isPdfDict(obj)) {
            const decrypted = new Map();
            for (const [key, value] of obj) {
                decrypted.set(key, this.decryptObjectStrings(value, objNum, gen));
            }
            return decrypted;
        }
        // Recurse into arrays
        if (isPdfArray(obj)) {
            return obj.map(item => this.decryptObjectStrings(item, objNum, gen));
        }
        // Decrypt strings inside stream dicts (but NOT the stream data itself)
        if (isPdfStream(obj)) {
            const decryptedDict = this.decryptObjectStrings(obj.dict, objNum, gen);
            return { type: "stream", dict: decryptedDict, data: obj.data };
        }
        return obj;
    }
    /**
     * Get the catalog dictionary (the root of the document structure).
     *
     * @throws {PdfStructureError} When the trailer has no /Root or it cannot be resolved
     */
    getCatalog() {
        const rootRef = dictGetRef(this.trailer, "Root");
        if (!rootRef) {
            throw new PdfStructureError("No /Root in trailer");
        }
        const catalog = this.derefDict(rootRef);
        if (!catalog) {
            throw new PdfStructureError("Could not resolve catalog");
        }
        return catalog;
    }
    /**
     * Get the pages array from the page tree.
     * Returns an array of page dictionaries in order.
     */
    getPages() {
        return this.getPagesWithObjInfo().map(p => p.dict);
    }
    /**
     * Get pages as `{ dict, objNum, gen }` records in page-tree order.
     * objNum/gen come from the /Kids reference that led to the page.
     *
     * @throws {PdfStructureError} When the catalog or its /Pages entry cannot be resolved
     */
    getPagesWithObjInfo() {
        const catalog = this.getCatalog();
        const pagesDict = this.derefDict(catalog.get("Pages"));
        if (!pagesDict) {
            throw new PdfStructureError("Could not resolve /Pages");
        }
        const pages = [];
        this.collectPages(pagesDict, pages, new Set());
        return pages;
    }
    /**
     * Recursively collect page records from the page tree into `pages`.
     * Uses a visited set of dictionaries to prevent infinite recursion on
     * cyclic page trees.
     */
    collectPages(node, pages, visited) {
        if (visited.has(node)) {
            return; // Cycle guard
        }
        visited.add(node);
        if (dictGetName(node, "Type") === "Page") {
            // Only the dictionary was passed in, so the object number is
            // unknown at this point; 0/0 is recorded as a placeholder.
            pages.push({ dict: node, objNum: 0, gen: 0 });
            return;
        }
        // Pages node: recurse into Kids
        const kids = dictGetArray(node, "Kids");
        if (!kids) {
            return;
        }
        for (const kid of kids) {
            let objNum = 0;
            let gen = 0;
            if (isPdfRef(kid)) {
                objNum = kid.objNum;
                gen = kid.gen;
            }
            const childDict = this.derefDict(kid);
            if (!childDict) {
                continue;
            }
            if (dictGetName(childDict, "Type") === "Page") {
                pages.push({ dict: childDict, objNum, gen });
            }
            else {
                this.collectPages(childDict, pages, visited);
            }
        }
    }
    /**
     * Get the object number for a given object reference.
     * Useful for tracking which object a value came from (for decryption).
     */
    getObjNumForRef(ref) {
        return ref.objNum;
    }
    // ===========================================================================
    // Low-level Object Parsing
    // ===========================================================================
    /**
     * Parse an object definition at the given byte offset.
     * `objNum` and `_gen` are accepted for signature compatibility but unused.
     *
     * @returns The parsed object, or null when parsing throws
     */
    parseObjectAt(offset, objNum, _gen) {
        this.tokenizer.pos = offset;
        try {
            return parseObject(this.tokenizer);
        }
        catch {
            return null;
        }
    }
    /**
     * Fetch one member of an object stream by position.
     * The parsed stream is cached under `objstm:<objStmNum>`.
     *
     * @param objStmNum - The object number of the object stream
     * @param index - The index of the object within the stream
     * @returns The member at that index, or null if unavailable
     */
    parseCompressedObject(objStmNum, index) {
        const stmCacheKey = `objstm:${objStmNum}`;
        let stmObjects;
        if (this.cache.has(stmCacheKey)) {
            stmObjects = this.cache.get(stmCacheKey);
        }
        else {
            stmObjects = this.parseObjectStream(objStmNum) ?? undefined;
            if (stmObjects) {
                this.cache.set(stmCacheKey, stmObjects);
            }
        }
        if (!stmObjects) {
            return null;
        }
        // The xref stores a position, so walk the members in insertion order
        let position = 0;
        for (const value of stmObjects.values()) {
            if (position === index) {
                return value;
            }
            position++;
        }
        return null;
    }
    /**
     * Parse all objects from an object stream.
     * The object stream itself must be a type 1 (uncompressed) xref entry.
     *
     * Members that fail to parse are stored as null rather than omitted, so
     * the position of every other member still matches the index recorded in
     * the xref.
     *
     * @returns Map of object number → object value (in stream order), or null
     *   when the object stream cannot be located or parsed
     */
    parseObjectStream(objStmNum) {
        const entry = this.xref.get(objStmNum);
        if (!entry || entry.type !== 1) {
            return null;
        }
        // Same error handling as any other direct object: parse failure → null
        const stmObj = this.parseObjectAt(entry.offset, objStmNum, entry.gen);
        if (!isPdfStream(stmObj)) {
            return null;
        }
        const dict = stmObj.dict;
        const n = dictGetNumber(dict, "N") ?? 0;
        const first = dictGetNumber(dict, "First") ?? 0;
        // Decode stream data (pass objStmNum/gen for correct decryption)
        const streamData = this.getStreamData(stmObj, objStmNum, entry.gen);
        // Each header pair occupies several bytes, so a valid /N can never
        // exceed the stream length; this caps the loop for a bogus /N.
        const pairCount = Math.min(n, streamData.length);
        // Parse the N pairs of (objNum offset) before 'first'
        const headerTokenizer = new PdfTokenizer(streamData);
        const pairs = [];
        for (let i = 0; i < pairCount; i++) {
            const numTok = headerTokenizer.next();
            const offTok = headerTokenizer.next();
            if (numTok.type === TokenType.Number && offTok.type === TokenType.Number) {
                pairs.push([numTok.numValue, offTok.numValue]);
            }
        }
        // Parse each object
        const result = new Map();
        for (const [objectNumber, relOffset] of pairs) {
            const objTokenizer = new PdfTokenizer(streamData, first + relOffset);
            try {
                result.set(objectNumber, parseObject(objTokenizer));
            }
            catch {
                // Keep the slot so later members stay at their recorded index
                result.set(objectNumber, null);
            }
        }
        return result;
    }
    /**
     * Resolve a page's bounding box with indirect ref resolution and parent
     * inheritance. On each node /MediaBox is tried first, then /CropBox.
     *
     * @param pageDict - Page (or Pages) dictionary to start from
     * @param visited - Internal: dictionaries already seen while walking /Parent
     * @returns `{ width, height }`, or null if no usable box is found
     */
    resolvePageBox(pageDict, visited) {
        const seen = visited ?? new Set();
        if (seen.has(pageDict)) {
            return null; // Cycle guard
        }
        seen.add(pageDict);
        for (const key of ["MediaBox", "CropBox"]) {
            const raw = pageDict.get(key);
            if (!raw) {
                continue;
            }
            // Dereference in case the box is an indirect reference
            const resolved = this.deref(raw);
            if (Array.isArray(resolved) && resolved.length === 4) {
                const width = Math.abs(resolved[2] - resolved[0]);
                const height = Math.abs(resolved[3] - resolved[1]);
                // Non-numeric corners give NaN, which fails this test
                if (width > 0 && height > 0) {
                    return { width, height };
                }
            }
        }
        // Inherit from parent
        const parent = pageDict.get("Parent");
        if (parent) {
            const parentDict = this.derefDict(parent);
            if (parentDict) {
                return this.resolvePageBox(parentDict, seen);
            }
        }
        return null;
    }
    /**
     * Resolve a page's Resources dictionary, inheriting from parent pages if needed.
     * Protected against cyclic parent chains.
     *
     * @param pageDict - Page (or Pages) dictionary to start from
     * @param visited - Internal: dictionaries already seen while walking /Parent
     * @returns The resources dictionary, or an empty Map when none is found
     */
    resolvePageResources(pageDict, visited) {
        const seen = visited ?? new Set();
        if (seen.has(pageDict)) {
            return new Map(); // Cycle guard
        }
        seen.add(pageDict);
        const resources = pageDict.get("Resources");
        if (resources) {
            const resolved = this.derefDict(resources);
            if (resolved) {
                return resolved;
            }
        }
        const parent = pageDict.get("Parent");
        if (parent) {
            const parentDict = this.derefDict(parent);
            if (parentDict) {
                return this.resolvePageResources(parentDict, seen);
            }
        }
        return new Map();
    }
}
804
+ // =============================================================================
805
+ // Helpers
806
+ // =============================================================================
807
/**
 * Decode `width` consecutive bytes, most significant first, as an unsigned integer.
 * Accumulates with multiplication rather than bit shifts so results above
 * 2^31 (e.g. large file offsets) do not wrap into negative 32-bit values.
 * Positions outside `data` contribute a zero byte.
 */
function readIntBE(data, offset, width) {
    const end = offset + width;
    let acc = 0;
    let cursor = offset;
    while (cursor < end) {
        const byte = data[cursor];
        acc = acc * 256 + (byte ?? 0);
        cursor += 1;
    }
    return acc;
}