@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,822 @@
1
+ "use strict";
2
+ /**
3
+ * PDF document parser.
4
+ *
5
+ * Handles the high-level PDF file structure:
6
+ * - Locating startxref
7
+ * - Parsing cross-reference tables (traditional and stream-based)
8
+ * - Reading trailer dictionaries
9
+ * - Resolving indirect object references
10
+ * - Handling incremental updates
11
+ *
12
+ * @see PDF Reference 1.7, §3.4 - File Structure
13
+ */
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.PdfDocument = void 0;
16
+ const pdf_tokenizer_1 = require("./pdf-tokenizer");
17
+ const pdf_parser_1 = require("./pdf-parser");
18
+ const stream_filters_1 = require("./stream-filters");
19
+ const errors_1 = require("../errors");
20
+ // =============================================================================
21
+ // Module-level cached TextEncoder
22
+ // =============================================================================
23
+ /** Cached TextEncoder instance to avoid repeated allocation in hot paths */
24
+ const _encoder = new TextEncoder();
25
+ // =============================================================================
26
+ // PDF Document
27
+ // =============================================================================
28
+ /**
29
+ * Parsed PDF document with lazy object resolution.
30
+ *
31
+ * Reads the cross-reference table and trailer on construction,
32
+ * then resolves individual objects on demand with caching.
33
+ */
34
+ class PdfDocument {
35
+ constructor(data) {
36
+ this.xref = new Map();
37
+ this.cache = new Map();
38
+ /** Encryption handler (set externally after decryption is initialized) */
39
+ this.decryptFn = null;
40
+ this.tokenizer = new pdf_tokenizer_1.PdfTokenizer(data);
41
+ this.trailer = this.parseFileStructure();
42
+ }
43
+ /** Get the underlying raw data */
44
+ get data() {
45
+ return this.tokenizer.bytes;
46
+ }
47
+ // ===========================================================================
48
+ // File Structure Parsing
49
+ // ===========================================================================
50
+ parseFileStructure() {
51
+ try {
52
+ const startxrefOffset = this.findStartxref();
53
+ return this.parseXrefChain(startxrefOffset);
54
+ }
55
+ catch {
56
+ // If normal xref parsing fails, attempt full-file reconstruction
57
+ return this.reconstructXref();
58
+ }
59
+ }
60
+ /**
61
+ * Find the startxref offset by scanning backward from EOF.
62
+ */
63
+ findStartxref() {
64
+ const data = this.tokenizer.bytes;
65
+ const startxrefKeyword = _encoder.encode("startxref");
66
+ const pos = this.tokenizer.findSequenceBackward(startxrefKeyword);
67
+ if (pos < 0) {
68
+ throw new errors_1.PdfStructureError("Could not find startxref keyword");
69
+ }
70
+ // Position after "startxref"
71
+ this.tokenizer.pos = pos + startxrefKeyword.length;
72
+ this.tokenizer.skipWhitespaceAndComments();
73
+ // Read the offset number
74
+ let numStr = "";
75
+ while (this.tokenizer.pos < data.length) {
76
+ const b = data[this.tokenizer.pos];
77
+ if (b >= 0x30 && b <= 0x39) {
78
+ numStr += String.fromCharCode(b);
79
+ this.tokenizer.pos++;
80
+ }
81
+ else {
82
+ break;
83
+ }
84
+ }
85
+ const offset = parseInt(numStr, 10);
86
+ if (isNaN(offset)) {
87
+ throw new errors_1.PdfStructureError("Invalid startxref offset");
88
+ }
89
+ return offset;
90
+ }
91
+ /**
92
+ * Parse the xref chain starting at the given offset.
93
+ * Follows /Prev links for incremental updates.
94
+ * Returns the merged trailer dictionary.
95
+ */
96
+ parseXrefChain(startOffset) {
97
+ let trailerDict = null;
98
+ let offset = startOffset;
99
+ const visited = new Set();
100
+ while (offset !== null) {
101
+ if (visited.has(offset)) {
102
+ break; // Prevent infinite loops
103
+ }
104
+ visited.add(offset);
105
+ this.tokenizer.pos = offset;
106
+ this.tokenizer.skipWhitespaceAndComments();
107
+ // Check if this is a traditional xref table or an xref stream
108
+ const peekStart = this.tokenizer.pos;
109
+ const firstToken = this.tokenizer.next();
110
+ if (firstToken.type === 6 /* TokenType.Keyword */ && firstToken.strValue === "xref") {
111
+ // Traditional xref table
112
+ const trailer = this.parseTraditionalXref();
113
+ if (!trailerDict) {
114
+ trailerDict = trailer;
115
+ }
116
+ else {
117
+ // Merge: first trailer wins for Root, Info, Encrypt, ID
118
+ this.mergeTrailer(trailerDict, trailer);
119
+ }
120
+ const prev = (0, pdf_parser_1.dictGetNumber)(trailer, "Prev");
121
+ offset = prev ?? null;
122
+ }
123
+ else if (firstToken.type === 0 /* TokenType.Number */) {
124
+ // Xref stream (PDF 1.5+): starts with `N gen obj`
125
+ this.tokenizer.pos = peekStart;
126
+ const trailer = this.parseXrefStream(offset);
127
+ if (!trailerDict) {
128
+ trailerDict = trailer;
129
+ }
130
+ else {
131
+ this.mergeTrailer(trailerDict, trailer);
132
+ }
133
+ const prev = (0, pdf_parser_1.dictGetNumber)(trailer, "Prev");
134
+ offset = prev ?? null;
135
+ }
136
+ else {
137
+ throw new errors_1.PdfStructureError(`Invalid xref at offset ${offset}: expected 'xref' keyword or xref stream`);
138
+ }
139
+ }
140
+ if (!trailerDict) {
141
+ throw new errors_1.PdfStructureError("No trailer found");
142
+ }
143
+ return trailerDict;
144
+ }
145
+ /**
146
+ * Parse a traditional xref table and its trailer.
147
+ */
148
+ parseTraditionalXref() {
149
+ // The "xref" keyword has already been consumed
150
+ while (true) {
151
+ this.tokenizer.skipWhitespaceAndComments();
152
+ // Check if we've hit the trailer
153
+ const peekPos = this.tokenizer.pos;
154
+ const token = this.tokenizer.next();
155
+ if (token.type === 6 /* TokenType.Keyword */ && token.strValue === "trailer") {
156
+ break;
157
+ }
158
+ // Subsection header: startObj count
159
+ if (token.type !== 0 /* TokenType.Number */) {
160
+ // End of xref sections
161
+ this.tokenizer.pos = peekPos;
162
+ break;
163
+ }
164
+ const startObj = token.numValue;
165
+ const countToken = this.tokenizer.next();
166
+ if (countToken.type !== 0 /* TokenType.Number */) {
167
+ throw new errors_1.PdfStructureError("Invalid xref subsection header");
168
+ }
169
+ const count = countToken.numValue;
170
+ // Parse entries
171
+ for (let i = 0; i < count; i++) {
172
+ const objNum = startObj + i;
173
+ this.tokenizer.skipWhitespaceAndComments();
174
+ // Each entry is exactly "OOOOOOOOOO GGGGG n \n" or "OOOOOOOOOO GGGGG f \n"
175
+ const line = this.tokenizer.readLine();
176
+ const parts = line.trim().split(/\s+/);
177
+ if (parts.length < 3) {
178
+ continue;
179
+ }
180
+ const entryOffset = parseInt(parts[0], 10);
181
+ const gen = parseInt(parts[1], 10);
182
+ const inUse = parts[2] === "n";
183
+ if (inUse && !this.xref.has(objNum)) {
184
+ this.xref.set(objNum, { offset: entryOffset, gen, type: 1 });
185
+ }
186
+ }
187
+ }
188
+ // Parse the trailer dictionary
189
+ this.tokenizer.skipWhitespaceAndComments();
190
+ const trailerObj = (0, pdf_parser_1.parseObject)(this.tokenizer);
191
+ if (!(0, pdf_parser_1.isPdfDict)(trailerObj)) {
192
+ throw new errors_1.PdfStructureError("Expected dictionary after 'trailer' keyword");
193
+ }
194
+ return trailerObj;
195
+ }
196
+ /**
197
+ * Parse a cross-reference stream (PDF 1.5+).
198
+ */
199
+ parseXrefStream(offset) {
200
+ this.tokenizer.pos = offset;
201
+ const obj = (0, pdf_parser_1.parseObject)(this.tokenizer);
202
+ if (!(0, pdf_parser_1.isPdfStream)(obj)) {
203
+ throw new errors_1.PdfStructureError("Expected xref stream object");
204
+ }
205
+ const dict = obj.dict;
206
+ const type = (0, pdf_parser_1.dictGetName)(dict, "Type");
207
+ if (type !== "XRef") {
208
+ throw new errors_1.PdfStructureError(`Expected /Type /XRef, got /Type /${type}`);
209
+ }
210
+ // Decode the stream data
211
+ const streamData = (0, stream_filters_1.decodeStreamFilters)(obj.data, dict);
212
+ // Parse W array: [fieldSizeType, fieldSizeOffset, fieldSizeGen]
213
+ const wArray = (0, pdf_parser_1.dictGetArray)(dict, "W");
214
+ if (!wArray || wArray.length < 3) {
215
+ throw new errors_1.PdfStructureError("Invalid /W array in xref stream");
216
+ }
217
+ const w0 = wArray[0];
218
+ const w1 = wArray[1];
219
+ const w2 = wArray[2];
220
+ const entrySize = w0 + w1 + w2;
221
+ // Parse Index array (default: [0 Size])
222
+ const size = (0, pdf_parser_1.dictGetNumber)(dict, "Size") ?? 0;
223
+ let indexArray = (0, pdf_parser_1.dictGetArray)(dict, "Index");
224
+ if (!indexArray) {
225
+ indexArray = [0, size];
226
+ }
227
+ // Process entries
228
+ let dataOffset = 0;
229
+ for (let i = 0; i < indexArray.length; i += 2) {
230
+ const startObj = indexArray[i];
231
+ const count = indexArray[i + 1];
232
+ for (let j = 0; j < count; j++) {
233
+ if (dataOffset + entrySize > streamData.length) {
234
+ break;
235
+ }
236
+ const objNum = startObj + j;
237
+ const fieldType = w0 > 0 ? readIntBE(streamData, dataOffset, w0) : 1;
238
+ const field2 = readIntBE(streamData, dataOffset + w0, w1);
239
+ const field3 = w2 > 0 ? readIntBE(streamData, dataOffset + w0 + w1, w2) : 0;
240
+ dataOffset += entrySize;
241
+ if (this.xref.has(objNum)) {
242
+ continue; // First entry wins
243
+ }
244
+ if (fieldType === 0) {
245
+ // Free object — skip
246
+ }
247
+ else if (fieldType === 1) {
248
+ // Uncompressed object: field2 = byte offset, field3 = generation
249
+ this.xref.set(objNum, { offset: field2, gen: field3, type: 1 });
250
+ }
251
+ else if (fieldType === 2) {
252
+ // Compressed object in object stream: field2 = objstm number, field3 = index
253
+ this.xref.set(objNum, { offset: field2, gen: field3, type: 2 });
254
+ }
255
+ }
256
+ }
257
+ return dict;
258
+ }
259
+ /**
260
+ * Reconstruct the xref table by scanning the entire file for `N N obj` patterns.
261
+ * This is a fallback for corrupted or broken PDFs where the normal xref parsing fails.
262
+ *
263
+ * @returns A synthetic trailer dictionary
264
+ */
265
+ reconstructXref() {
266
+ const data = this.tokenizer.bytes;
267
+ this.xref.clear();
268
+ // Regex-style scan: look for patterns like "123 0 obj" in the raw bytes
269
+ // We scan byte-by-byte looking for digit sequences followed by spaces and "obj"
270
+ const objKeyword = _encoder.encode("obj");
271
+ let pos = 0;
272
+ while (pos < data.length - 5) {
273
+ // Skip to a potential start of an object definition (digit character)
274
+ if (data[pos] < 0x30 || data[pos] > 0x39) {
275
+ pos++;
276
+ continue;
277
+ }
278
+ // Ensure we're at a line boundary or start of file
279
+ if (pos > 0 && data[pos - 1] !== 0x0a && data[pos - 1] !== 0x0d && data[pos - 1] !== 0x20) {
280
+ pos++;
281
+ continue;
282
+ }
283
+ // Try to read: objNum gen obj
284
+ const savedPos = pos;
285
+ let objNumStr = "";
286
+ while (pos < data.length && data[pos] >= 0x30 && data[pos] <= 0x39) {
287
+ objNumStr += String.fromCharCode(data[pos]);
288
+ pos++;
289
+ }
290
+ if (objNumStr.length === 0 || pos >= data.length || data[pos] !== 0x20) {
291
+ pos = savedPos + 1;
292
+ continue;
293
+ }
294
+ pos++; // skip space
295
+ let genStr = "";
296
+ while (pos < data.length && data[pos] >= 0x30 && data[pos] <= 0x39) {
297
+ genStr += String.fromCharCode(data[pos]);
298
+ pos++;
299
+ }
300
+ if (genStr.length === 0 || pos >= data.length || data[pos] !== 0x20) {
301
+ pos = savedPos + 1;
302
+ continue;
303
+ }
304
+ pos++; // skip space
305
+ // Check for "obj" keyword
306
+ if (pos + objKeyword.length <= data.length &&
307
+ data[pos] === objKeyword[0] &&
308
+ data[pos + 1] === objKeyword[1] &&
309
+ data[pos + 2] === objKeyword[2]) {
310
+ // Verify the character after "obj" is whitespace or delimiter
311
+ const afterObj = pos + 3;
312
+ if (afterObj >= data.length ||
313
+ data[afterObj] === 0x20 ||
314
+ data[afterObj] === 0x0a ||
315
+ data[afterObj] === 0x0d ||
316
+ data[afterObj] === 0x09 ||
317
+ data[afterObj] === 0x3c // '<' for immediate dict/stream
318
+ ) {
319
+ const objNum = parseInt(objNumStr, 10);
320
+ const gen = parseInt(genStr, 10);
321
+ if (!this.xref.has(objNum)) {
322
+ this.xref.set(objNum, { offset: savedPos, gen, type: 1 });
323
+ }
324
+ }
325
+ }
326
+ pos = savedPos + 1;
327
+ }
328
+ if (this.xref.size === 0) {
329
+ throw new errors_1.PdfStructureError("Could not reconstruct xref: no objects found");
330
+ }
331
+ // Try to find a trailer dictionary by scanning for "trailer" keyword
332
+ const trailerKeyword = _encoder.encode("trailer");
333
+ const trailerPos = this.tokenizer.findSequenceBackward(trailerKeyword);
334
+ if (trailerPos >= 0) {
335
+ this.tokenizer.pos = trailerPos + trailerKeyword.length;
336
+ this.tokenizer.skipWhitespaceAndComments();
337
+ try {
338
+ const trailerObj = (0, pdf_parser_1.parseObject)(this.tokenizer);
339
+ if ((0, pdf_parser_1.isPdfDict)(trailerObj)) {
340
+ return trailerObj;
341
+ }
342
+ }
343
+ catch {
344
+ // Fall through to synthetic trailer
345
+ }
346
+ }
347
+ // Build a synthetic trailer by finding the Root catalog
348
+ const syntheticTrailer = new Map();
349
+ syntheticTrailer.set("Size", this.xref.size);
350
+ // Scan resolved objects to find the catalog (the one with /Type /Catalog)
351
+ for (const [objNum, entry] of this.xref) {
352
+ if (entry.type !== 1) {
353
+ continue;
354
+ }
355
+ try {
356
+ this.tokenizer.pos = entry.offset;
357
+ const obj = (0, pdf_parser_1.parseObject)(this.tokenizer);
358
+ if ((0, pdf_parser_1.isPdfDict)(obj)) {
359
+ const typeVal = (0, pdf_parser_1.dictGetName)(obj, "Type");
360
+ if (typeVal === "Catalog") {
361
+ syntheticTrailer.set("Root", { type: "ref", objNum, gen: entry.gen });
362
+ break;
363
+ }
364
+ }
365
+ else if ((0, pdf_parser_1.isPdfStream)(obj)) {
366
+ const typeVal = (0, pdf_parser_1.dictGetName)(obj.dict, "Type");
367
+ if (typeVal === "Catalog") {
368
+ syntheticTrailer.set("Root", { type: "ref", objNum, gen: entry.gen });
369
+ break;
370
+ }
371
+ }
372
+ }
373
+ catch {
374
+ // Skip unparseable objects
375
+ }
376
+ }
377
+ return syntheticTrailer;
378
+ }
379
+ /**
380
+ * Merge trailer entries from an older trailer into the current one.
381
+ * Only adds keys that don't already exist.
382
+ */
383
+ mergeTrailer(current, older) {
384
+ for (const [key, value] of older) {
385
+ if (!current.has(key)) {
386
+ current.set(key, value);
387
+ }
388
+ }
389
+ }
390
+ // ===========================================================================
391
+ // Object Resolution
392
+ // ===========================================================================
393
+ /**
394
+ * Resolve a PDF object by its object number and generation.
395
+ * Returns null if the object doesn't exist.
396
+ */
397
+ resolve(objNum, gen = 0) {
398
+ const cacheKey = `${objNum}:${gen}`;
399
+ if (this.cache.has(cacheKey)) {
400
+ return this.cache.get(cacheKey);
401
+ }
402
+ const entry = this.xref.get(objNum);
403
+ if (!entry) {
404
+ return null;
405
+ }
406
+ let obj = null;
407
+ if (entry.type === 1) {
408
+ // Uncompressed object — parse directly at offset
409
+ obj = this.parseObjectAt(entry.offset, objNum, entry.gen);
410
+ }
411
+ else if (entry.type === 2) {
412
+ // Compressed object in an object stream
413
+ obj = this.parseCompressedObject(entry.offset, entry.gen);
414
+ }
415
+ // Decrypt string values within the resolved object (V1-V4 per-object encryption)
416
+ if (obj !== null && this.decryptFn) {
417
+ obj = this.decryptObjectStrings(obj, objNum, entry.gen);
418
+ }
419
+ if (obj !== null) {
420
+ this.cache.set(cacheKey, obj);
421
+ }
422
+ return obj;
423
+ }
424
+ /**
425
+ * Resolve a PDF object and return it along with its object/generation numbers.
426
+ * Useful for tracking which object a value came from (for decryption).
427
+ *
428
+ * @param objNum - The object number to resolve
429
+ * @param gen - The generation number (default 0)
430
+ * @returns The resolved object with its objNum and gen for decryption context
431
+ */
432
+ resolveWithObjNum(objNum, gen = 0) {
433
+ const obj = this.resolve(objNum, gen);
434
+ return { obj, objNum, gen };
435
+ }
436
+ /**
437
+ * Dereference a PdfRef to its actual object value.
438
+ * If the input is not a PdfRef, returns it as-is.
439
+ */
440
+ deref(obj) {
441
+ if (obj === null || obj === undefined) {
442
+ return null;
443
+ }
444
+ if ((0, pdf_parser_1.isPdfRef)(obj)) {
445
+ return this.resolve(obj.objNum, obj.gen);
446
+ }
447
+ return obj;
448
+ }
449
+ /**
450
+ * Dereference a PdfRef and assert it's a dictionary.
451
+ */
452
+ derefDict(obj) {
453
+ const resolved = this.deref(obj);
454
+ if (resolved === null) {
455
+ return null;
456
+ }
457
+ if ((0, pdf_parser_1.isPdfDict)(resolved)) {
458
+ return resolved;
459
+ }
460
+ if ((0, pdf_parser_1.isPdfStream)(resolved)) {
461
+ return resolved.dict;
462
+ }
463
+ return null;
464
+ }
465
+ /**
466
+ * Dereference a PdfRef and get the stream, along with the objNum/gen
467
+ * needed for correct per-object decryption.
468
+ */
469
+ derefStream(obj) {
470
+ const resolved = this.deref(obj);
471
+ if (resolved === null) {
472
+ return null;
473
+ }
474
+ if ((0, pdf_parser_1.isPdfStream)(resolved)) {
475
+ return resolved;
476
+ }
477
+ return null;
478
+ }
479
+ /**
480
+ * Dereference a PdfRef and get the stream with its object number and generation.
481
+ * Returns null if the object is not a stream.
482
+ * The objNum/gen are needed for correct per-object decryption (V1-V4).
483
+ */
484
+ derefStreamWithObjNum(obj) {
485
+ if (obj === null || obj === undefined) {
486
+ return null;
487
+ }
488
+ let objNum = 0;
489
+ let gen = 0;
490
+ if ((0, pdf_parser_1.isPdfRef)(obj)) {
491
+ objNum = obj.objNum;
492
+ gen = obj.gen;
493
+ }
494
+ const resolved = this.deref(obj);
495
+ if (resolved === null) {
496
+ return null;
497
+ }
498
+ if ((0, pdf_parser_1.isPdfStream)(resolved)) {
499
+ return { stream: resolved, objNum, gen };
500
+ }
501
+ return null;
502
+ }
503
+ /**
504
+ * Get decoded stream data from a stream object.
505
+ * Applies filter chain decoding and decryption.
506
+ *
507
+ * When objNum/gen are not provided (default 0), decryption may not
508
+ * produce correct results. Use {@link resolveWithObjNum} to obtain
509
+ * the correct objNum/gen for the stream's containing object.
510
+ */
511
+ getStreamData(stream, objNum = 0, gen = 0) {
512
+ let data = stream.data;
513
+ // Decrypt stream data if encryption is active
514
+ if (this.decryptFn) {
515
+ data = this.decryptFn(data, objNum, gen);
516
+ }
517
+ return (0, stream_filters_1.decodeStreamFilters)(data, stream.dict);
518
+ }
519
+ /**
520
+ * Decrypt a string value (bytes) if encryption is active.
521
+ */
522
+ decryptString(bytes, objNum, gen) {
523
+ if (this.decryptFn) {
524
+ return this.decryptFn(bytes, objNum, gen);
525
+ }
526
+ return bytes;
527
+ }
528
+ /**
529
+ * Decode a PDF string to a JS string, with optional decryption.
530
+ */
531
+ decodeString(bytes, objNum = 0, gen = 0) {
532
+ const decrypted = this.decryptString(bytes, objNum, gen);
533
+ return (0, pdf_parser_1.decodePdfStringBytes)(decrypted);
534
+ }
535
+ /**
536
+ * Recursively decrypt all string values (Uint8Array) within a parsed PDF object.
537
+ * PDF spec requires all strings in an encrypted document to be decrypted using
538
+ * the per-object key derived from the containing object's objNum/gen.
539
+ * Streams are NOT decrypted here — they are decrypted in getStreamData().
540
+ */
541
+ decryptObjectStrings(obj, objNum, gen) {
542
+ if (obj === null || typeof obj !== "object") {
543
+ return obj;
544
+ }
545
+ // Decrypt Uint8Array string values
546
+ if (obj instanceof Uint8Array) {
547
+ return this.decryptFn(obj, objNum, gen);
548
+ }
549
+ // Recurse into dictionaries
550
+ if ((0, pdf_parser_1.isPdfDict)(obj)) {
551
+ const decrypted = new Map();
552
+ for (const [key, value] of obj) {
553
+ decrypted.set(key, this.decryptObjectStrings(value, objNum, gen));
554
+ }
555
+ return decrypted;
556
+ }
557
+ // Recurse into arrays
558
+ if ((0, pdf_parser_1.isPdfArray)(obj)) {
559
+ return obj.map(item => this.decryptObjectStrings(item, objNum, gen));
560
+ }
561
+ // Decrypt strings inside stream dicts (but NOT the stream data itself)
562
+ if ((0, pdf_parser_1.isPdfStream)(obj)) {
563
+ const decryptedDict = this.decryptObjectStrings(obj.dict, objNum, gen);
564
+ return { type: "stream", dict: decryptedDict, data: obj.data };
565
+ }
566
+ return obj;
567
+ }
568
+ /**
569
+ * Get the catalog dictionary (the root of the document structure).
570
+ */
571
+ getCatalog() {
572
+ const rootRef = (0, pdf_parser_1.dictGetRef)(this.trailer, "Root");
573
+ if (!rootRef) {
574
+ throw new errors_1.PdfStructureError("No /Root in trailer");
575
+ }
576
+ const catalog = this.derefDict(rootRef);
577
+ if (!catalog) {
578
+ throw new errors_1.PdfStructureError("Could not resolve catalog");
579
+ }
580
+ return catalog;
581
+ }
582
+ /**
583
+ * Get the pages array from the page tree.
584
+ * Returns an array of page dictionaries in order.
585
+ */
586
+ getPages() {
587
+ return this.getPagesWithObjInfo().map(p => p.dict);
588
+ }
589
+ /**
590
+ * Get pages with their object numbers (needed for correct decryption of
591
+ * inline streams within page objects).
592
+ */
593
+ getPagesWithObjInfo() {
594
+ const catalog = this.getCatalog();
595
+ const pagesRef = catalog.get("Pages");
596
+ const pagesDict = this.derefDict(pagesRef);
597
+ if (!pagesDict) {
598
+ throw new errors_1.PdfStructureError("Could not resolve /Pages");
599
+ }
600
+ const pages = [];
601
+ const visited = new Set();
602
+ this.collectPages(pagesDict, pages, visited);
603
+ return pages;
604
+ }
605
+ /**
606
+ * Recursively collect page dictionaries from the page tree.
607
+ * Uses a visited set to prevent infinite recursion on cyclic page trees.
608
+ */
609
+ collectPages(node, pages, visited) {
610
+ if (visited.has(node)) {
611
+ return; // Cycle guard
612
+ }
613
+ visited.add(node);
614
+ const type = (0, pdf_parser_1.dictGetName)(node, "Type");
615
+ if (type === "Page") {
616
+ // We don't know the objNum from here — it was lost during deref.
617
+ // Use 0 as fallback; callers that need objNum should use getPagesWithObjInfo().
618
+ pages.push({ dict: node, objNum: 0, gen: 0 });
619
+ return;
620
+ }
621
+ // Pages node — recurse into Kids
622
+ const kids = (0, pdf_parser_1.dictGetArray)(node, "Kids");
623
+ if (!kids) {
624
+ return;
625
+ }
626
+ for (const kid of kids) {
627
+ let objNum = 0;
628
+ let gen = 0;
629
+ if ((0, pdf_parser_1.isPdfRef)(kid)) {
630
+ objNum = kid.objNum;
631
+ gen = kid.gen;
632
+ }
633
+ const childDict = this.derefDict(kid);
634
+ if (childDict) {
635
+ const childType = (0, pdf_parser_1.dictGetName)(childDict, "Type");
636
+ if (childType === "Page") {
637
+ pages.push({ dict: childDict, objNum, gen });
638
+ }
639
+ else {
640
+ this.collectPages(childDict, pages, visited);
641
+ }
642
+ }
643
+ }
644
+ }
645
+ /**
646
+ * Get the object number for a given object reference.
647
+ * Useful for tracking which object a value came from (for decryption).
648
+ */
649
+ getObjNumForRef(ref) {
650
+ return ref.objNum;
651
+ }
652
+ // ===========================================================================
653
+ // Low-level Object Parsing
654
+ // ===========================================================================
655
+ /**
656
+ * Parse an object definition at the given byte offset.
657
+ */
658
+ parseObjectAt(offset, objNum, _gen) {
659
+ this.tokenizer.pos = offset;
660
+ try {
661
+ const obj = (0, pdf_parser_1.parseObject)(this.tokenizer);
662
+ return obj;
663
+ }
664
+ catch {
665
+ return null;
666
+ }
667
+ }
668
+ /**
669
+ * Parse a compressed object from an object stream.
670
+ * @param objStmNum - The object number of the object stream
671
+ * @param index - The index of the object within the stream
672
+ */
673
+ parseCompressedObject(objStmNum, index) {
674
+ // Resolve the object stream itself (must be type 1 — not recursive)
675
+ const stmCacheKey = `objstm:${objStmNum}`;
676
+ let stmObjects;
677
+ if (this.cache.has(stmCacheKey)) {
678
+ stmObjects = this.cache.get(stmCacheKey);
679
+ }
680
+ else {
681
+ stmObjects = this.parseObjectStream(objStmNum) ?? undefined;
682
+ if (stmObjects) {
683
+ this.cache.set(stmCacheKey, stmObjects);
684
+ }
685
+ }
686
+ if (!stmObjects) {
687
+ return null;
688
+ }
689
+ // The index field in the xref is the index within the object stream
690
+ // We need to find the object by its index position
691
+ let i = 0;
692
+ for (const [, value] of stmObjects) {
693
+ if (i === index) {
694
+ return value;
695
+ }
696
+ i++;
697
+ }
698
+ return null;
699
+ }
700
+ /**
701
+ * Parse all objects from an object stream.
702
+ * @returns Map of object number → object value
703
+ */
704
+ parseObjectStream(objStmNum) {
705
+ const entry = this.xref.get(objStmNum);
706
+ if (!entry || entry.type !== 1) {
707
+ return null;
708
+ }
709
+ this.tokenizer.pos = entry.offset;
710
+ const stmObj = (0, pdf_parser_1.parseObject)(this.tokenizer);
711
+ if (!(0, pdf_parser_1.isPdfStream)(stmObj)) {
712
+ return null;
713
+ }
714
+ const dict = stmObj.dict;
715
+ const n = (0, pdf_parser_1.dictGetNumber)(dict, "N") ?? 0;
716
+ const first = (0, pdf_parser_1.dictGetNumber)(dict, "First") ?? 0;
717
+ // Decode stream data (pass objStmNum/gen for correct decryption)
718
+ const streamData = this.getStreamData(stmObj, objStmNum, entry.gen);
719
+ // Parse the N pairs of (objNum offset) before 'first'
720
+ const headerTokenizer = new pdf_tokenizer_1.PdfTokenizer(streamData);
721
+ const pairs = [];
722
+ for (let i = 0; i < n; i++) {
723
+ const numTok = headerTokenizer.next();
724
+ const offTok = headerTokenizer.next();
725
+ if (numTok.type === 0 /* TokenType.Number */ && offTok.type === 0 /* TokenType.Number */) {
726
+ pairs.push([numTok.numValue, offTok.numValue]);
727
+ }
728
+ }
729
+ // Parse each object
730
+ const result = new Map();
731
+ for (const [objectNumber, relOffset] of pairs) {
732
+ const objTokenizer = new pdf_tokenizer_1.PdfTokenizer(streamData, first + relOffset);
733
+ try {
734
+ const obj = (0, pdf_parser_1.parseObject)(objTokenizer);
735
+ result.set(objectNumber, obj);
736
+ }
737
+ catch {
738
+ // Skip unparseable objects
739
+ }
740
+ }
741
+ return result;
742
+ }
743
+ /**
744
+ * Resolve a page's bounding box (MediaBox/CropBox) with indirect ref resolution
745
+ * and parent inheritance. Returns `{ width, height }` or null if no box found.
746
+ *
747
+ * This is a shared helper so callers don't duplicate box resolution logic.
748
+ */
749
+ resolvePageBox(pageDict, visited) {
750
+ const seen = visited ?? new Set();
751
+ if (seen.has(pageDict)) {
752
+ return null; // Cycle guard
753
+ }
754
+ seen.add(pageDict);
755
+ for (const key of ["MediaBox", "CropBox"]) {
756
+ const raw = pageDict.get(key);
757
+ if (!raw) {
758
+ continue;
759
+ }
760
+ // Dereference in case the box is an indirect reference
761
+ const resolved = this.deref(raw);
762
+ if (Array.isArray(resolved) && resolved.length === 4) {
763
+ const width = Math.abs(resolved[2] - resolved[0]);
764
+ const height = Math.abs(resolved[3] - resolved[1]);
765
+ if (width > 0 && height > 0) {
766
+ return { width, height };
767
+ }
768
+ }
769
+ }
770
+ // Inherit from parent
771
+ const parent = pageDict.get("Parent");
772
+ if (parent) {
773
+ const parentDict = this.derefDict(parent);
774
+ if (parentDict) {
775
+ return this.resolvePageBox(parentDict, seen);
776
+ }
777
+ }
778
+ return null;
779
+ }
780
+ /**
781
+ * Resolve a page's Resources dictionary, inheriting from parent pages if needed.
782
+ * Protected against cyclic parent chains.
783
+ */
784
+ resolvePageResources(pageDict, visited) {
785
+ const seen = visited ?? new Set();
786
+ if (seen.has(pageDict)) {
787
+ return new Map(); // Cycle guard
788
+ }
789
+ seen.add(pageDict);
790
+ const resources = pageDict.get("Resources");
791
+ if (resources) {
792
+ const resolved = this.derefDict(resources);
793
+ if (resolved) {
794
+ return resolved;
795
+ }
796
+ }
797
+ const parent = pageDict.get("Parent");
798
+ if (parent) {
799
+ const parentDict = this.derefDict(parent);
800
+ if (parentDict) {
801
+ return this.resolvePageResources(parentDict, seen);
802
+ }
803
+ }
804
+ return new Map();
805
+ }
806
+ }
807
+ exports.PdfDocument = PdfDocument;
808
+ // =============================================================================
809
+ // Helpers
810
+ // =============================================================================
811
/**
 * Decode an unsigned big-endian integer spanning `width` bytes.
 *
 * The value is accumulated arithmetically (multiply by 256 and add) rather
 * than with bit shifts, so results of 2^31 and above (e.g. large file
 * offsets) do not wrap into negative 32-bit numbers. Bytes that lie outside
 * `data` are read as 0.
 *
 * @param data - Indexable byte container
 * @param offset - Index of the most significant byte
 * @param width - Number of bytes to read
 * @returns The decoded integer (0 when `width` is 0)
 */
function readIntBE(data, offset, width) {
    const end = offset + width;
    let result = 0;
    for (let pos = offset; pos < end; pos++) {
        const byte = data[pos] ?? 0;
        result = result * 256 + byte;
    }
    return result;
}