@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,818 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF document parser.
|
|
3
|
+
*
|
|
4
|
+
* Handles the high-level PDF file structure:
|
|
5
|
+
* - Locating startxref
|
|
6
|
+
* - Parsing cross-reference tables (traditional and stream-based)
|
|
7
|
+
* - Reading trailer dictionaries
|
|
8
|
+
* - Resolving indirect object references
|
|
9
|
+
* - Handling incremental updates
|
|
10
|
+
*
|
|
11
|
+
* @see PDF Reference 1.7, §3.4 - File Structure
|
|
12
|
+
*/
|
|
13
|
+
import { PdfTokenizer, TokenType } from "./pdf-tokenizer.js";
|
|
14
|
+
import { parseObject, isPdfDict, isPdfStream, isPdfRef, isPdfArray, dictGetNumber, dictGetRef, dictGetArray, dictGetName, decodePdfStringBytes } from "./pdf-parser.js";
|
|
15
|
+
import { decodeStreamFilters } from "./stream-filters.js";
|
|
16
|
+
import { PdfStructureError } from "../errors.js";
|
|
17
|
+
// =============================================================================
|
|
18
|
+
// Module-level cached TextEncoder
|
|
19
|
+
// =============================================================================
|
|
20
|
+
/** Cached TextEncoder instance to avoid repeated allocation in hot paths */
|
|
21
|
+
const _encoder = new TextEncoder();
|
|
22
|
+
// =============================================================================
|
|
23
|
+
// PDF Document
|
|
24
|
+
// =============================================================================
|
|
25
|
+
/**
|
|
26
|
+
* Parsed PDF document with lazy object resolution.
|
|
27
|
+
*
|
|
28
|
+
* Reads the cross-reference table and trailer on construction,
|
|
29
|
+
* then resolves individual objects on demand with caching.
|
|
30
|
+
*/
|
|
31
|
+
export class PdfDocument {
|
|
32
|
+
constructor(data) {
    // objNum -> xref entry ({ offset, gen, type }); populated by parseFileStructure()
    this.xref = new Map();
    // "objNum:gen" -> resolved object; avoids re-parsing on repeated resolve() calls
    this.cache = new Map();
    /** Encryption handler (set externally after decryption is initialized) */
    this.decryptFn = null;
    // Tokenizer owns the raw byte buffer; all offsets below index into it
    this.tokenizer = new PdfTokenizer(data);
    // Xref chain + trailer are parsed eagerly; individual objects stay lazy
    this.trailer = this.parseFileStructure();
}
|
|
40
|
+
/**
 * Get the underlying raw data (the byte buffer held by the tokenizer).
 */
get data() {
    return this.tokenizer.bytes;
}
|
|
44
|
+
// ===========================================================================
|
|
45
|
+
// File Structure Parsing
|
|
46
|
+
// ===========================================================================
|
|
47
|
+
parseFileStructure() {
|
|
48
|
+
try {
|
|
49
|
+
const startxrefOffset = this.findStartxref();
|
|
50
|
+
return this.parseXrefChain(startxrefOffset);
|
|
51
|
+
}
|
|
52
|
+
catch {
|
|
53
|
+
// If normal xref parsing fails, attempt full-file reconstruction
|
|
54
|
+
return this.reconstructXref();
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
/**
|
|
58
|
+
* Find the startxref offset by scanning backward from EOF.
|
|
59
|
+
*/
|
|
60
|
+
findStartxref() {
|
|
61
|
+
const data = this.tokenizer.bytes;
|
|
62
|
+
const startxrefKeyword = _encoder.encode("startxref");
|
|
63
|
+
const pos = this.tokenizer.findSequenceBackward(startxrefKeyword);
|
|
64
|
+
if (pos < 0) {
|
|
65
|
+
throw new PdfStructureError("Could not find startxref keyword");
|
|
66
|
+
}
|
|
67
|
+
// Position after "startxref"
|
|
68
|
+
this.tokenizer.pos = pos + startxrefKeyword.length;
|
|
69
|
+
this.tokenizer.skipWhitespaceAndComments();
|
|
70
|
+
// Read the offset number
|
|
71
|
+
let numStr = "";
|
|
72
|
+
while (this.tokenizer.pos < data.length) {
|
|
73
|
+
const b = data[this.tokenizer.pos];
|
|
74
|
+
if (b >= 0x30 && b <= 0x39) {
|
|
75
|
+
numStr += String.fromCharCode(b);
|
|
76
|
+
this.tokenizer.pos++;
|
|
77
|
+
}
|
|
78
|
+
else {
|
|
79
|
+
break;
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
const offset = parseInt(numStr, 10);
|
|
83
|
+
if (isNaN(offset)) {
|
|
84
|
+
throw new PdfStructureError("Invalid startxref offset");
|
|
85
|
+
}
|
|
86
|
+
return offset;
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Parse the xref chain starting at the given offset.
|
|
90
|
+
* Follows /Prev links for incremental updates.
|
|
91
|
+
* Returns the merged trailer dictionary.
|
|
92
|
+
*/
|
|
93
|
+
parseXrefChain(startOffset) {
|
|
94
|
+
let trailerDict = null;
|
|
95
|
+
let offset = startOffset;
|
|
96
|
+
const visited = new Set();
|
|
97
|
+
while (offset !== null) {
|
|
98
|
+
if (visited.has(offset)) {
|
|
99
|
+
break; // Prevent infinite loops
|
|
100
|
+
}
|
|
101
|
+
visited.add(offset);
|
|
102
|
+
this.tokenizer.pos = offset;
|
|
103
|
+
this.tokenizer.skipWhitespaceAndComments();
|
|
104
|
+
// Check if this is a traditional xref table or an xref stream
|
|
105
|
+
const peekStart = this.tokenizer.pos;
|
|
106
|
+
const firstToken = this.tokenizer.next();
|
|
107
|
+
if (firstToken.type === TokenType.Keyword && firstToken.strValue === "xref") {
|
|
108
|
+
// Traditional xref table
|
|
109
|
+
const trailer = this.parseTraditionalXref();
|
|
110
|
+
if (!trailerDict) {
|
|
111
|
+
trailerDict = trailer;
|
|
112
|
+
}
|
|
113
|
+
else {
|
|
114
|
+
// Merge: first trailer wins for Root, Info, Encrypt, ID
|
|
115
|
+
this.mergeTrailer(trailerDict, trailer);
|
|
116
|
+
}
|
|
117
|
+
const prev = dictGetNumber(trailer, "Prev");
|
|
118
|
+
offset = prev ?? null;
|
|
119
|
+
}
|
|
120
|
+
else if (firstToken.type === TokenType.Number) {
|
|
121
|
+
// Xref stream (PDF 1.5+): starts with `N gen obj`
|
|
122
|
+
this.tokenizer.pos = peekStart;
|
|
123
|
+
const trailer = this.parseXrefStream(offset);
|
|
124
|
+
if (!trailerDict) {
|
|
125
|
+
trailerDict = trailer;
|
|
126
|
+
}
|
|
127
|
+
else {
|
|
128
|
+
this.mergeTrailer(trailerDict, trailer);
|
|
129
|
+
}
|
|
130
|
+
const prev = dictGetNumber(trailer, "Prev");
|
|
131
|
+
offset = prev ?? null;
|
|
132
|
+
}
|
|
133
|
+
else {
|
|
134
|
+
throw new PdfStructureError(`Invalid xref at offset ${offset}: expected 'xref' keyword or xref stream`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
if (!trailerDict) {
|
|
138
|
+
throw new PdfStructureError("No trailer found");
|
|
139
|
+
}
|
|
140
|
+
return trailerDict;
|
|
141
|
+
}
|
|
142
|
+
/**
|
|
143
|
+
* Parse a traditional xref table and its trailer.
|
|
144
|
+
*/
|
|
145
|
+
/**
 * Parse a traditional xref table and its trailer.
 *
 * Assumes the "xref" keyword was already consumed by the caller.
 * Reads zero or more subsections, each a `startObj count` header
 * followed by `count` fixed-format entries, until the "trailer"
 * keyword (or a non-number token) is reached.
 *
 * Because the newest xref section is parsed first, the
 * `!this.xref.has(objNum)` check implements first-seen-wins, which
 * keeps the newest definition of each object.
 *
 * @returns the trailer dictionary following the table
 * @throws {PdfStructureError} on a malformed subsection header or a
 *         missing trailer dictionary
 */
parseTraditionalXref() {
    // The "xref" keyword has already been consumed
    while (true) {
        this.tokenizer.skipWhitespaceAndComments();
        // Check if we've hit the trailer
        const peekPos = this.tokenizer.pos;
        const token = this.tokenizer.next();
        if (token.type === TokenType.Keyword && token.strValue === "trailer") {
            break;
        }
        // Subsection header: startObj count
        if (token.type !== TokenType.Number) {
            // End of xref sections — rewind so the stray token is not lost
            this.tokenizer.pos = peekPos;
            break;
        }
        const startObj = token.numValue;
        const countToken = this.tokenizer.next();
        if (countToken.type !== TokenType.Number) {
            throw new PdfStructureError("Invalid xref subsection header");
        }
        const count = countToken.numValue;
        // Parse entries
        for (let i = 0; i < count; i++) {
            const objNum = startObj + i;
            this.tokenizer.skipWhitespaceAndComments();
            // Each entry is exactly "OOOOOOOOOO GGGGG n \n" or "OOOOOOOOOO GGGGG f \n"
            // (offset, generation, in-use/free flag); read line-wise and
            // split on whitespace to tolerate slight formatting drift.
            const line = this.tokenizer.readLine();
            const parts = line.trim().split(/\s+/);
            if (parts.length < 3) {
                continue;
            }
            const entryOffset = parseInt(parts[0], 10);
            const gen = parseInt(parts[1], 10);
            const inUse = parts[2] === "n";
            // Only record in-use entries; free ("f") entries are skipped.
            if (inUse && !this.xref.has(objNum)) {
                this.xref.set(objNum, { offset: entryOffset, gen, type: 1 });
            }
        }
    }
    // Parse the trailer dictionary
    this.tokenizer.skipWhitespaceAndComments();
    const trailerObj = parseObject(this.tokenizer);
    if (!isPdfDict(trailerObj)) {
        throw new PdfStructureError("Expected dictionary after 'trailer' keyword");
    }
    return trailerObj;
}
|
|
193
|
+
/**
|
|
194
|
+
* Parse a cross-reference stream (PDF 1.5+).
|
|
195
|
+
*/
|
|
196
|
+
/**
 * Parse a cross-reference stream (PDF 1.5+).
 *
 * The object at `offset` must be a stream whose dict carries
 * /Type /XRef. Its decoded payload is a table of fixed-width binary
 * entries; the /W array gives the byte width of the three fields
 * (type, offset-or-objstm, gen-or-index) and /Index lists the
 * (startObj, count) subsections covered.
 *
 * @param offset - byte offset of the `N gen obj` header
 * @returns the stream dictionary (it doubles as this section's trailer)
 * @throws {PdfStructureError} on a malformed xref stream
 */
parseXrefStream(offset) {
    this.tokenizer.pos = offset;
    const obj = parseObject(this.tokenizer);
    if (!isPdfStream(obj)) {
        throw new PdfStructureError("Expected xref stream object");
    }
    const dict = obj.dict;
    const type = dictGetName(dict, "Type");
    if (type !== "XRef") {
        throw new PdfStructureError(`Expected /Type /XRef, got /Type /${type}`);
    }
    // Decode the stream data
    const streamData = decodeStreamFilters(obj.data, dict);
    // Parse W array: [fieldSizeType, fieldSizeOffset, fieldSizeGen]
    const wArray = dictGetArray(dict, "W");
    if (!wArray || wArray.length < 3) {
        throw new PdfStructureError("Invalid /W array in xref stream");
    }
    const w0 = wArray[0];
    const w1 = wArray[1];
    const w2 = wArray[2];
    const entrySize = w0 + w1 + w2;
    // Parse Index array (default: [0 Size])
    const size = dictGetNumber(dict, "Size") ?? 0;
    let indexArray = dictGetArray(dict, "Index");
    if (!indexArray) {
        indexArray = [0, size];
    }
    // Process entries: indexArray is (startObj, count) pairs, and the
    // binary payload supplies `count` consecutive entries per pair.
    let dataOffset = 0;
    for (let i = 0; i < indexArray.length; i += 2) {
        const startObj = indexArray[i];
        const count = indexArray[i + 1];
        for (let j = 0; j < count; j++) {
            // Stop rather than read past a truncated payload
            if (dataOffset + entrySize > streamData.length) {
                break;
            }
            const objNum = startObj + j;
            // A zero-width type field means every entry is type 1
            const fieldType = w0 > 0 ? readIntBE(streamData, dataOffset, w0) : 1;
            const field2 = readIntBE(streamData, dataOffset + w0, w1);
            const field3 = w2 > 0 ? readIntBE(streamData, dataOffset + w0 + w1, w2) : 0;
            dataOffset += entrySize;
            if (this.xref.has(objNum)) {
                continue; // First entry wins
            }
            if (fieldType === 0) {
                // Free object — skip
            }
            else if (fieldType === 1) {
                // Uncompressed object: field2 = byte offset, field3 = generation
                this.xref.set(objNum, { offset: field2, gen: field3, type: 1 });
            }
            else if (fieldType === 2) {
                // Compressed object in object stream: field2 = objstm number, field3 = index
                this.xref.set(objNum, { offset: field2, gen: field3, type: 2 });
            }
        }
    }
    return dict;
}
|
|
256
|
+
/**
|
|
257
|
+
* Reconstruct the xref table by scanning the entire file for `N N obj` patterns.
|
|
258
|
+
* This is a fallback for corrupted or broken PDFs where the normal xref parsing fails.
|
|
259
|
+
*
|
|
260
|
+
* @returns A synthetic trailer dictionary
|
|
261
|
+
*/
|
|
262
|
+
/**
 * Reconstruct the xref table by scanning the entire file for `N N obj` patterns.
 * This is a fallback for corrupted or broken PDFs where the normal xref parsing fails.
 *
 * Any previously collected xref entries are discarded. If a "trailer"
 * keyword can still be found, its dictionary is used; otherwise a
 * synthetic trailer is built by locating the /Type /Catalog object.
 *
 * @returns A synthetic trailer dictionary
 * @throws {PdfStructureError} when no objects can be found at all
 */
reconstructXref() {
    const data = this.tokenizer.bytes;
    this.xref.clear();
    // Regex-style scan: look for patterns like "123 0 obj" in the raw bytes
    // We scan byte-by-byte looking for digit sequences followed by spaces and "obj"
    const objKeyword = _encoder.encode("obj");
    let pos = 0;
    while (pos < data.length - 5) {
        // Skip to a potential start of an object definition (digit character)
        if (data[pos] < 0x30 || data[pos] > 0x39) {
            pos++;
            continue;
        }
        // Ensure we're at a line boundary or start of file
        // (rejects digits embedded mid-token, e.g. inside a longer number)
        if (pos > 0 && data[pos - 1] !== 0x0a && data[pos - 1] !== 0x0d && data[pos - 1] !== 0x20) {
            pos++;
            continue;
        }
        // Try to read: objNum gen obj
        const savedPos = pos;
        let objNumStr = "";
        while (pos < data.length && data[pos] >= 0x30 && data[pos] <= 0x39) {
            objNumStr += String.fromCharCode(data[pos]);
            pos++;
        }
        // Exactly one space must separate objNum and gen
        if (objNumStr.length === 0 || pos >= data.length || data[pos] !== 0x20) {
            pos = savedPos + 1;
            continue;
        }
        pos++; // skip space
        let genStr = "";
        while (pos < data.length && data[pos] >= 0x30 && data[pos] <= 0x39) {
            genStr += String.fromCharCode(data[pos]);
            pos++;
        }
        if (genStr.length === 0 || pos >= data.length || data[pos] !== 0x20) {
            pos = savedPos + 1;
            continue;
        }
        pos++; // skip space
        // Check for "obj" keyword
        if (pos + objKeyword.length <= data.length &&
            data[pos] === objKeyword[0] &&
            data[pos + 1] === objKeyword[1] &&
            data[pos + 2] === objKeyword[2]) {
            // Verify the character after "obj" is whitespace or delimiter
            const afterObj = pos + 3;
            if (afterObj >= data.length ||
                data[afterObj] === 0x20 ||
                data[afterObj] === 0x0a ||
                data[afterObj] === 0x0d ||
                data[afterObj] === 0x09 ||
                data[afterObj] === 0x3c // '<' for immediate dict/stream
            ) {
                const objNum = parseInt(objNumStr, 10);
                const gen = parseInt(genStr, 10);
                // First occurrence wins for duplicate object numbers
                if (!this.xref.has(objNum)) {
                    this.xref.set(objNum, { offset: savedPos, gen, type: 1 });
                }
            }
        }
        // Resume the scan one byte past the candidate start, so overlapping
        // candidates are not missed
        pos = savedPos + 1;
    }
    if (this.xref.size === 0) {
        throw new PdfStructureError("Could not reconstruct xref: no objects found");
    }
    // Try to find a trailer dictionary by scanning for "trailer" keyword
    const trailerKeyword = _encoder.encode("trailer");
    const trailerPos = this.tokenizer.findSequenceBackward(trailerKeyword);
    if (trailerPos >= 0) {
        this.tokenizer.pos = trailerPos + trailerKeyword.length;
        this.tokenizer.skipWhitespaceAndComments();
        try {
            const trailerObj = parseObject(this.tokenizer);
            if (isPdfDict(trailerObj)) {
                return trailerObj;
            }
        }
        catch {
            // Fall through to synthetic trailer
        }
    }
    // Build a synthetic trailer by finding the Root catalog
    const syntheticTrailer = new Map();
    syntheticTrailer.set("Size", this.xref.size);
    // Scan resolved objects to find the catalog (the one with /Type /Catalog)
    for (const [objNum, entry] of this.xref) {
        if (entry.type !== 1) {
            continue;
        }
        try {
            this.tokenizer.pos = entry.offset;
            const obj = parseObject(this.tokenizer);
            if (isPdfDict(obj)) {
                const typeVal = dictGetName(obj, "Type");
                if (typeVal === "Catalog") {
                    syntheticTrailer.set("Root", { type: "ref", objNum, gen: entry.gen });
                    break;
                }
            }
            else if (isPdfStream(obj)) {
                const typeVal = dictGetName(obj.dict, "Type");
                if (typeVal === "Catalog") {
                    syntheticTrailer.set("Root", { type: "ref", objNum, gen: entry.gen });
                    break;
                }
            }
        }
        catch {
            // Skip unparseable objects
        }
    }
    return syntheticTrailer;
}
|
|
376
|
+
/**
|
|
377
|
+
* Merge trailer entries from an older trailer into the current one.
|
|
378
|
+
* Only adds keys that don't already exist.
|
|
379
|
+
*/
|
|
380
|
+
mergeTrailer(current, older) {
|
|
381
|
+
for (const [key, value] of older) {
|
|
382
|
+
if (!current.has(key)) {
|
|
383
|
+
current.set(key, value);
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
// ===========================================================================
|
|
388
|
+
// Object Resolution
|
|
389
|
+
// ===========================================================================
|
|
390
|
+
/**
|
|
391
|
+
* Resolve a PDF object by its object number and generation.
|
|
392
|
+
* Returns null if the object doesn't exist.
|
|
393
|
+
*/
|
|
394
|
+
resolve(objNum, gen = 0) {
|
|
395
|
+
const cacheKey = `${objNum}:${gen}`;
|
|
396
|
+
if (this.cache.has(cacheKey)) {
|
|
397
|
+
return this.cache.get(cacheKey);
|
|
398
|
+
}
|
|
399
|
+
const entry = this.xref.get(objNum);
|
|
400
|
+
if (!entry) {
|
|
401
|
+
return null;
|
|
402
|
+
}
|
|
403
|
+
let obj = null;
|
|
404
|
+
if (entry.type === 1) {
|
|
405
|
+
// Uncompressed object — parse directly at offset
|
|
406
|
+
obj = this.parseObjectAt(entry.offset, objNum, entry.gen);
|
|
407
|
+
}
|
|
408
|
+
else if (entry.type === 2) {
|
|
409
|
+
// Compressed object in an object stream
|
|
410
|
+
obj = this.parseCompressedObject(entry.offset, entry.gen);
|
|
411
|
+
}
|
|
412
|
+
// Decrypt string values within the resolved object (V1-V4 per-object encryption)
|
|
413
|
+
if (obj !== null && this.decryptFn) {
|
|
414
|
+
obj = this.decryptObjectStrings(obj, objNum, entry.gen);
|
|
415
|
+
}
|
|
416
|
+
if (obj !== null) {
|
|
417
|
+
this.cache.set(cacheKey, obj);
|
|
418
|
+
}
|
|
419
|
+
return obj;
|
|
420
|
+
}
|
|
421
|
+
/**
|
|
422
|
+
* Resolve a PDF object and return it along with its object/generation numbers.
|
|
423
|
+
* Useful for tracking which object a value came from (for decryption).
|
|
424
|
+
*
|
|
425
|
+
* @param objNum - The object number to resolve
|
|
426
|
+
* @param gen - The generation number (default 0)
|
|
427
|
+
* @returns The resolved object with its objNum and gen for decryption context
|
|
428
|
+
*/
|
|
429
|
+
resolveWithObjNum(objNum, gen = 0) {
|
|
430
|
+
const obj = this.resolve(objNum, gen);
|
|
431
|
+
return { obj, objNum, gen };
|
|
432
|
+
}
|
|
433
|
+
/**
|
|
434
|
+
* Dereference a PdfRef to its actual object value.
|
|
435
|
+
* If the input is not a PdfRef, returns it as-is.
|
|
436
|
+
*/
|
|
437
|
+
deref(obj) {
|
|
438
|
+
if (obj === null || obj === undefined) {
|
|
439
|
+
return null;
|
|
440
|
+
}
|
|
441
|
+
if (isPdfRef(obj)) {
|
|
442
|
+
return this.resolve(obj.objNum, obj.gen);
|
|
443
|
+
}
|
|
444
|
+
return obj;
|
|
445
|
+
}
|
|
446
|
+
/**
|
|
447
|
+
* Dereference a PdfRef and assert it's a dictionary.
|
|
448
|
+
*/
|
|
449
|
+
derefDict(obj) {
|
|
450
|
+
const resolved = this.deref(obj);
|
|
451
|
+
if (resolved === null) {
|
|
452
|
+
return null;
|
|
453
|
+
}
|
|
454
|
+
if (isPdfDict(resolved)) {
|
|
455
|
+
return resolved;
|
|
456
|
+
}
|
|
457
|
+
if (isPdfStream(resolved)) {
|
|
458
|
+
return resolved.dict;
|
|
459
|
+
}
|
|
460
|
+
return null;
|
|
461
|
+
}
|
|
462
|
+
/**
|
|
463
|
+
* Dereference a PdfRef and get the stream, along with the objNum/gen
|
|
464
|
+
* needed for correct per-object decryption.
|
|
465
|
+
*/
|
|
466
|
+
derefStream(obj) {
|
|
467
|
+
const resolved = this.deref(obj);
|
|
468
|
+
if (resolved === null) {
|
|
469
|
+
return null;
|
|
470
|
+
}
|
|
471
|
+
if (isPdfStream(resolved)) {
|
|
472
|
+
return resolved;
|
|
473
|
+
}
|
|
474
|
+
return null;
|
|
475
|
+
}
|
|
476
|
+
/**
|
|
477
|
+
* Dereference a PdfRef and get the stream with its object number and generation.
|
|
478
|
+
* Returns null if the object is not a stream.
|
|
479
|
+
* The objNum/gen are needed for correct per-object decryption (V1-V4).
|
|
480
|
+
*/
|
|
481
|
+
derefStreamWithObjNum(obj) {
|
|
482
|
+
if (obj === null || obj === undefined) {
|
|
483
|
+
return null;
|
|
484
|
+
}
|
|
485
|
+
let objNum = 0;
|
|
486
|
+
let gen = 0;
|
|
487
|
+
if (isPdfRef(obj)) {
|
|
488
|
+
objNum = obj.objNum;
|
|
489
|
+
gen = obj.gen;
|
|
490
|
+
}
|
|
491
|
+
const resolved = this.deref(obj);
|
|
492
|
+
if (resolved === null) {
|
|
493
|
+
return null;
|
|
494
|
+
}
|
|
495
|
+
if (isPdfStream(resolved)) {
|
|
496
|
+
return { stream: resolved, objNum, gen };
|
|
497
|
+
}
|
|
498
|
+
return null;
|
|
499
|
+
}
|
|
500
|
+
/**
|
|
501
|
+
* Get decoded stream data from a stream object.
|
|
502
|
+
* Applies filter chain decoding and decryption.
|
|
503
|
+
*
|
|
504
|
+
* When objNum/gen are not provided (default 0), decryption may not
|
|
505
|
+
* produce correct results. Use {@link resolveWithObjNum} to obtain
|
|
506
|
+
* the correct objNum/gen for the stream's containing object.
|
|
507
|
+
*/
|
|
508
|
+
getStreamData(stream, objNum = 0, gen = 0) {
|
|
509
|
+
let data = stream.data;
|
|
510
|
+
// Decrypt stream data if encryption is active
|
|
511
|
+
if (this.decryptFn) {
|
|
512
|
+
data = this.decryptFn(data, objNum, gen);
|
|
513
|
+
}
|
|
514
|
+
return decodeStreamFilters(data, stream.dict);
|
|
515
|
+
}
|
|
516
|
+
/**
|
|
517
|
+
* Decrypt a string value (bytes) if encryption is active.
|
|
518
|
+
*/
|
|
519
|
+
decryptString(bytes, objNum, gen) {
|
|
520
|
+
if (this.decryptFn) {
|
|
521
|
+
return this.decryptFn(bytes, objNum, gen);
|
|
522
|
+
}
|
|
523
|
+
return bytes;
|
|
524
|
+
}
|
|
525
|
+
/**
|
|
526
|
+
* Decode a PDF string to a JS string, with optional decryption.
|
|
527
|
+
*/
|
|
528
|
+
decodeString(bytes, objNum = 0, gen = 0) {
|
|
529
|
+
const decrypted = this.decryptString(bytes, objNum, gen);
|
|
530
|
+
return decodePdfStringBytes(decrypted);
|
|
531
|
+
}
|
|
532
|
+
/**
|
|
533
|
+
* Recursively decrypt all string values (Uint8Array) within a parsed PDF object.
|
|
534
|
+
* PDF spec requires all strings in an encrypted document to be decrypted using
|
|
535
|
+
* the per-object key derived from the containing object's objNum/gen.
|
|
536
|
+
* Streams are NOT decrypted here — they are decrypted in getStreamData().
|
|
537
|
+
*/
|
|
538
|
+
decryptObjectStrings(obj, objNum, gen) {
|
|
539
|
+
if (obj === null || typeof obj !== "object") {
|
|
540
|
+
return obj;
|
|
541
|
+
}
|
|
542
|
+
// Decrypt Uint8Array string values
|
|
543
|
+
if (obj instanceof Uint8Array) {
|
|
544
|
+
return this.decryptFn(obj, objNum, gen);
|
|
545
|
+
}
|
|
546
|
+
// Recurse into dictionaries
|
|
547
|
+
if (isPdfDict(obj)) {
|
|
548
|
+
const decrypted = new Map();
|
|
549
|
+
for (const [key, value] of obj) {
|
|
550
|
+
decrypted.set(key, this.decryptObjectStrings(value, objNum, gen));
|
|
551
|
+
}
|
|
552
|
+
return decrypted;
|
|
553
|
+
}
|
|
554
|
+
// Recurse into arrays
|
|
555
|
+
if (isPdfArray(obj)) {
|
|
556
|
+
return obj.map(item => this.decryptObjectStrings(item, objNum, gen));
|
|
557
|
+
}
|
|
558
|
+
// Decrypt strings inside stream dicts (but NOT the stream data itself)
|
|
559
|
+
if (isPdfStream(obj)) {
|
|
560
|
+
const decryptedDict = this.decryptObjectStrings(obj.dict, objNum, gen);
|
|
561
|
+
return { type: "stream", dict: decryptedDict, data: obj.data };
|
|
562
|
+
}
|
|
563
|
+
return obj;
|
|
564
|
+
}
|
|
565
|
+
/**
|
|
566
|
+
* Get the catalog dictionary (the root of the document structure).
|
|
567
|
+
*/
|
|
568
|
+
getCatalog() {
|
|
569
|
+
const rootRef = dictGetRef(this.trailer, "Root");
|
|
570
|
+
if (!rootRef) {
|
|
571
|
+
throw new PdfStructureError("No /Root in trailer");
|
|
572
|
+
}
|
|
573
|
+
const catalog = this.derefDict(rootRef);
|
|
574
|
+
if (!catalog) {
|
|
575
|
+
throw new PdfStructureError("Could not resolve catalog");
|
|
576
|
+
}
|
|
577
|
+
return catalog;
|
|
578
|
+
}
|
|
579
|
+
/**
|
|
580
|
+
* Get the pages array from the page tree.
|
|
581
|
+
* Returns an array of page dictionaries in order.
|
|
582
|
+
*/
|
|
583
|
+
getPages() {
|
|
584
|
+
return this.getPagesWithObjInfo().map(p => p.dict);
|
|
585
|
+
}
|
|
586
|
+
/**
|
|
587
|
+
* Get pages with their object numbers (needed for correct decryption of
|
|
588
|
+
* inline streams within page objects).
|
|
589
|
+
*/
|
|
590
|
+
getPagesWithObjInfo() {
|
|
591
|
+
const catalog = this.getCatalog();
|
|
592
|
+
const pagesRef = catalog.get("Pages");
|
|
593
|
+
const pagesDict = this.derefDict(pagesRef);
|
|
594
|
+
if (!pagesDict) {
|
|
595
|
+
throw new PdfStructureError("Could not resolve /Pages");
|
|
596
|
+
}
|
|
597
|
+
const pages = [];
|
|
598
|
+
const visited = new Set();
|
|
599
|
+
this.collectPages(pagesDict, pages, visited);
|
|
600
|
+
return pages;
|
|
601
|
+
}
|
|
602
|
+
/**
|
|
603
|
+
* Recursively collect page dictionaries from the page tree.
|
|
604
|
+
* Uses a visited set to prevent infinite recursion on cyclic page trees.
|
|
605
|
+
*/
|
|
606
|
+
/**
 * Recursively collect page dictionaries from the page tree.
 * Uses a visited set to prevent infinite recursion on cyclic page trees.
 *
 * When a kid is reached through a PdfRef, its objNum/gen are recorded;
 * a node passed in directly (already dereferenced) loses that identity
 * and is recorded as 0/0.
 *
 * @param node - a /Pages (interior) or /Page (leaf) dictionary
 * @param pages - output array of { dict, objNum, gen } in document order
 * @param visited - set of dictionaries already traversed (cycle guard)
 */
collectPages(node, pages, visited) {
    if (visited.has(node)) {
        return; // Cycle guard
    }
    visited.add(node);
    const type = dictGetName(node, "Type");
    if (type === "Page") {
        // We don't know the objNum from here — it was lost during deref.
        // Use 0 as fallback; callers that need objNum should use getPagesWithObjInfo().
        pages.push({ dict: node, objNum: 0, gen: 0 });
        return;
    }
    // Pages node — recurse into Kids
    const kids = dictGetArray(node, "Kids");
    if (!kids) {
        return;
    }
    for (const kid of kids) {
        // Capture the reference identity before dereferencing loses it
        let objNum = 0;
        let gen = 0;
        if (isPdfRef(kid)) {
            objNum = kid.objNum;
            gen = kid.gen;
        }
        const childDict = this.derefDict(kid);
        if (childDict) {
            const childType = dictGetName(childDict, "Type");
            if (childType === "Page") {
                pages.push({ dict: childDict, objNum, gen });
            }
            else {
                // Interior node (or untyped dict) — keep descending
                this.collectPages(childDict, pages, visited);
            }
        }
    }
}
|
|
642
|
+
/**
|
|
643
|
+
* Get the object number for a given object reference.
|
|
644
|
+
* Useful for tracking which object a value came from (for decryption).
|
|
645
|
+
*/
|
|
646
|
+
getObjNumForRef(ref) {
|
|
647
|
+
return ref.objNum;
|
|
648
|
+
}
|
|
649
|
+
// ===========================================================================
|
|
650
|
+
// Low-level Object Parsing
|
|
651
|
+
// ===========================================================================
|
|
652
|
+
/**
|
|
653
|
+
* Parse an object definition at the given byte offset.
|
|
654
|
+
*/
|
|
655
|
+
parseObjectAt(offset, objNum, _gen) {
|
|
656
|
+
this.tokenizer.pos = offset;
|
|
657
|
+
try {
|
|
658
|
+
const obj = parseObject(this.tokenizer);
|
|
659
|
+
return obj;
|
|
660
|
+
}
|
|
661
|
+
catch {
|
|
662
|
+
return null;
|
|
663
|
+
}
|
|
664
|
+
}
|
|
665
|
+
/**
|
|
666
|
+
* Parse a compressed object from an object stream.
|
|
667
|
+
* @param objStmNum - The object number of the object stream
|
|
668
|
+
* @param index - The index of the object within the stream
|
|
669
|
+
*/
|
|
670
|
+
parseCompressedObject(objStmNum, index) {
|
|
671
|
+
// Resolve the object stream itself (must be type 1 — not recursive)
|
|
672
|
+
const stmCacheKey = `objstm:${objStmNum}`;
|
|
673
|
+
let stmObjects;
|
|
674
|
+
if (this.cache.has(stmCacheKey)) {
|
|
675
|
+
stmObjects = this.cache.get(stmCacheKey);
|
|
676
|
+
}
|
|
677
|
+
else {
|
|
678
|
+
stmObjects = this.parseObjectStream(objStmNum) ?? undefined;
|
|
679
|
+
if (stmObjects) {
|
|
680
|
+
this.cache.set(stmCacheKey, stmObjects);
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
if (!stmObjects) {
|
|
684
|
+
return null;
|
|
685
|
+
}
|
|
686
|
+
// The index field in the xref is the index within the object stream
|
|
687
|
+
// We need to find the object by its index position
|
|
688
|
+
let i = 0;
|
|
689
|
+
for (const [, value] of stmObjects) {
|
|
690
|
+
if (i === index) {
|
|
691
|
+
return value;
|
|
692
|
+
}
|
|
693
|
+
i++;
|
|
694
|
+
}
|
|
695
|
+
return null;
|
|
696
|
+
}
|
|
697
|
+
/**
|
|
698
|
+
* Parse all objects from an object stream.
|
|
699
|
+
* @returns Map of object number → object value
|
|
700
|
+
*/
|
|
701
|
+
parseObjectStream(objStmNum) {
|
|
702
|
+
const entry = this.xref.get(objStmNum);
|
|
703
|
+
if (!entry || entry.type !== 1) {
|
|
704
|
+
return null;
|
|
705
|
+
}
|
|
706
|
+
this.tokenizer.pos = entry.offset;
|
|
707
|
+
const stmObj = parseObject(this.tokenizer);
|
|
708
|
+
if (!isPdfStream(stmObj)) {
|
|
709
|
+
return null;
|
|
710
|
+
}
|
|
711
|
+
const dict = stmObj.dict;
|
|
712
|
+
const n = dictGetNumber(dict, "N") ?? 0;
|
|
713
|
+
const first = dictGetNumber(dict, "First") ?? 0;
|
|
714
|
+
// Decode stream data (pass objStmNum/gen for correct decryption)
|
|
715
|
+
const streamData = this.getStreamData(stmObj, objStmNum, entry.gen);
|
|
716
|
+
// Parse the N pairs of (objNum offset) before 'first'
|
|
717
|
+
const headerTokenizer = new PdfTokenizer(streamData);
|
|
718
|
+
const pairs = [];
|
|
719
|
+
for (let i = 0; i < n; i++) {
|
|
720
|
+
const numTok = headerTokenizer.next();
|
|
721
|
+
const offTok = headerTokenizer.next();
|
|
722
|
+
if (numTok.type === TokenType.Number && offTok.type === TokenType.Number) {
|
|
723
|
+
pairs.push([numTok.numValue, offTok.numValue]);
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
// Parse each object
|
|
727
|
+
const result = new Map();
|
|
728
|
+
for (const [objectNumber, relOffset] of pairs) {
|
|
729
|
+
const objTokenizer = new PdfTokenizer(streamData, first + relOffset);
|
|
730
|
+
try {
|
|
731
|
+
const obj = parseObject(objTokenizer);
|
|
732
|
+
result.set(objectNumber, obj);
|
|
733
|
+
}
|
|
734
|
+
catch {
|
|
735
|
+
// Skip unparseable objects
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
return result;
|
|
739
|
+
}
|
|
740
|
+
/**
|
|
741
|
+
* Resolve a page's bounding box (MediaBox/CropBox) with indirect ref resolution
|
|
742
|
+
* and parent inheritance. Returns `{ width, height }` or null if no box found.
|
|
743
|
+
*
|
|
744
|
+
* This is a shared helper so callers don't duplicate box resolution logic.
|
|
745
|
+
*/
|
|
746
|
+
resolvePageBox(pageDict, visited) {
|
|
747
|
+
const seen = visited ?? new Set();
|
|
748
|
+
if (seen.has(pageDict)) {
|
|
749
|
+
return null; // Cycle guard
|
|
750
|
+
}
|
|
751
|
+
seen.add(pageDict);
|
|
752
|
+
for (const key of ["MediaBox", "CropBox"]) {
|
|
753
|
+
const raw = pageDict.get(key);
|
|
754
|
+
if (!raw) {
|
|
755
|
+
continue;
|
|
756
|
+
}
|
|
757
|
+
// Dereference in case the box is an indirect reference
|
|
758
|
+
const resolved = this.deref(raw);
|
|
759
|
+
if (Array.isArray(resolved) && resolved.length === 4) {
|
|
760
|
+
const width = Math.abs(resolved[2] - resolved[0]);
|
|
761
|
+
const height = Math.abs(resolved[3] - resolved[1]);
|
|
762
|
+
if (width > 0 && height > 0) {
|
|
763
|
+
return { width, height };
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
// Inherit from parent
|
|
768
|
+
const parent = pageDict.get("Parent");
|
|
769
|
+
if (parent) {
|
|
770
|
+
const parentDict = this.derefDict(parent);
|
|
771
|
+
if (parentDict) {
|
|
772
|
+
return this.resolvePageBox(parentDict, seen);
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
return null;
|
|
776
|
+
}
|
|
777
|
+
/**
|
|
778
|
+
* Resolve a page's Resources dictionary, inheriting from parent pages if needed.
|
|
779
|
+
* Protected against cyclic parent chains.
|
|
780
|
+
*/
|
|
781
|
+
resolvePageResources(pageDict, visited) {
|
|
782
|
+
const seen = visited ?? new Set();
|
|
783
|
+
if (seen.has(pageDict)) {
|
|
784
|
+
return new Map(); // Cycle guard
|
|
785
|
+
}
|
|
786
|
+
seen.add(pageDict);
|
|
787
|
+
const resources = pageDict.get("Resources");
|
|
788
|
+
if (resources) {
|
|
789
|
+
const resolved = this.derefDict(resources);
|
|
790
|
+
if (resolved) {
|
|
791
|
+
return resolved;
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
const parent = pageDict.get("Parent");
|
|
795
|
+
if (parent) {
|
|
796
|
+
const parentDict = this.derefDict(parent);
|
|
797
|
+
if (parentDict) {
|
|
798
|
+
return this.resolvePageResources(parentDict, seen);
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
return new Map();
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
// =============================================================================
|
|
805
|
+
// Helpers
|
|
806
|
+
// =============================================================================
|
|
807
|
+
/**
 * Read a big-endian integer of the given byte width.
 * Uses multiplication instead of bitwise shift to avoid signed 32-bit overflow
 * for values that exceed 2^31 (e.g. large file offsets).
 *
 * @param data - Byte array to read from; out-of-range bytes count as 0.
 * @param offset - Starting index of the integer.
 * @param width - Number of bytes to consume.
 * @returns The decoded unsigned integer.
 */
function readIntBE(data, offset, width) {
    let acc = 0;
    let pos = offset;
    const end = offset + width;
    while (pos < end) {
        acc = acc * 256 + (data[pos] ?? 0);
        pos += 1;
    }
    return acc;
}
|