@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,543 @@
1
+ /**
2
+ * PDF tokenizer / lexer.
3
+ *
4
+ * Scans raw PDF bytes and produces a stream of typed tokens.
5
+ * Handles all PDF token types: numbers, strings (literal and hex),
6
+ * names, booleans, null, keywords, and delimiters.
7
+ *
8
+ * @see PDF Reference 1.7, §3.1 - Lexical Conventions
9
+ */
10
+ import { PdfStructureError } from "../errors.js";
11
+ // =============================================================================
12
+ // Token Types
13
+ // =============================================================================
14
/**
 * Token kinds produced by the tokenizer.
 *
 * The object maps in both directions: `TokenType.Name === 3` and
 * `TokenType[3] === "Name"`. Ordinals follow the order of the list below.
 * `var` is retained so the exported binding keeps its original declaration kind.
 */
export var TokenType = {};
[
    "Number", // integer or real number
    "LiteralString", // literal string delimited by parentheses `(...)`
    "HexString", // hex string delimited by angle brackets `<...>`
    "Name", // name object starting with `/`
    "Boolean", // `true` or `false`
    "Null", // the `null` keyword
    "Keyword", // obj, endobj, stream, endstream, xref, trailer, startxref, R
    "DictBegin", // `<<`
    "DictEnd", // `>>`
    "ArrayBegin", // `[`
    "ArrayEnd", // `]`
    "EOF", // end of input
].forEach((label, ordinal) => {
    TokenType[label] = ordinal;
    TokenType[ordinal] = label;
});
41
+ // =============================================================================
42
+ // Character Classification
43
+ // =============================================================================
44
/**
 * Report whether a byte is one of the PDF whitespace characters
 * (NUL, TAB, LF, FF, CR, SPACE) per PDF spec §3.1.
 */
function isWhitespace(b) {
    switch (b) {
        case 0x00: // NUL
        case 0x09: // TAB
        case 0x0a: // LF
        case 0x0c: // FF
        case 0x0d: // CR
        case 0x20: // SPACE
            return true;
        default:
            return false;
    }
}
48
/**
 * Report whether a byte is one of the PDF delimiter characters
 * `( ) < > [ ] { } / %` per PDF spec §3.1.
 */
function isDelimiter(b) {
    switch (b) {
        case 0x28: // (
        case 0x29: // )
        case 0x3c: // <
        case 0x3e: // >
        case 0x5b: // [
        case 0x5d: // ]
        case 0x7b: // {
        case 0x7d: // }
        case 0x2f: // /
        case 0x25: // %
            return true;
        default:
            return false;
    }
}
62
/** Report whether a byte is an ASCII decimal digit (`0`-`9`). */
function isDigit(b) {
    const ASCII_ZERO = 0x30;
    const ASCII_NINE = 0x39;
    return ASCII_ZERO <= b && b <= ASCII_NINE;
}
65
/** Report whether a byte is an ASCII hexadecimal digit (`0`-`9`, `A`-`F`, `a`-`f`). */
function isHexDigit(b) {
    if (b >= 0x30 && b <= 0x39) {
        return true; // 0-9
    }
    if (b >= 0x41 && b <= 0x46) {
        return true; // A-F
    }
    return b >= 0x61 && b <= 0x66; // a-f
}
68
/**
 * Numeric value (0-15) of an ASCII hexadecimal digit.
 *
 * Bytes outside `0`-`9` and `A`-`F` are handled as if they were lowercase
 * `a`-`f`; no validation is performed here.
 */
function hexVal(b) {
    const isDecimal = b >= 0x30 && b <= 0x39;
    if (isDecimal) {
        return b - 0x30;
    }
    const isUpperHex = b >= 0x41 && b <= 0x46;
    return isUpperHex ? b - 0x41 + 10 : b - 0x61 + 10;
}
77
+ // =============================================================================
78
+ // Cached Constants
79
+ // =============================================================================
80
/**
 * Byte values of the ASCII text "endstream", computed once at module load
 * so the marker never has to be re-encoded while scanning.
 */
const ENDSTREAM_BYTES = Uint8Array.from("endstream", (ch) => ch.charCodeAt(0));
82
+ // =============================================================================
83
+ // PDF Tokenizer
84
+ // =============================================================================
85
/**
 * Convert an array of byte values to a string, mapping each byte to the
 * UTF-16 code unit with the same value.
 *
 * Spreading an arbitrarily long array into `String.fromCharCode` can throw a
 * RangeError once the argument count exceeds what the engine allows, so the
 * conversion is done in fixed-size chunks.
 */
function bytesToBinaryString(bytes) {
    const CHUNK_SIZE = 0x2000;
    let text = "";
    for (let i = 0; i < bytes.length; i += CHUNK_SIZE) {
        text += String.fromCharCode(...bytes.slice(i, i + CHUNK_SIZE));
    }
    return text;
}
/**
 * Byte-level PDF tokenizer.
 *
 * Provides a `next()` method that returns the next token from the input.
 * The tokenizer maintains a mutable position pointer that advances through
 * the input bytes.
 *
 * Every token is a plain object with a `type` (a `TokenType` value) and the
 * `offset` at which the token started. Depending on the type it also carries
 * `numValue`, `strValue`, `boolValue` or `rawBytes`.
 */
export class PdfTokenizer {
    /**
     * @param data - The bytes to scan (indexed access, `length` and `subarray` are used)
     * @param offset - Position at which scanning starts (default 0)
     */
    constructor(data, offset = 0) {
        this.data = data;
        this.pos = offset;
    }
    /** Get current position */
    get position() {
        return this.pos;
    }
    /** Set current position */
    set position(offset) {
        this.pos = offset;
    }
    /** Get the underlying data */
    get bytes() {
        return this.data;
    }
    /** Peek at the byte at the current position without consuming it; -1 at end of input */
    peek() {
        return this.pos < this.data.length ? this.data[this.pos] : -1;
    }
    /**
     * Read the next token.
     *
     * Leading whitespace and comments are skipped first. At end of input an
     * EOF token is returned. Every non-EOF token consumes at least one byte,
     * so repeatedly calling `next()` always reaches EOF.
     */
    next() {
        this.skipWhitespaceAndComments();
        if (this.pos >= this.data.length) {
            return { type: TokenType.EOF, offset: this.pos };
        }
        const startPos = this.pos;
        const b = this.data[this.pos];
        // Literal string
        if (b === 0x28) {
            return this.readLiteralString(startPos);
        }
        // Hex string or dict delimiter
        if (b === 0x3c) {
            if (this.pos + 1 < this.data.length && this.data[this.pos + 1] === 0x3c) {
                this.pos += 2;
                return { type: TokenType.DictBegin, offset: startPos };
            }
            return this.readHexString(startPos);
        }
        // Dict end
        if (b === 0x3e) {
            if (this.pos + 1 < this.data.length && this.data[this.pos + 1] === 0x3e) {
                this.pos += 2;
                return { type: TokenType.DictEnd, offset: startPos };
            }
            // Stray > — treat as keyword
            this.pos++;
            return { type: TokenType.Keyword, strValue: ">", offset: startPos };
        }
        // Array delimiters
        if (b === 0x5b) {
            this.pos++;
            return { type: TokenType.ArrayBegin, offset: startPos };
        }
        if (b === 0x5d) {
            this.pos++;
            return { type: TokenType.ArrayEnd, offset: startPos };
        }
        // Name
        if (b === 0x2f) {
            return this.readName(startPos);
        }
        // Number (digit, sign, or decimal point)
        if (isDigit(b) || b === 0x2d || b === 0x2b || b === 0x2e) {
            return this.readNumber(startPos);
        }
        // Regular character — keyword (obj, endobj, true, false, null, etc.)
        return this.readKeyword(startPos);
    }
    // ===========================================================================
    // Skip Whitespace and Comments
    // ===========================================================================
    /**
     * Advance past any run of whitespace bytes and `%` comments.
     * A comment extends up to (not including) the next LF or CR; that line
     * ending is then consumed as ordinary whitespace.
     */
    skipWhitespaceAndComments() {
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isWhitespace(b)) {
                this.pos++;
                continue;
            }
            // PDF comment: % ... EOL
            if (b === 0x25) {
                this.pos++;
                while (this.pos < this.data.length) {
                    const c = this.data[this.pos];
                    if (c === 0x0a || c === 0x0d) {
                        break;
                    }
                    this.pos++;
                }
                continue;
            }
            break;
        }
    }
    // ===========================================================================
    // Literal String (...)
    // ===========================================================================
    /**
     * Read a literal string starting at the opening `(`.
     *
     * Handles backslash escapes (including octal and line continuation),
     * balanced nested parentheses, and normalizes unescaped CR / CRLF to LF.
     * If the input ends before the closing `)`, the bytes collected so far
     * are returned.
     *
     * @param startPos - Offset recorded on the returned token
     * @returns A LiteralString token whose `rawBytes` holds the decoded bytes
     */
    readLiteralString(startPos) {
        this.pos++; // skip opening '('
        const bytes = [];
        let depth = 1;
        while (this.pos < this.data.length && depth > 0) {
            const b = this.data[this.pos];
            if (b === 0x5c) {
                // Backslash escape
                this.pos++;
                if (this.pos >= this.data.length) {
                    break;
                }
                const esc = this.data[this.pos];
                switch (esc) {
                    case 0x6e:
                        bytes.push(0x0a);
                        break; // \n
                    case 0x72:
                        bytes.push(0x0d);
                        break; // \r
                    case 0x74:
                        bytes.push(0x09);
                        break; // \t
                    case 0x62:
                        bytes.push(0x08);
                        break; // \b
                    case 0x66:
                        bytes.push(0x0c);
                        break; // \f
                    case 0x28:
                        bytes.push(0x28);
                        break; // \(
                    case 0x29:
                        bytes.push(0x29);
                        break; // \)
                    case 0x5c:
                        bytes.push(0x5c);
                        break; // \\
                    case 0x0a: // \<LF> — line continuation
                        break;
                    case 0x0d: // \<CR> or \<CR><LF> — line continuation
                        if (this.pos + 1 < this.data.length && this.data[this.pos + 1] === 0x0a) {
                            this.pos++;
                        }
                        break;
                    default:
                        // Octal escape: up to 3 octal digits
                        if (esc >= 0x30 && esc <= 0x37) {
                            let octal = esc - 0x30;
                            if (this.pos + 1 < this.data.length &&
                                this.data[this.pos + 1] >= 0x30 &&
                                this.data[this.pos + 1] <= 0x37) {
                                this.pos++;
                                octal = octal * 8 + (this.data[this.pos] - 0x30);
                                if (this.pos + 1 < this.data.length &&
                                    this.data[this.pos + 1] >= 0x30 &&
                                    this.data[this.pos + 1] <= 0x37) {
                                    this.pos++;
                                    octal = octal * 8 + (this.data[this.pos] - 0x30);
                                }
                            }
                            // Three octal digits can exceed 255; keep the low byte only
                            bytes.push(octal & 0xff);
                        }
                        else {
                            // Unknown escape — just include the character
                            bytes.push(esc);
                        }
                        break;
                }
                this.pos++;
            }
            else if (b === 0x28) {
                // Nested (
                depth++;
                bytes.push(b);
                this.pos++;
            }
            else if (b === 0x29) {
                // Closing ) — only the outermost one is excluded from the content
                depth--;
                if (depth > 0) {
                    bytes.push(b);
                }
                this.pos++;
            }
            else {
                // Normalize line endings: \r\n or \r → \n
                if (b === 0x0d) {
                    bytes.push(0x0a);
                    this.pos++;
                    if (this.pos < this.data.length && this.data[this.pos] === 0x0a) {
                        this.pos++;
                    }
                }
                else {
                    bytes.push(b);
                    this.pos++;
                }
            }
        }
        const rawBytes = new Uint8Array(bytes);
        return { type: TokenType.LiteralString, rawBytes, offset: startPos };
    }
    // ===========================================================================
    // Hex String <...>
    // ===========================================================================
    /**
     * Read a hex string starting at the opening `<`.
     *
     * Whitespace and any non-hex bytes inside the string are skipped. An odd
     * number of hex digits is padded with a trailing 0 nibble.
     *
     * @param startPos - Offset recorded on the returned token
     * @returns A HexString token whose `rawBytes` holds the decoded bytes
     */
    readHexString(startPos) {
        this.pos++; // skip opening '<'
        const hexBytes = [];
        let highNibble = -1;
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (b === 0x3e) {
                this.pos++; // skip closing '>'
                break;
            }
            if (isWhitespace(b)) {
                this.pos++;
                continue;
            }
            if (isHexDigit(b)) {
                if (highNibble < 0) {
                    highNibble = hexVal(b);
                }
                else {
                    hexBytes.push((highNibble << 4) | hexVal(b));
                    highNibble = -1;
                }
            }
            this.pos++;
        }
        // Odd number of hex digits — pad with 0
        if (highNibble >= 0) {
            hexBytes.push(highNibble << 4);
        }
        const rawBytes = new Uint8Array(hexBytes);
        return { type: TokenType.HexString, rawBytes, offset: startPos };
    }
    // ===========================================================================
    // Name /...
    // ===========================================================================
    /**
     * Read a name object starting at the leading `/`.
     *
     * The name ends at the next whitespace or delimiter byte. `#XX` sequences
     * with two hex digits are decoded to the byte they denote; a `#` that is
     * not followed by two hex digits is kept literally.
     *
     * @param startPos - Offset recorded on the returned token
     * @returns A Name token whose `strValue` excludes the leading `/`
     */
    readName(startPos) {
        this.pos++; // skip '/'
        const chars = [];
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isWhitespace(b) || isDelimiter(b)) {
                break;
            }
            if (b === 0x23 && this.pos + 2 < this.data.length) {
                // #XX escape
                const h1 = this.data[this.pos + 1];
                const h2 = this.data[this.pos + 2];
                if (isHexDigit(h1) && isHexDigit(h2)) {
                    chars.push((hexVal(h1) << 4) | hexVal(h2));
                    this.pos += 3;
                    continue;
                }
            }
            chars.push(b);
            this.pos++;
        }
        const name = bytesToBinaryString(chars);
        return { type: TokenType.Name, strValue: name, offset: startPos };
    }
    // ===========================================================================
    // Number
    // ===========================================================================
    /**
     * Read an integer or real number: optional sign, digits, at most one
     * decimal point.
     *
     * If no digit is found (for example a lone `+`, `-`, `.`, or `-.`), the
     * position is rewound and the text is read as a keyword instead, so the
     * sign or dot stays part of that keyword.
     *
     * @param startPos - Offset recorded on the returned token
     * @returns A Number token with `numValue`, or a token from `readKeyword`
     */
    readNumber(startPos) {
        const entryPos = this.pos;
        let numStr = "";
        let digitCount = 0;
        const first = this.data[this.pos];
        // Sign
        if (first === 0x2d || first === 0x2b) {
            numStr += String.fromCharCode(first);
            this.pos++;
        }
        let hasDecimal = first === 0x2e;
        if (hasDecimal) {
            numStr += ".";
            this.pos++;
        }
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isDigit(b)) {
                numStr += String.fromCharCode(b);
                digitCount++;
                this.pos++;
            }
            else if (b === 0x2e && !hasDecimal) {
                hasDecimal = true;
                numStr += ".";
                this.pos++;
            }
            else {
                break;
            }
        }
        // No digits at all — not a number. Rewind so the keyword reader sees
        // the sign/dot that was consumed above instead of silently dropping it.
        if (digitCount === 0) {
            this.pos = entryPos;
            return this.readKeyword(startPos);
        }
        const numValue = hasDecimal ? Number.parseFloat(numStr) : Number.parseInt(numStr, 10);
        return { type: TokenType.Number, numValue, offset: startPos };
    }
    // ===========================================================================
    // Keyword
    // ===========================================================================
    /**
     * Read a run of regular characters up to the next whitespace or delimiter.
     *
     * `true` / `false` become Boolean tokens, `null` becomes a Null token, and
     * anything else becomes a Keyword token with the text in `strValue`.
     *
     * When the current byte is itself a delimiter (a stray `)`, `{` or `}`
     * reaches here from `next()`), that single byte is consumed and returned
     * as a one-character keyword. Without this the position would not move
     * and a caller looping on `next()` would never reach EOF.
     *
     * @param startPos - Offset recorded on the returned token
     */
    readKeyword(startPos) {
        const chars = [];
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isWhitespace(b) || isDelimiter(b)) {
                break;
            }
            chars.push(b);
            this.pos++;
        }
        if (chars.length === 0 &&
            this.pos < this.data.length &&
            isDelimiter(this.data[this.pos])) {
            chars.push(this.data[this.pos]);
            this.pos++;
        }
        const word = bytesToBinaryString(chars);
        if (word === "true") {
            return { type: TokenType.Boolean, boolValue: true, strValue: "true", offset: startPos };
        }
        if (word === "false") {
            return { type: TokenType.Boolean, boolValue: false, strValue: "false", offset: startPos };
        }
        if (word === "null") {
            return { type: TokenType.Null, offset: startPos };
        }
        return { type: TokenType.Keyword, strValue: word, offset: startPos };
    }
    // ===========================================================================
    // Utility: Find a byte sequence
    // ===========================================================================
    /**
     * Search forward for a byte sequence, starting at `from` or, when `from`
     * is null/undefined, at the current position.
     * Returns the offset where the sequence starts, or -1 if not found.
     * Does NOT advance the position.
     */
    findSequence(seq, from) {
        const start = from ?? this.pos;
        const len = seq.length;
        const end = this.data.length - len;
        for (let i = start; i <= end; i++) {
            let match = true;
            for (let j = 0; j < len; j++) {
                if (this.data[i + j] !== seq[j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                return i;
            }
        }
        return -1;
    }
    /**
     * Search backward for a byte sequence. `from` (or the last byte of the
     * data when omitted) is the index of the last byte a match may occupy.
     * Returns the offset where the sequence starts, or -1 if not found.
     * Does NOT advance the position.
     */
    findSequenceBackward(seq, from) {
        const start = from ?? this.data.length - 1;
        const len = seq.length;
        for (let i = start - len + 1; i >= 0; i--) {
            let match = true;
            for (let j = 0; j < len; j++) {
                if (this.data[i + j] !== seq[j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                return i;
            }
        }
        return -1;
    }
    /**
     * Read a line of text at the current position. Advances past the line
     * ending (LF, CR, or CRLF), which is not included in the result. Each
     * byte becomes one character with the same code.
     */
    readLine() {
        const chars = [];
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            this.pos++;
            if (b === 0x0a) {
                break;
            }
            if (b === 0x0d) {
                if (this.pos < this.data.length && this.data[this.pos] === 0x0a) {
                    this.pos++;
                }
                break;
            }
            chars.push(b);
        }
        return bytesToBinaryString(chars);
    }
    /**
     * Extract a slice of the underlying data (via `subarray`, so no copy is
     * made when the data is a typed array).
     */
    slice(start, end) {
        return this.data.subarray(start, end);
    }
    /**
     * Read the stream content following a `stream` keyword.
     * The tokenizer should be positioned right after the `stream` keyword.
     * Returns the raw stream bytes (between stream\n and endstream).
     *
     * @param length - Number of content bytes; a negative value means the
     *   length is unknown and the content runs up to the first `endstream`
     *   marker (with one trailing EOL stripped)
     * @throws PdfStructureError if `length` is negative and no `endstream`
     *   marker is found
     */
    readStreamContent(length) {
        // Skip the EOL after "stream" keyword
        if (this.pos < this.data.length && this.data[this.pos] === 0x0d) {
            this.pos++;
        }
        if (this.pos < this.data.length && this.data[this.pos] === 0x0a) {
            this.pos++;
        }
        if (length < 0) {
            // Length unknown — search for endstream
            const endPos = this.findSequence(ENDSTREAM_BYTES, this.pos);
            if (endPos < 0) {
                throw new PdfStructureError("Could not find endstream marker");
            }
            let streamEnd = endPos;
            // Strip trailing EOL before endstream
            if (streamEnd > this.pos && this.data[streamEnd - 1] === 0x0a) {
                streamEnd--;
            }
            if (streamEnd > this.pos && this.data[streamEnd - 1] === 0x0d) {
                streamEnd--;
            }
            const content = this.data.subarray(this.pos, streamEnd);
            this.pos = endPos + ENDSTREAM_BYTES.length;
            return content;
        }
        const content = this.data.subarray(this.pos, this.pos + length);
        this.pos += length;
        // Skip to endstream
        this.skipWhitespaceAndComments();
        // Try to consume "endstream" keyword; left in place if it is not there
        if (this.pos + ENDSTREAM_BYTES.length <= this.data.length) {
            let match = true;
            for (let i = 0; i < ENDSTREAM_BYTES.length; i++) {
                if (this.data[this.pos + i] !== ENDSTREAM_BYTES[i]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                this.pos += ENDSTREAM_BYTES.length;
            }
        }
        return content;
    }
}
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Shared utility functions for PDF reader modules.
3
+ */
4
+ import { decodePdfStringBytes } from "./pdf-parser.js";
5
/**
 * Look up `key` in a PDF dictionary and return its value as a string.
 *
 * The entry is passed through `doc.deref` first. A string result is returned
 * unchanged; a Uint8Array result is decoded with `decodePdfStringBytes`.
 *
 * @param dict - The PDF dictionary (queried via `dict.get(key)`)
 * @param key - The key to look up
 * @param doc - The PDF document, used to resolve the entry via `doc.deref`
 * @returns The string value, or an empty string when the entry is missing,
 *   falsy, or resolves to something that is neither a string nor a Uint8Array
 */
export function getDictStringValue(dict, key, doc) {
    const entry = dict.get(key);
    if (entry) {
        const target = doc.deref(entry);
        if (target instanceof Uint8Array) {
            return decodePdfStringBytes(target);
        }
        if (typeof target === "string") {
            return target;
        }
    }
    return "";
}