@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PDF tokenizer / lexer.
|
|
4
|
+
*
|
|
5
|
+
* Scans raw PDF bytes and produces a stream of typed tokens.
|
|
6
|
+
* Handles all PDF token types: numbers, strings (literal and hex),
|
|
7
|
+
* names, booleans, null, keywords, and delimiters.
|
|
8
|
+
*
|
|
9
|
+
* @see PDF Reference 1.7, §3.1 - Lexical Conventions
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.PdfTokenizer = void 0;
|
|
13
|
+
const errors_1 = require("../errors");
|
|
14
|
+
// =============================================================================
|
|
15
|
+
// Character Classification
|
|
16
|
+
// =============================================================================
|
|
17
|
+
/**
 * Tell whether a byte value is one of the six PDF whitespace characters
 * (NUL, HT, LF, FF, CR, SP).
 *
 * @param b - Byte value to classify.
 * @returns `true` for a whitespace byte, `false` for anything else.
 */
function isWhitespace(b) {
    switch (b) {
        case 0x00: // NUL
        case 0x09: // horizontal tab
        case 0x0a: // line feed
        case 0x0c: // form feed
        case 0x0d: // carriage return
        case 0x20: // space
            return true;
        default:
            return false;
    }
}
|
|
21
|
+
/**
 * Tell whether a byte value is one of the PDF delimiter characters:
 * ( ) < > [ ] { } / %
 *
 * @param b - Byte value to classify.
 * @returns `true` for a delimiter byte, `false` for anything else.
 */
function isDelimiter(b) {
    switch (b) {
        case 0x25: // %
        case 0x28: // (
        case 0x29: // )
        case 0x2f: // /
        case 0x3c: // <
        case 0x3e: // >
        case 0x5b: // [
        case 0x5d: // ]
        case 0x7b: // {
        case 0x7d: // }
            return true;
        default:
            return false;
    }
}
|
|
35
|
+
/**
 * Tell whether a byte value is an ASCII decimal digit ('0'..'9').
 *
 * @param b - Byte value to classify.
 * @returns `true` when `b` lies in 0x30..0x39 inclusive.
 */
function isDigit(b) {
    const ASCII_ZERO = 0x30;
    const ASCII_NINE = 0x39;
    return ASCII_ZERO <= b && b <= ASCII_NINE;
}
|
|
38
|
+
/**
 * Tell whether a byte value is an ASCII hexadecimal digit
 * ('0'..'9', 'A'..'F' or 'a'..'f').
 *
 * @param b - Byte value to classify.
 * @returns `true` for a hex digit byte, `false` for anything else.
 */
function isHexDigit(b) {
    if (0x30 <= b && b <= 0x39) {
        return true; // 0-9
    }
    if (0x41 <= b && b <= 0x46) {
        return true; // A-F
    }
    return 0x61 <= b && b <= 0x66; // a-f
}
|
|
41
|
+
/**
 * Convert an ASCII hex digit byte to its numeric value (0..15).
 *
 * The result is only meaningful for bytes accepted by `isHexDigit`;
 * any byte that is neither '0'..'9' nor 'A'..'F' is treated as a
 * lowercase letter without further validation.
 *
 * @param b - Byte value of the hex digit.
 * @returns Numeric value of the digit.
 */
function hexVal(b) {
    const isDecimalDigit = 0x30 <= b && b <= 0x39;
    if (isDecimalDigit) {
        return b - 0x30;
    }
    const isUpperLetter = 0x41 <= b && b <= 0x46;
    return isUpperLetter ? b - 0x41 + 10 : b - 0x61 + 10;
}
|
|
50
|
+
// =============================================================================
|
|
51
|
+
// Cached Constants
|
|
52
|
+
// =============================================================================
|
|
53
|
+
/** ASCII bytes of the "endstream" keyword, built once at module load and reused for every search. */
const ENDSTREAM_BYTES = Uint8Array.from("endstream", (ch) => ch.charCodeAt(0));
|
|
55
|
+
// =============================================================================
|
|
56
|
+
// PDF Tokenizer
|
|
57
|
+
// =============================================================================
|
|
58
|
+
/**
 * Byte-level PDF tokenizer.
 *
 * Provides a `next()` method that returns the next token from the input.
 * The tokenizer maintains a mutable position pointer that advances through
 * the input bytes.
 *
 * Every token is a plain object carrying a numeric `type` (see the inline
 * `TokenType` comments), the `offset` at which it started, and, depending
 * on the type, `numValue`, `strValue`, `boolValue` or `rawBytes`.
 */
class PdfTokenizer {
    /**
     * @param data - Raw PDF bytes; must be indexable and provide `subarray`.
     * @param offset - Byte offset at which tokenizing starts (default 0).
     */
    constructor(data, offset = 0) {
        this.data = data;
        this.pos = offset;
    }
    /** Get current position */
    get position() {
        return this.pos;
    }
    /** Set current position */
    set position(offset) {
        this.pos = offset;
    }
    /** Get the underlying data */
    get bytes() {
        return this.data;
    }
    /** Peek at the byte at the current position without consuming it (-1 at end of input) */
    peek() {
        return this.pos < this.data.length ? this.data[this.pos] : -1;
    }
    /**
     * Read the next token.
     *
     * Skips leading whitespace and comments, then dispatches on the first
     * byte. At end of input an EOF token is returned. Whenever input
     * remains, the position advances by at least one byte, so a caller
     * looping until EOF always terminates.
     */
    next() {
        this.skipWhitespaceAndComments();
        if (this.pos >= this.data.length) {
            return { type: 11 /* TokenType.EOF */, offset: this.pos };
        }
        const startPos = this.pos;
        const b = this.data[this.pos];
        // Literal string
        if (b === 0x28) {
            return this.readLiteralString(startPos);
        }
        // Hex string or dict delimiter
        if (b === 0x3c) {
            if (this.pos + 1 < this.data.length && this.data[this.pos + 1] === 0x3c) {
                this.pos += 2;
                return { type: 7 /* TokenType.DictBegin */, offset: startPos };
            }
            return this.readHexString(startPos);
        }
        // Dict end
        if (b === 0x3e) {
            if (this.pos + 1 < this.data.length && this.data[this.pos + 1] === 0x3e) {
                this.pos += 2;
                return { type: 8 /* TokenType.DictEnd */, offset: startPos };
            }
            // Stray > — treat as keyword
            this.pos++;
            return { type: 6 /* TokenType.Keyword */, strValue: ">", offset: startPos };
        }
        // Array delimiters
        if (b === 0x5b) {
            this.pos++;
            return { type: 9 /* TokenType.ArrayBegin */, offset: startPos };
        }
        if (b === 0x5d) {
            this.pos++;
            return { type: 10 /* TokenType.ArrayEnd */, offset: startPos };
        }
        // Name
        if (b === 0x2f) {
            return this.readName(startPos);
        }
        // Number (digit, sign, or decimal point)
        if (isDigit(b) || b === 0x2d || b === 0x2b || b === 0x2e) {
            return this.readNumber(startPos);
        }
        // Regular character — keyword (obj, endobj, true, false, null, etc.).
        // Stray ')' '{' '}' also land here and become one-character keywords.
        return this.readKeyword(startPos);
    }
    // ===========================================================================
    // Skip Whitespace and Comments
    // ===========================================================================
    /**
     * Advance past any run of whitespace and `%` comments. A comment extends
     * up to (not including) the next LF or CR, which is then skipped as
     * ordinary whitespace.
     */
    skipWhitespaceAndComments() {
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isWhitespace(b)) {
                this.pos++;
                continue;
            }
            // PDF comment: % ... EOL
            if (b === 0x25) {
                this.pos++;
                while (this.pos < this.data.length) {
                    const c = this.data[this.pos];
                    if (c === 0x0a || c === 0x0d) {
                        break;
                    }
                    this.pos++;
                }
                continue;
            }
            break;
        }
    }
    // ===========================================================================
    // Literal String (...)
    // ===========================================================================
    /**
     * Read a literal string starting at the opening '('.
     *
     * Handles balanced nested parentheses, backslash escapes (including up
     * to three octal digits and line continuations) and normalizes bare
     * CR / CRLF inside the string to LF. An unterminated string yields the
     * bytes collected up to the end of input.
     *
     * @param startPos - Offset recorded on the returned token.
     * @returns LiteralString token with the decoded bytes in `rawBytes`.
     */
    readLiteralString(startPos) {
        this.pos++; // skip opening '('
        const bytes = [];
        let depth = 1;
        while (this.pos < this.data.length && depth > 0) {
            const b = this.data[this.pos];
            if (b === 0x5c) {
                // Backslash escape
                this.pos++;
                if (this.pos >= this.data.length) {
                    break;
                }
                const esc = this.data[this.pos];
                switch (esc) {
                    case 0x6e:
                        bytes.push(0x0a);
                        break; // \n
                    case 0x72:
                        bytes.push(0x0d);
                        break; // \r
                    case 0x74:
                        bytes.push(0x09);
                        break; // \t
                    case 0x62:
                        bytes.push(0x08);
                        break; // \b
                    case 0x66:
                        bytes.push(0x0c);
                        break; // \f
                    case 0x28:
                        bytes.push(0x28);
                        break; // \(
                    case 0x29:
                        bytes.push(0x29);
                        break; // \)
                    case 0x5c:
                        bytes.push(0x5c);
                        break; // \\
                    case 0x0a: // \<LF> — line continuation
                        break;
                    case 0x0d: // \<CR> or \<CR><LF> — line continuation
                        if (this.pos + 1 < this.data.length && this.data[this.pos + 1] === 0x0a) {
                            this.pos++;
                        }
                        break;
                    default:
                        // Octal escape: up to 3 octal digits
                        if (esc >= 0x30 && esc <= 0x37) {
                            let octal = esc - 0x30;
                            if (this.pos + 1 < this.data.length &&
                                this.data[this.pos + 1] >= 0x30 &&
                                this.data[this.pos + 1] <= 0x37) {
                                this.pos++;
                                octal = octal * 8 + (this.data[this.pos] - 0x30);
                                if (this.pos + 1 < this.data.length &&
                                    this.data[this.pos + 1] >= 0x30 &&
                                    this.data[this.pos + 1] <= 0x37) {
                                    this.pos++;
                                    octal = octal * 8 + (this.data[this.pos] - 0x30);
                                }
                            }
                            // Three octal digits can exceed 255; keep the low byte.
                            bytes.push(octal & 0xff);
                        }
                        else {
                            // Unknown escape — just include the character
                            bytes.push(esc);
                        }
                        break;
                }
                this.pos++;
            }
            else if (b === 0x28) {
                // Nested (
                depth++;
                bytes.push(b);
                this.pos++;
            }
            else if (b === 0x29) {
                // Closing ) — only the outermost one is dropped from the content
                depth--;
                if (depth > 0) {
                    bytes.push(b);
                }
                this.pos++;
            }
            else {
                // Normalize line endings: \r\n or \r → \n
                if (b === 0x0d) {
                    bytes.push(0x0a);
                    this.pos++;
                    if (this.pos < this.data.length && this.data[this.pos] === 0x0a) {
                        this.pos++;
                    }
                }
                else {
                    bytes.push(b);
                    this.pos++;
                }
            }
        }
        const rawBytes = new Uint8Array(bytes);
        return { type: 1 /* TokenType.LiteralString */, rawBytes, offset: startPos };
    }
    // ===========================================================================
    // Hex String <...>
    // ===========================================================================
    /**
     * Read a hex string starting at the opening '<'.
     *
     * Whitespace and any non-hex bytes between the brackets are skipped.
     * An odd trailing digit is treated as the high nibble of a final byte
     * whose low nibble is 0.
     *
     * @param startPos - Offset recorded on the returned token.
     * @returns HexString token with the decoded bytes in `rawBytes`.
     */
    readHexString(startPos) {
        this.pos++; // skip opening '<'
        const hexBytes = [];
        let highNibble = -1; // -1 means "no pending high nibble"
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (b === 0x3e) {
                this.pos++; // skip closing '>'
                break;
            }
            if (isWhitespace(b)) {
                this.pos++;
                continue;
            }
            if (isHexDigit(b)) {
                if (highNibble < 0) {
                    highNibble = hexVal(b);
                }
                else {
                    hexBytes.push((highNibble << 4) | hexVal(b));
                    highNibble = -1;
                }
            }
            this.pos++;
        }
        // Odd number of hex digits — pad with 0
        if (highNibble >= 0) {
            hexBytes.push(highNibble << 4);
        }
        const rawBytes = new Uint8Array(hexBytes);
        return { type: 2 /* TokenType.HexString */, rawBytes, offset: startPos };
    }
    // ===========================================================================
    // Name /...
    // ===========================================================================
    /**
     * Read a name object starting at the '/'.
     *
     * The name ends at the next whitespace or delimiter byte. `#XX`
     * sequences with two hex digits are decoded to a single byte; a '#'
     * not followed by two hex digits is kept literally.
     *
     * @param startPos - Offset recorded on the returned token.
     * @returns Name token whose `strValue` excludes the leading '/'.
     */
    readName(startPos) {
        this.pos++; // skip '/'
        const chars = [];
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isWhitespace(b) || isDelimiter(b)) {
                break;
            }
            if (b === 0x23 && this.pos + 2 < this.data.length) {
                // #XX escape
                const h1 = this.data[this.pos + 1];
                const h2 = this.data[this.pos + 2];
                if (isHexDigit(h1) && isHexDigit(h2)) {
                    chars.push((hexVal(h1) << 4) | hexVal(h2));
                    this.pos += 3;
                    continue;
                }
            }
            chars.push(b);
            this.pos++;
        }
        const name = bytesToBinaryString(chars);
        return { type: 3 /* TokenType.Name */, strValue: name, offset: startPos };
    }
    // ===========================================================================
    // Number
    // ===========================================================================
    /**
     * Read an integer or real number: optional sign, digits, and at most
     * one decimal point.
     *
     * If the scanned text contains no digit at all (for example a lone
     * "+", "-", "." or "-."), the position is rewound and the text is
     * tokenized as a keyword instead, so the sign or dot is part of the
     * keyword rather than silently dropped or turned into NaN.
     *
     * @param startPos - Offset recorded on the returned token.
     * @returns Number token, or the result of `readKeyword` when no digit is present.
     */
    readNumber(startPos) {
        const begin = this.pos;
        let numStr = "";
        const first = this.data[this.pos];
        // Sign
        if (first === 0x2d || first === 0x2b) {
            numStr += String.fromCharCode(first);
            this.pos++;
        }
        let hasDecimal = first === 0x2e;
        if (hasDecimal) {
            numStr += ".";
            this.pos++;
        }
        let digitCount = 0;
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isDigit(b)) {
                numStr += String.fromCharCode(b);
                digitCount++;
                this.pos++;
            }
            else if (b === 0x2e && !hasDecimal) {
                hasDecimal = true;
                numStr += ".";
                this.pos++;
            }
            else {
                break;
            }
        }
        // Edge case: sign and/or dot with no digits — not a number, treat as keyword.
        // Rewind first so the keyword includes the characters consumed above.
        if (digitCount === 0) {
            this.pos = begin;
            return this.readKeyword(startPos);
        }
        const numValue = hasDecimal ? parseFloat(numStr) : parseInt(numStr, 10);
        return { type: 0 /* TokenType.Number */, numValue, offset: startPos };
    }
    // ===========================================================================
    // Keyword
    // ===========================================================================
    /**
     * Read a run of regular characters up to the next whitespace or
     * delimiter and classify it as `true` / `false` (Boolean), `null`
     * (Null) or a generic Keyword.
     *
     * When the current byte is itself a delimiter (reached from `next()`
     * for ')', '{' and '}'), that single byte is consumed and returned as a
     * one-character keyword. Without this the position would not advance
     * and callers looping on `next()` would never reach EOF.
     *
     * @param startPos - Offset recorded on the returned token.
     * @returns Boolean, Null or Keyword token.
     */
    readKeyword(startPos) {
        const begin = this.pos;
        const chars = [];
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            if (isWhitespace(b) || isDelimiter(b)) {
                break;
            }
            chars.push(b);
            this.pos++;
        }
        if (this.pos === begin && this.pos < this.data.length && isDelimiter(this.data[this.pos])) {
            // Guarantee forward progress on an otherwise unhandled delimiter.
            chars.push(this.data[this.pos]);
            this.pos++;
        }
        const word = bytesToBinaryString(chars);
        if (word === "true") {
            return { type: 4 /* TokenType.Boolean */, boolValue: true, strValue: "true", offset: startPos };
        }
        if (word === "false") {
            return { type: 4 /* TokenType.Boolean */, boolValue: false, strValue: "false", offset: startPos };
        }
        if (word === "null") {
            return { type: 5 /* TokenType.Null */, offset: startPos };
        }
        return { type: 6 /* TokenType.Keyword */, strValue: word, offset: startPos };
    }
    // ===========================================================================
    // Utility: Find a byte sequence
    // ===========================================================================
    /**
     * Search forward for a byte sequence starting from `from` (or the
     * current position when `from` is null/undefined).
     * Returns the offset where the sequence starts, or -1 if not found.
     * Does NOT advance the position.
     */
    findSequence(seq, from) {
        const start = from ?? this.pos;
        const len = seq.length;
        const end = this.data.length - len; // last offset at which seq can still fit
        for (let i = start; i <= end; i++) {
            let match = true;
            for (let j = 0; j < len; j++) {
                if (this.data[i + j] !== seq[j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                return i;
            }
        }
        return -1;
    }
    /**
     * Search backward for a byte sequence. `from` (default: last byte of
     * data) is the offset of the last byte a match is allowed to cover.
     * Returns the offset where the sequence starts, or -1 if not found.
     * Does NOT advance the position.
     */
    findSequenceBackward(seq, from) {
        const start = from ?? this.data.length - 1;
        const len = seq.length;
        for (let i = start - len + 1; i >= 0; i--) {
            let match = true;
            for (let j = 0; j < len; j++) {
                if (this.data[i + j] !== seq[j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                return i;
            }
        }
        return -1;
    }
    /**
     * Read a line of text at the current position. Advances past the line
     * ending (LF, CR or CRLF), which is not included in the result. Each
     * byte becomes one character of the returned string.
     */
    readLine() {
        const chars = [];
        while (this.pos < this.data.length) {
            const b = this.data[this.pos];
            this.pos++;
            if (b === 0x0a) {
                break;
            }
            if (b === 0x0d) {
                if (this.pos < this.data.length && this.data[this.pos] === 0x0a) {
                    this.pos++;
                }
                break;
            }
            chars.push(b);
        }
        return bytesToBinaryString(chars);
    }
    /**
     * Extract a slice of the underlying data (a `subarray` view, not a copy).
     */
    slice(start, end) {
        return this.data.subarray(start, end);
    }
    /**
     * Read the stream content following a `stream` keyword.
     * The tokenizer should be positioned right after the `stream` keyword.
     * Returns the raw stream bytes (between stream\n and endstream) as a
     * `subarray` view and leaves the position after `endstream` when that
     * keyword is found.
     *
     * @param length - Stream length in bytes. A negative or non-finite value
     *   means "unknown": the content then runs up to the next `endstream`
     *   marker, minus one trailing EOL.
     * @throws PdfStructureError when the length is unknown and no
     *   `endstream` marker exists after the current position.
     */
    readStreamContent(length) {
        // Skip the EOL after "stream" keyword
        if (this.pos < this.data.length && this.data[this.pos] === 0x0d) {
            this.pos++;
        }
        if (this.pos < this.data.length && this.data[this.pos] === 0x0a) {
            this.pos++;
        }
        // NaN/undefined would otherwise poison `this.pos` via `pos += length`.
        if (!Number.isFinite(length) || length < 0) {
            // Length unknown — search for endstream
            const endPos = this.findSequence(ENDSTREAM_BYTES, this.pos);
            if (endPos < 0) {
                throw new errors_1.PdfStructureError("Could not find endstream marker");
            }
            let streamEnd = endPos;
            // Strip trailing EOL before endstream
            if (streamEnd > this.pos && this.data[streamEnd - 1] === 0x0a) {
                streamEnd--;
            }
            if (streamEnd > this.pos && this.data[streamEnd - 1] === 0x0d) {
                streamEnd--;
            }
            const content = this.data.subarray(this.pos, streamEnd);
            this.pos = endPos + ENDSTREAM_BYTES.length;
            return content;
        }
        const content = this.data.subarray(this.pos, this.pos + length);
        this.pos += length;
        // Skip to endstream
        this.skipWhitespaceAndComments();
        // Try to consume "endstream" keyword; left in place if it does not match
        if (this.pos + ENDSTREAM_BYTES.length <= this.data.length) {
            let match = true;
            for (let i = 0; i < ENDSTREAM_BYTES.length; i++) {
                if (this.data[this.pos + i] !== ENDSTREAM_BYTES[i]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                this.pos += ENDSTREAM_BYTES.length;
            }
        }
        return content;
    }
}
/**
 * Convert byte values to a string, one character per byte.
 *
 * Works in bounded chunks: passing a very large array to
 * String.fromCharCode in a single call can exceed the engine's argument
 * limit and throw a RangeError.
 *
 * @param codes - Array of byte values.
 * @returns String with `codes.length` characters.
 */
function bytesToBinaryString(codes) {
    const CHUNK_SIZE = 0x2000;
    let text = "";
    for (let i = 0; i < codes.length; i += CHUNK_SIZE) {
        text += String.fromCharCode.apply(null, codes.slice(i, i + CHUNK_SIZE));
    }
    return text;
}
|
|
517
|
+
exports.PdfTokenizer = PdfTokenizer;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Shared utility functions for PDF reader modules.
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.getDictStringValue = getDictStringValue;
|
|
7
|
+
const pdf_parser_1 = require("./pdf-parser");
|
|
8
|
+
/**
 * Look up `key` in a PDF dictionary and return its value as a string.
 *
 * The entry is first resolved through `doc.deref`. A resolved JavaScript
 * string is returned unchanged; a resolved Uint8Array is decoded with
 * `decodePdfStringBytes` from the parser module.
 *
 * @param dict - The PDF dictionary (queried with `dict.get(key)`)
 * @param key - The key to look up
 * @param doc - The PDF document used to resolve the entry via `doc.deref`
 * @returns The string value, or empty string when the entry is missing,
 *   falsy, or resolves to anything other than a string or Uint8Array
 */
function getDictStringValue(dict, key, doc) {
    const entry = dict.get(key);
    if (entry) {
        const target = doc.deref(entry);
        if (target instanceof Uint8Array) {
            return (0, pdf_parser_1.decodePdfStringBytes)(target);
        }
        if (typeof target === "string") {
            return target;
        }
    }
    return "";
}
|