pdf-lite 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/EXAMPLES.md +1 -1
- package/dist/acroform/acroform.d.ts +21 -0
- package/dist/acroform/acroform.js +135 -7
- package/dist/acroform/manager.d.ts +2 -2
- package/dist/acroform/manager.js +3 -3
- package/dist/core/decoder.js +2 -2
- package/dist/core/objects/pdf-hexadecimal.d.ts +6 -1
- package/dist/core/objects/pdf-hexadecimal.js +11 -3
- package/dist/core/objects/pdf-string.d.ts +11 -1
- package/dist/core/objects/pdf-string.js +21 -6
- package/dist/core/tokeniser.js +35 -78
- package/dist/core/tokens/hexadecimal-token.d.ts +8 -1
- package/dist/core/tokens/hexadecimal-token.js +20 -2
- package/dist/core/tokens/name-token.js +0 -3
- package/dist/core/tokens/string-token.d.ts +8 -1
- package/dist/core/tokens/string-token.js +20 -2
- package/dist/pdf/pdf-document.d.ts +6 -6
- package/dist/pdf/pdf-document.js +21 -21
- package/dist/utils/decodeWithFontEncoding.d.ts +20 -0
- package/dist/utils/decodeWithFontEncoding.js +67 -0
- package/dist/utils/escapeString.d.ts +1 -1
- package/dist/utils/escapeString.js +12 -3
- package/dist/utils/glyphNameToUnicode.d.ts +10 -0
- package/dist/utils/glyphNameToUnicode.js +4292 -0
- package/package.json +1 -1
package/EXAMPLES.md
CHANGED
|
@@ -1109,7 +1109,7 @@ console.log('Created form-empty.pdf with empty form fields')
|
|
|
1109
1109
|
const emptyFormBytes = await fs.readFile(`${tmpFolder}/form-empty.pdf`)
|
|
1110
1110
|
const filledDocument = await PdfDocument.fromBytes([emptyFormBytes])
|
|
1111
1111
|
|
|
1112
|
-
const acroform = await filledDocument.acroForm.
|
|
1112
|
+
const acroform = await filledDocument.acroForm.read()
|
|
1113
1113
|
if (!acroform) {
|
|
1114
1114
|
throw new Error('No AcroForm found in the document')
|
|
1115
1115
|
}
|
|
@@ -37,8 +37,10 @@ export declare class PdfAcroFormField extends PdfDictionary<{
|
|
|
37
37
|
}> {
|
|
38
38
|
parent?: PdfAcroFormField;
|
|
39
39
|
readonly container?: PdfIndirectObject;
|
|
40
|
+
form?: PdfAcroForm;
|
|
40
41
|
constructor(options?: {
|
|
41
42
|
container?: PdfIndirectObject;
|
|
43
|
+
form?: PdfAcroForm;
|
|
42
44
|
});
|
|
43
45
|
/**
|
|
44
46
|
* Gets the field type
|
|
@@ -68,6 +70,11 @@ export declare class PdfAcroFormField extends PdfDictionary<{
|
|
|
68
70
|
*/
|
|
69
71
|
set defaultValue(val: string);
|
|
70
72
|
get value(): string;
|
|
73
|
+
/**
|
|
74
|
+
* Gets the cached encoding map for this field's font, if available.
|
|
75
|
+
* Returns undefined if no encoding has been cached yet.
|
|
76
|
+
*/
|
|
77
|
+
private getCachedEncodingMap;
|
|
71
78
|
set value(val: string);
|
|
72
79
|
get checked(): boolean;
|
|
73
80
|
set checked(isChecked: boolean);
|
|
@@ -132,10 +139,13 @@ export declare class PdfAcroForm<T extends Record<string, string> = Record<strin
|
|
|
132
139
|
}> {
|
|
133
140
|
fields: PdfAcroFormField[];
|
|
134
141
|
readonly container?: PdfIndirectObject;
|
|
142
|
+
readonly fontEncodingMaps: Map<string, Map<number, string> | null>;
|
|
143
|
+
private document?;
|
|
135
144
|
constructor(options: {
|
|
136
145
|
dict: PdfDictionary;
|
|
137
146
|
fields?: PdfAcroFormField[];
|
|
138
147
|
container?: PdfIndirectObject;
|
|
148
|
+
document?: PdfDocument;
|
|
139
149
|
});
|
|
140
150
|
/**
|
|
141
151
|
* Gets the NeedAppearances flag
|
|
@@ -177,7 +187,18 @@ export declare class PdfAcroForm<T extends Record<string, string> = Record<strin
|
|
|
177
187
|
setValues(values: Partial<T>): void;
|
|
178
188
|
importData(fields: T): void;
|
|
179
189
|
exportData(): Partial<T>;
|
|
190
|
+
/**
|
|
191
|
+
* Gets the encoding map for a specific font in the form's resources.
|
|
192
|
+
* Returns null if no custom encoding is found.
|
|
193
|
+
* Results are cached for performance.
|
|
194
|
+
*/
|
|
195
|
+
getFontEncodingMap(fontName: string): Promise<Map<number, string> | null>;
|
|
180
196
|
static fromDocument(document: PdfDocument): Promise<PdfAcroForm | null>;
|
|
197
|
+
/**
|
|
198
|
+
* Pre-caches encoding maps for all fonts used in the form fields.
|
|
199
|
+
* This makes subsequent field value access faster and synchronous.
|
|
200
|
+
*/
|
|
201
|
+
private cacheAllFontEncodings;
|
|
181
202
|
/**
|
|
182
203
|
* Gets or creates the Annots array for a page.
|
|
183
204
|
* Returns the array and metadata about whether it's an indirect object.
|
|
@@ -6,6 +6,7 @@ import { PdfIndirectObject } from '../core/objects/pdf-indirect-object.js';
|
|
|
6
6
|
import { PdfName } from '../core/objects/pdf-name.js';
|
|
7
7
|
import { PdfBoolean } from '../core/objects/pdf-boolean.js';
|
|
8
8
|
import { PdfNumber } from '../core/objects/pdf-number.js';
|
|
9
|
+
import { buildEncodingMap, decodeWithFontEncoding, } from '../utils/decodeWithFontEncoding.js';
|
|
9
10
|
/**
|
|
10
11
|
* Field types for AcroForm fields
|
|
11
12
|
*/
|
|
@@ -18,9 +19,11 @@ export const PdfFieldType = {
|
|
|
18
19
|
export class PdfAcroFormField extends PdfDictionary {
|
|
19
20
|
parent;
|
|
20
21
|
container;
|
|
22
|
+
form;
|
|
21
23
|
constructor(options) {
|
|
22
24
|
super();
|
|
23
25
|
this.container = options?.container;
|
|
26
|
+
this.form = options?.form;
|
|
24
27
|
}
|
|
25
28
|
/**
|
|
26
29
|
* Gets the field type
|
|
@@ -81,7 +84,12 @@ export class PdfAcroFormField extends PdfDictionary {
|
|
|
81
84
|
* Gets the field name
|
|
82
85
|
*/
|
|
83
86
|
get name() {
|
|
84
|
-
|
|
87
|
+
const parentName = this.parent?.name ?? '';
|
|
88
|
+
const ownName = this.get('T')?.as(PdfString)?.value ?? '';
|
|
89
|
+
if (parentName && ownName) {
|
|
90
|
+
return `${parentName}.${ownName}`;
|
|
91
|
+
}
|
|
92
|
+
return parentName || ownName;
|
|
85
93
|
}
|
|
86
94
|
/**
|
|
87
95
|
* Sets the field name
|
|
@@ -117,6 +125,15 @@ export class PdfAcroFormField extends PdfDictionary {
|
|
|
117
125
|
get value() {
|
|
118
126
|
const v = this.get('V');
|
|
119
127
|
if (v instanceof PdfString) {
|
|
128
|
+
// UTF-16BE strings should always use UTF-16BE decoding regardless of font encoding
|
|
129
|
+
if (v.isUTF16BE) {
|
|
130
|
+
return v.value; // Use PdfString's built-in UTF-16BE decoder
|
|
131
|
+
}
|
|
132
|
+
// Try to use custom font encoding if available
|
|
133
|
+
const encodingMap = this.getCachedEncodingMap();
|
|
134
|
+
if (encodingMap !== undefined) {
|
|
135
|
+
return decodeWithFontEncoding(v.raw, encodingMap);
|
|
136
|
+
}
|
|
120
137
|
return v.value;
|
|
121
138
|
}
|
|
122
139
|
else if (v instanceof PdfName) {
|
|
@@ -124,6 +141,24 @@ export class PdfAcroFormField extends PdfDictionary {
|
|
|
124
141
|
}
|
|
125
142
|
return '';
|
|
126
143
|
}
|
|
144
|
+
/**
|
|
145
|
+
* Gets the cached encoding map for this field's font, if available.
|
|
146
|
+
* Returns undefined if no encoding has been cached yet.
|
|
147
|
+
*/
|
|
148
|
+
getCachedEncodingMap() {
|
|
149
|
+
if (!this.form)
|
|
150
|
+
return undefined;
|
|
151
|
+
// Parse font name from DA (default appearance) string
|
|
152
|
+
const da = this.get('DA')?.as(PdfString)?.value;
|
|
153
|
+
if (!da)
|
|
154
|
+
return undefined;
|
|
155
|
+
// Extract font name from DA string (format: /FontName size Tf ...)
|
|
156
|
+
const fontMatch = da.match(/\/(\w+)\s+[\d.]+\s+Tf/);
|
|
157
|
+
if (!fontMatch)
|
|
158
|
+
return undefined;
|
|
159
|
+
const fontName = fontMatch[1];
|
|
160
|
+
return this.form.fontEncodingMaps.get(fontName);
|
|
161
|
+
}
|
|
127
162
|
set value(val) {
|
|
128
163
|
const fieldType = this.get('FT')?.as(PdfName)?.value;
|
|
129
164
|
if (fieldType === PdfFieldType.Button) {
|
|
@@ -298,11 +333,14 @@ export class PdfAcroFormField extends PdfDictionary {
|
|
|
298
333
|
export class PdfAcroForm extends PdfDictionary {
|
|
299
334
|
fields;
|
|
300
335
|
container;
|
|
336
|
+
fontEncodingMaps = new Map();
|
|
337
|
+
document;
|
|
301
338
|
constructor(options) {
|
|
302
339
|
super();
|
|
303
340
|
this.copyFrom(options.dict);
|
|
304
341
|
this.fields = options.fields ?? [];
|
|
305
342
|
this.container = options.container;
|
|
343
|
+
this.document = options.document;
|
|
306
344
|
}
|
|
307
345
|
/**
|
|
308
346
|
* Gets the NeedAppearances flag
|
|
@@ -383,6 +421,69 @@ export class PdfAcroForm extends PdfDictionary {
|
|
|
383
421
|
}
|
|
384
422
|
return result;
|
|
385
423
|
}
|
|
424
|
+
/**
|
|
425
|
+
* Gets the encoding map for a specific font in the form's resources.
|
|
426
|
+
* Returns null if no custom encoding is found.
|
|
427
|
+
* Results are cached for performance.
|
|
428
|
+
*/
|
|
429
|
+
async getFontEncodingMap(fontName) {
|
|
430
|
+
// Check cache first
|
|
431
|
+
if (this.fontEncodingMaps.has(fontName)) {
|
|
432
|
+
return this.fontEncodingMaps.get(fontName);
|
|
433
|
+
}
|
|
434
|
+
// Get the font from DR (default resources)
|
|
435
|
+
const dr = this.get('DR')?.as(PdfDictionary);
|
|
436
|
+
if (!dr) {
|
|
437
|
+
this.fontEncodingMaps.set(fontName, null);
|
|
438
|
+
return null;
|
|
439
|
+
}
|
|
440
|
+
const fonts = dr.get('Font')?.as(PdfDictionary);
|
|
441
|
+
if (!fonts) {
|
|
442
|
+
this.fontEncodingMaps.set(fontName, null);
|
|
443
|
+
return null;
|
|
444
|
+
}
|
|
445
|
+
const fontRef = fonts.get(fontName)?.as(PdfObjectReference);
|
|
446
|
+
if (!fontRef || !this.document) {
|
|
447
|
+
this.fontEncodingMaps.set(fontName, null);
|
|
448
|
+
return null;
|
|
449
|
+
}
|
|
450
|
+
// Read the font object
|
|
451
|
+
const fontObj = await this.document.readObject({
|
|
452
|
+
objectNumber: fontRef.objectNumber,
|
|
453
|
+
generationNumber: fontRef.generationNumber,
|
|
454
|
+
});
|
|
455
|
+
if (!fontObj) {
|
|
456
|
+
this.fontEncodingMaps.set(fontName, null);
|
|
457
|
+
return null;
|
|
458
|
+
}
|
|
459
|
+
const fontDict = fontObj.content.as(PdfDictionary);
|
|
460
|
+
const encoding = fontDict.get('Encoding');
|
|
461
|
+
// Handle encoding reference
|
|
462
|
+
let encodingDict = null;
|
|
463
|
+
if (encoding instanceof PdfObjectReference) {
|
|
464
|
+
const encodingObj = await this.document.readObject({
|
|
465
|
+
objectNumber: encoding.objectNumber,
|
|
466
|
+
generationNumber: encoding.generationNumber,
|
|
467
|
+
});
|
|
468
|
+
encodingDict = encodingObj?.content.as(PdfDictionary) ?? null;
|
|
469
|
+
}
|
|
470
|
+
else if (encoding instanceof PdfDictionary) {
|
|
471
|
+
encodingDict = encoding;
|
|
472
|
+
}
|
|
473
|
+
if (!encodingDict) {
|
|
474
|
+
this.fontEncodingMaps.set(fontName, null);
|
|
475
|
+
return null;
|
|
476
|
+
}
|
|
477
|
+
// Parse the Differences array
|
|
478
|
+
const differences = encodingDict.get('Differences')?.as(PdfArray);
|
|
479
|
+
if (!differences) {
|
|
480
|
+
this.fontEncodingMaps.set(fontName, null);
|
|
481
|
+
return null;
|
|
482
|
+
}
|
|
483
|
+
const encodingMap = buildEncodingMap(differences);
|
|
484
|
+
this.fontEncodingMaps.set(fontName, encodingMap);
|
|
485
|
+
return encodingMap;
|
|
486
|
+
}
|
|
386
487
|
static async fromDocument(document) {
|
|
387
488
|
const catalog = document.rootDictionary;
|
|
388
489
|
if (!catalog)
|
|
@@ -413,14 +514,16 @@ export class PdfAcroForm extends PdfDictionary {
|
|
|
413
514
|
const acroForm = new PdfAcroForm({
|
|
414
515
|
dict: acroFormDict,
|
|
415
516
|
container: acroFormContainer,
|
|
517
|
+
document,
|
|
416
518
|
});
|
|
417
|
-
const
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
519
|
+
const fields = new Map();
|
|
520
|
+
const getFields = async (fieldRefs, parent) => {
|
|
521
|
+
for (const fieldRef of fieldRefs.items) {
|
|
522
|
+
const refKey = fieldRef.toString().trim();
|
|
523
|
+
if (fields.has(refKey)) {
|
|
524
|
+
fields.get(refKey).parent = parent;
|
|
421
525
|
continue;
|
|
422
526
|
}
|
|
423
|
-
seen.add(refKey);
|
|
424
527
|
const fieldObject = await document.readObject({
|
|
425
528
|
objectNumber: fieldRef.objectNumber,
|
|
426
529
|
generationNumber: fieldRef.generationNumber,
|
|
@@ -431,15 +534,17 @@ export class PdfAcroForm extends PdfDictionary {
|
|
|
431
534
|
continue;
|
|
432
535
|
const field = new PdfAcroFormField({
|
|
433
536
|
container: fieldObject,
|
|
537
|
+
form: acroForm,
|
|
434
538
|
});
|
|
435
539
|
field.parent = parent;
|
|
436
540
|
field.copyFrom(fieldObject.content);
|
|
437
541
|
// Process child fields (Kids) before adding the parent
|
|
438
542
|
const kids = field.get('Kids')?.as((PdfArray));
|
|
439
543
|
if (kids) {
|
|
440
|
-
await getFields(kids,
|
|
544
|
+
await getFields(kids, field);
|
|
441
545
|
}
|
|
442
546
|
acroForm.fields.push(field);
|
|
547
|
+
fields.set(refKey, field);
|
|
443
548
|
}
|
|
444
549
|
};
|
|
445
550
|
const fieldsArray = new PdfArray();
|
|
@@ -459,8 +564,31 @@ export class PdfAcroForm extends PdfDictionary {
|
|
|
459
564
|
}
|
|
460
565
|
}
|
|
461
566
|
await getFields(fieldsArray);
|
|
567
|
+
// Pre-cache font encoding maps for all fonts used in fields
|
|
568
|
+
await acroForm.cacheAllFontEncodings();
|
|
462
569
|
return acroForm;
|
|
463
570
|
}
|
|
571
|
+
/**
|
|
572
|
+
* Pre-caches encoding maps for all fonts used in the form fields.
|
|
573
|
+
* This makes subsequent field value access faster and synchronous.
|
|
574
|
+
*/
|
|
575
|
+
async cacheAllFontEncodings() {
|
|
576
|
+
const fontNames = new Set();
|
|
577
|
+
// Collect all font names from field DA strings
|
|
578
|
+
for (const field of this.fields) {
|
|
579
|
+
const da = field.get('DA')?.as(PdfString)?.value;
|
|
580
|
+
if (da) {
|
|
581
|
+
const fontMatch = da.match(/\/(\w+)\s+[\d.]+\s+Tf/);
|
|
582
|
+
if (fontMatch) {
|
|
583
|
+
fontNames.add(fontMatch[1]);
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
// Pre-cache encoding for each font
|
|
588
|
+
for (const fontName of fontNames) {
|
|
589
|
+
await this.getFontEncodingMap(fontName);
|
|
590
|
+
}
|
|
591
|
+
}
|
|
464
592
|
/**
|
|
465
593
|
* Gets or creates the Annots array for a page.
|
|
466
594
|
* Returns the array and metadata about whether it's an indirect object.
|
|
@@ -11,12 +11,12 @@ export declare class PdfAcroFormManager {
|
|
|
11
11
|
* Checks if the document contains AcroForm fields.
|
|
12
12
|
* @returns True if the document has AcroForm fields, false otherwise
|
|
13
13
|
*/
|
|
14
|
-
|
|
14
|
+
exists(): Promise<boolean>;
|
|
15
15
|
/**
|
|
16
16
|
* Gets the AcroForm object from the document catalog.
|
|
17
17
|
* @returns The AcroForm object or null if not found
|
|
18
18
|
*/
|
|
19
|
-
|
|
19
|
+
read(): Promise<PdfAcroForm | null>;
|
|
20
20
|
/**
|
|
21
21
|
* Writes the provided AcroForm to the associated PDF document.
|
|
22
22
|
* @param acroForm The AcroForm instance to serialize into the document.
|
package/dist/acroform/manager.js
CHANGED
|
@@ -12,9 +12,9 @@ export class PdfAcroFormManager {
|
|
|
12
12
|
* Checks if the document contains AcroForm fields.
|
|
13
13
|
* @returns True if the document has AcroForm fields, false otherwise
|
|
14
14
|
*/
|
|
15
|
-
async
|
|
15
|
+
async exists() {
|
|
16
16
|
try {
|
|
17
|
-
const acroForm = await this.
|
|
17
|
+
const acroForm = await this.read();
|
|
18
18
|
return acroForm !== null;
|
|
19
19
|
}
|
|
20
20
|
catch {
|
|
@@ -25,7 +25,7 @@ export class PdfAcroFormManager {
|
|
|
25
25
|
* Gets the AcroForm object from the document catalog.
|
|
26
26
|
* @returns The AcroForm object or null if not found
|
|
27
27
|
*/
|
|
28
|
-
async
|
|
28
|
+
async read() {
|
|
29
29
|
return await PdfAcroForm.fromDocument(this.document);
|
|
30
30
|
}
|
|
31
31
|
/**
|
package/dist/core/decoder.js
CHANGED
|
@@ -179,7 +179,7 @@ export class PdfDecoder extends IncrementalParser {
|
|
|
179
179
|
out = new PdfBoolean(token.value);
|
|
180
180
|
}
|
|
181
181
|
else if (token instanceof PdfHexadecimalToken) {
|
|
182
|
-
out = new PdfHexadecimal(token.raw, 'hex');
|
|
182
|
+
out = new PdfHexadecimal(token.raw, 'hex', token.originalBytes);
|
|
183
183
|
}
|
|
184
184
|
else if (token instanceof PdfNullToken) {
|
|
185
185
|
out = new PdfNull();
|
|
@@ -188,7 +188,7 @@ export class PdfDecoder extends IncrementalParser {
|
|
|
188
188
|
out = new PdfObjectReference(token.objectNumber, token.generationNumber);
|
|
189
189
|
}
|
|
190
190
|
else if (token instanceof PdfStringToken) {
|
|
191
|
-
out = new PdfString(token.value);
|
|
191
|
+
out = new PdfString(token.value, token.originalBytes);
|
|
192
192
|
}
|
|
193
193
|
else {
|
|
194
194
|
throw new Error(`Unknown primitive token type: ${token.type}`);
|
|
@@ -7,7 +7,12 @@ export declare class PdfHexadecimal extends PdfObject {
|
|
|
7
7
|
* NB: This is the hexadecimal representation, not the actual byte values.
|
|
8
8
|
*/
|
|
9
9
|
raw: ByteArray;
|
|
10
|
-
|
|
10
|
+
/**
|
|
11
|
+
* Original bytes from the PDF file, including angle brackets.
|
|
12
|
+
* Used to preserve exact formatting for incremental updates.
|
|
13
|
+
*/
|
|
14
|
+
private _originalBytes?;
|
|
15
|
+
constructor(value: string | ByteArray, format?: 'hex' | 'bytes', originalBytes?: ByteArray);
|
|
11
16
|
static toHexadecimal(data: string | ByteArray): PdfHexadecimal;
|
|
12
17
|
get bytes(): ByteArray;
|
|
13
18
|
toHexBytes(): ByteArray;
|
|
@@ -10,7 +10,12 @@ export class PdfHexadecimal extends PdfObject {
|
|
|
10
10
|
* NB: This is the hexadecimal representation, not the actual byte values.
|
|
11
11
|
*/
|
|
12
12
|
raw;
|
|
13
|
-
|
|
13
|
+
/**
|
|
14
|
+
* Original bytes from the PDF file, including angle brackets.
|
|
15
|
+
* Used to preserve exact formatting for incremental updates.
|
|
16
|
+
*/
|
|
17
|
+
_originalBytes;
|
|
18
|
+
constructor(value, format = 'hex', originalBytes) {
|
|
14
19
|
super();
|
|
15
20
|
let bytes;
|
|
16
21
|
if (format === 'bytes') {
|
|
@@ -20,6 +25,7 @@ export class PdfHexadecimal extends PdfObject {
|
|
|
20
25
|
bytes = value instanceof Uint8Array ? value : stringToBytes(value);
|
|
21
26
|
}
|
|
22
27
|
this.raw = bytes;
|
|
28
|
+
this._originalBytes = originalBytes;
|
|
23
29
|
}
|
|
24
30
|
static toHexadecimal(data) {
|
|
25
31
|
return new PdfHexadecimal(data, 'bytes');
|
|
@@ -34,9 +40,11 @@ export class PdfHexadecimal extends PdfObject {
|
|
|
34
40
|
return bytesToString(this.toHexBytes());
|
|
35
41
|
}
|
|
36
42
|
tokenize() {
|
|
37
|
-
return [new PdfHexadecimalToken(this.raw)];
|
|
43
|
+
return [new PdfHexadecimalToken(this.raw, this._originalBytes)];
|
|
38
44
|
}
|
|
39
45
|
clone() {
|
|
40
|
-
return new PdfHexadecimal(new Uint8Array(this.raw)
|
|
46
|
+
return new PdfHexadecimal(new Uint8Array(this.raw), 'hex', this._originalBytes
|
|
47
|
+
? new Uint8Array(this._originalBytes)
|
|
48
|
+
: undefined);
|
|
41
49
|
}
|
|
42
50
|
}
|
|
@@ -6,9 +6,19 @@ export declare class PdfString extends PdfObject {
|
|
|
6
6
|
* The raw bytes of the PDF string.
|
|
7
7
|
*/
|
|
8
8
|
private _raw;
|
|
9
|
-
|
|
9
|
+
/**
|
|
10
|
+
* Original bytes from the PDF file, including parentheses and escape sequences.
|
|
11
|
+
* Used to preserve exact formatting for incremental updates.
|
|
12
|
+
*/
|
|
13
|
+
private _originalBytes?;
|
|
14
|
+
constructor(raw: ByteArray | string, originalBytes?: ByteArray);
|
|
10
15
|
get raw(): ByteArray;
|
|
11
16
|
set raw(raw: ByteArray);
|
|
17
|
+
/**
|
|
18
|
+
* Checks if this string is UTF-16BE encoded (has UTF-16BE BOM).
|
|
19
|
+
* UTF-16BE strings start with the byte order mark 0xFE 0xFF.
|
|
20
|
+
*/
|
|
21
|
+
get isUTF16BE(): boolean;
|
|
12
22
|
get value(): string;
|
|
13
23
|
protected tokenize(): PdfStringToken[];
|
|
14
24
|
clone(): this;
|
|
@@ -10,7 +10,12 @@ export class PdfString extends PdfObject {
|
|
|
10
10
|
* The raw bytes of the PDF string.
|
|
11
11
|
*/
|
|
12
12
|
_raw;
|
|
13
|
-
|
|
13
|
+
/**
|
|
14
|
+
* Original bytes from the PDF file, including parentheses and escape sequences.
|
|
15
|
+
* Used to preserve exact formatting for incremental updates.
|
|
16
|
+
*/
|
|
17
|
+
_originalBytes;
|
|
18
|
+
constructor(raw, originalBytes) {
|
|
14
19
|
super();
|
|
15
20
|
if (typeof raw === 'string') {
|
|
16
21
|
// Check if the string contains non-ASCII characters
|
|
@@ -26,6 +31,7 @@ export class PdfString extends PdfObject {
|
|
|
26
31
|
else {
|
|
27
32
|
this._raw = raw;
|
|
28
33
|
}
|
|
34
|
+
this._originalBytes = originalBytes;
|
|
29
35
|
}
|
|
30
36
|
get raw() {
|
|
31
37
|
return this._raw;
|
|
@@ -33,21 +39,30 @@ export class PdfString extends PdfObject {
|
|
|
33
39
|
set raw(raw) {
|
|
34
40
|
this.setModified();
|
|
35
41
|
this._raw = raw;
|
|
42
|
+
// Clear original bytes when modified
|
|
43
|
+
this._originalBytes = undefined;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Checks if this string is UTF-16BE encoded (has UTF-16BE BOM).
|
|
47
|
+
* UTF-16BE strings start with the byte order mark 0xFE 0xFF.
|
|
48
|
+
*/
|
|
49
|
+
get isUTF16BE() {
|
|
50
|
+
return (this.raw.length >= 2 && this.raw[0] === 0xfe && this.raw[1] === 0xff);
|
|
36
51
|
}
|
|
37
52
|
get value() {
|
|
38
53
|
// Check for UTF-16BE BOM (0xFE 0xFF)
|
|
39
|
-
if (this.
|
|
40
|
-
this.raw[0] === 0xfe &&
|
|
41
|
-
this.raw[1] === 0xff) {
|
|
54
|
+
if (this.isUTF16BE) {
|
|
42
55
|
return decodeFromUTF16BE(this.raw);
|
|
43
56
|
}
|
|
44
57
|
// Default: use PDFDocEncoding
|
|
45
58
|
return decodeFromPDFDocEncoding(this.raw);
|
|
46
59
|
}
|
|
47
60
|
tokenize() {
|
|
48
|
-
return [new PdfStringToken(this.raw)];
|
|
61
|
+
return [new PdfStringToken(this.raw, this._originalBytes)];
|
|
49
62
|
}
|
|
50
63
|
clone() {
|
|
51
|
-
return new PdfString(new Uint8Array(this.raw)
|
|
64
|
+
return new PdfString(new Uint8Array(this.raw), this._originalBytes
|
|
65
|
+
? new Uint8Array(this._originalBytes)
|
|
66
|
+
: undefined);
|
|
52
67
|
}
|
|
53
68
|
}
|
package/dist/core/tokeniser.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { assert } from '../utils/assert.js';
|
|
2
2
|
import { bytesToString } from '../utils/bytesToString.js';
|
|
3
|
+
import { unescapeString } from '../utils/unescapeString.js';
|
|
3
4
|
import { IncrementalParser } from './incremental-parser.js';
|
|
4
5
|
import { PdfBooleanToken } from './tokens/boolean-token.js';
|
|
5
6
|
import { PdfCommentToken } from './tokens/comment-token.js';
|
|
@@ -128,7 +129,8 @@ export class PdfByteStreamTokeniser extends IncrementalParser {
|
|
|
128
129
|
nameBytes.push(this.next());
|
|
129
130
|
byte = this.peek();
|
|
130
131
|
}
|
|
131
|
-
|
|
132
|
+
const name = bytesToString(new Uint8Array(nameBytes));
|
|
133
|
+
return new PdfNameToken(name);
|
|
132
134
|
}
|
|
133
135
|
nextDictionaryEndToken() {
|
|
134
136
|
this.expect(ByteMap.RIGHT_ANGLE_BRACKET);
|
|
@@ -136,6 +138,8 @@ export class PdfByteStreamTokeniser extends IncrementalParser {
|
|
|
136
138
|
return new PdfEndDictionaryToken();
|
|
137
139
|
}
|
|
138
140
|
nextHexadecimalToken() {
|
|
141
|
+
// Capture starting position (before the opening angle bracket)
|
|
142
|
+
const startIndex = this.bufferIndex;
|
|
139
143
|
this.expect(ByteMap.LEFT_ANGLE_BRACKET);
|
|
140
144
|
const hexBytes = [];
|
|
141
145
|
let byte = this.peek();
|
|
@@ -146,7 +150,10 @@ export class PdfByteStreamTokeniser extends IncrementalParser {
|
|
|
146
150
|
byte = this.peek();
|
|
147
151
|
}
|
|
148
152
|
this.expect(ByteMap.RIGHT_ANGLE_BRACKET);
|
|
149
|
-
|
|
153
|
+
// Capture original bytes including angle brackets for incremental updates
|
|
154
|
+
const endIndex = this.bufferIndex; // After the closing angle bracket
|
|
155
|
+
const originalBytes = new Uint8Array(this.buffer.slice(startIndex, endIndex));
|
|
156
|
+
return new PdfHexadecimalToken(new Uint8Array(hexBytes), originalBytes);
|
|
150
157
|
}
|
|
151
158
|
nextNumberToken() {
|
|
152
159
|
const numberBytes = [];
|
|
@@ -187,8 +194,11 @@ export class PdfByteStreamTokeniser extends IncrementalParser {
|
|
|
187
194
|
return new PdfEndArrayToken();
|
|
188
195
|
}
|
|
189
196
|
nextStringToken() {
|
|
197
|
+
// Capture starting position (before the opening parenthesis)
|
|
198
|
+
const startIndex = this.bufferIndex;
|
|
190
199
|
this.expect(ByteMap.LEFT_PARENTHESIS);
|
|
191
|
-
|
|
200
|
+
// Collect raw bytes until we find the matching closing parenthesis
|
|
201
|
+
const rawBytes = [];
|
|
192
202
|
let nesting = 1;
|
|
193
203
|
let inEscape = false;
|
|
194
204
|
while (inEscape || nesting > 0) {
|
|
@@ -196,87 +206,34 @@ export class PdfByteStreamTokeniser extends IncrementalParser {
|
|
|
196
206
|
if (byte === null) {
|
|
197
207
|
throw new Error('Unexpected end of input in string token');
|
|
198
208
|
}
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
break;
|
|
206
|
-
}
|
|
207
|
-
}
|
|
208
|
-
else if (byte === ByteMap.BACKSLASH || inEscape) {
|
|
209
|
-
inEscape = true;
|
|
210
|
-
const next = this.next();
|
|
211
|
-
let found = false;
|
|
212
|
-
if (this.inputOffset >= 829528) {
|
|
213
|
-
console.log('here', this.inputOffset, next);
|
|
214
|
-
found = true;
|
|
215
|
-
}
|
|
216
|
-
if (next === null) {
|
|
217
|
-
throw new Error('Unexpected end of input in string token');
|
|
209
|
+
// Add byte to rawBytes first (including the closing parenthesis)
|
|
210
|
+
rawBytes.push(byte);
|
|
211
|
+
// Track nesting level for proper parenthesis matching
|
|
212
|
+
if (!inEscape) {
|
|
213
|
+
if (byte === ByteMap.LEFT_PARENTHESIS) {
|
|
214
|
+
nesting++;
|
|
218
215
|
}
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
break; // \n
|
|
223
|
-
case ByteMap.r:
|
|
224
|
-
stringBytes.push(0x0d);
|
|
225
|
-
break; // \r
|
|
226
|
-
case ByteMap.t:
|
|
227
|
-
stringBytes.push(0x09);
|
|
228
|
-
break; // \t
|
|
229
|
-
case ByteMap.b:
|
|
230
|
-
stringBytes.push(0x08);
|
|
231
|
-
break; // \b
|
|
232
|
-
case ByteMap.f:
|
|
233
|
-
stringBytes.push(0x0c);
|
|
234
|
-
break; // \f
|
|
235
|
-
case ByteMap.LEFT_PARENTHESIS:
|
|
236
|
-
stringBytes.push(ByteMap.LEFT_PARENTHESIS);
|
|
237
|
-
break; // \(
|
|
238
|
-
case ByteMap.RIGHT_PARENTHESIS:
|
|
239
|
-
stringBytes.push(ByteMap.RIGHT_PARENTHESIS);
|
|
240
|
-
break; // \)
|
|
241
|
-
case ByteMap.BACKSLASH:
|
|
242
|
-
stringBytes.push(ByteMap.BACKSLASH);
|
|
243
|
-
break; // \\
|
|
244
|
-
case ByteMap.LINE_FEED: // Line feed
|
|
245
|
-
case ByteMap.CARRIAGE_RETURN: // Carriage return
|
|
246
|
-
stringBytes.push(next);
|
|
247
|
-
break;
|
|
248
|
-
default:
|
|
249
|
-
if (PdfByteStreamTokeniser.isOctet(next)) {
|
|
250
|
-
let octal = String.fromCharCode(next);
|
|
251
|
-
// Octal: up to 3 digits
|
|
252
|
-
const next2 = this.peek();
|
|
253
|
-
if (next2 === null) {
|
|
254
|
-
throw new Error('Unexpected end of input in string token');
|
|
255
|
-
}
|
|
256
|
-
if (PdfByteStreamTokeniser.isOctet(next2)) {
|
|
257
|
-
octal += String.fromCharCode(this.next());
|
|
258
|
-
}
|
|
259
|
-
const next3 = this.peek();
|
|
260
|
-
if (next3 === null) {
|
|
261
|
-
throw new Error('Unexpected end of input in string token');
|
|
262
|
-
}
|
|
263
|
-
if (PdfByteStreamTokeniser.isOctet(next3)) {
|
|
264
|
-
octal += String.fromCharCode(this.next());
|
|
265
|
-
}
|
|
266
|
-
stringBytes.push(parseInt(octal, 8));
|
|
267
|
-
}
|
|
268
|
-
else {
|
|
269
|
-
// If it's not a valid escape sequence, just add the next byte
|
|
270
|
-
stringBytes.push(next);
|
|
271
|
-
}
|
|
216
|
+
else if (byte === ByteMap.RIGHT_PARENTHESIS) {
|
|
217
|
+
nesting--;
|
|
218
|
+
if (nesting === 0) {
|
|
272
219
|
break;
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
else if (byte === ByteMap.BACKSLASH) {
|
|
223
|
+
inEscape = true;
|
|
273
224
|
}
|
|
225
|
+
}
|
|
226
|
+
else {
|
|
274
227
|
inEscape = false;
|
|
275
|
-
continue;
|
|
276
228
|
}
|
|
277
|
-
stringBytes.push(byte);
|
|
278
229
|
}
|
|
279
|
-
|
|
230
|
+
// Capture original bytes including parentheses for incremental updates
|
|
231
|
+
const endIndex = this.bufferIndex; // After the closing parenthesis
|
|
232
|
+
const originalBytes = new Uint8Array(this.buffer.slice(startIndex, endIndex));
|
|
233
|
+
// Use unescapeString utility to process escape sequences
|
|
234
|
+
// unescapeString expects bytes including the closing parenthesis
|
|
235
|
+
const unescapedBytes = unescapeString(new Uint8Array(rawBytes));
|
|
236
|
+
return new PdfStringToken(unescapedBytes, originalBytes);
|
|
280
237
|
}
|
|
281
238
|
nextEndObjectToken() {
|
|
282
239
|
this.expect(ByteMap.e);
|
|
@@ -2,6 +2,13 @@ import { ByteArray } from '../../types.js';
|
|
|
2
2
|
import { PdfToken } from './token.js';
|
|
3
3
|
export declare class PdfHexadecimalToken extends PdfToken {
|
|
4
4
|
raw: ByteArray;
|
|
5
|
-
|
|
5
|
+
/**
|
|
6
|
+
* Original bytes from the PDF file, including angle brackets.
|
|
7
|
+
* Used to preserve exact formatting for incremental updates.
|
|
8
|
+
* @internal - Non-enumerable to avoid affecting test comparisons
|
|
9
|
+
*/
|
|
10
|
+
private _originalBytes?;
|
|
11
|
+
constructor(hexadecimal: string | ByteArray, originalBytes?: ByteArray);
|
|
12
|
+
get originalBytes(): ByteArray | undefined;
|
|
6
13
|
private static toBytes;
|
|
7
14
|
}
|