@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF object parser.
|
|
3
|
+
*
|
|
4
|
+
* Parses PDF tokens into typed PDF objects: dictionaries, arrays, strings,
|
|
5
|
+
* numbers, booleans, names, null, indirect references, and streams.
|
|
6
|
+
*
|
|
7
|
+
* @see PDF Reference 1.7, Chapter 3 - Objects
|
|
8
|
+
*/
|
|
9
|
+
import type { Token, PdfTokenizer } from "./pdf-tokenizer.js";
|
|
10
|
+
/** A PDF indirect object reference: `N gen R` */
|
|
11
|
+
export interface PdfRef {
|
|
12
|
+
readonly type: "ref";
|
|
13
|
+
readonly objNum: number;
|
|
14
|
+
readonly gen: number;
|
|
15
|
+
}
|
|
16
|
+
/** A PDF stream: dictionary + raw data bytes */
|
|
17
|
+
export interface PdfStream {
|
|
18
|
+
readonly type: "stream";
|
|
19
|
+
readonly dict: PdfDictValue;
|
|
20
|
+
readonly data: Uint8Array;
|
|
21
|
+
}
|
|
22
|
+
/** A PDF dictionary: key-value pairs where keys are names */
|
|
23
|
+
export type PdfDictValue = Map<string, PdfObject>;
|
|
24
|
+
/** A PDF array */
|
|
25
|
+
export type PdfArrayValue = PdfObject[];
|
|
26
|
+
/**
|
|
27
|
+
* Union type for all possible PDF object values.
|
|
28
|
+
*/
|
|
29
|
+
export type PdfObject = number | string | boolean | null | Uint8Array | PdfRef | PdfDictValue | PdfArrayValue | PdfStream;
|
|
30
|
+
export declare function isPdfRef(obj: PdfObject | undefined): obj is PdfRef;
|
|
31
|
+
export declare function isPdfStream(obj: PdfObject | undefined): obj is PdfStream;
|
|
32
|
+
export declare function isPdfDict(obj: PdfObject | undefined): obj is PdfDictValue;
|
|
33
|
+
export declare function isPdfArray(obj: PdfObject | undefined): obj is PdfArrayValue;
|
|
34
|
+
/** Get a name value (returned as a string) from a PDF dictionary */
|
|
35
|
+
export declare function dictGetName(dict: PdfDictValue, key: string): string | undefined;
|
|
36
|
+
/** Get a number value from a PDF dictionary */
|
|
37
|
+
export declare function dictGetNumber(dict: PdfDictValue, key: string): number | undefined;
|
|
38
|
+
/** Get a boolean value from a PDF dictionary */
|
|
39
|
+
export declare function dictGetBool(dict: PdfDictValue, key: string): boolean | undefined;
|
|
40
|
+
/** Get a dictionary value from a PDF dictionary */
|
|
41
|
+
export declare function dictGetDict(dict: PdfDictValue, key: string): PdfDictValue | undefined;
|
|
42
|
+
/** Get an array value from a PDF dictionary */
|
|
43
|
+
export declare function dictGetArray(dict: PdfDictValue, key: string): PdfArrayValue | undefined;
|
|
44
|
+
/** Get a ref from a PDF dictionary */
|
|
45
|
+
export declare function dictGetRef(dict: PdfDictValue, key: string): PdfRef | undefined;
|
|
46
|
+
/** Get bytes (string as Uint8Array) from a PDF dictionary */
|
|
47
|
+
export declare function dictGetBytes(dict: PdfDictValue, key: string): Uint8Array | undefined;
|
|
48
|
+
/** Get a text value that may be either a name (string) or string bytes decoded via decodePdfStringBytes */
|
|
49
|
+
export declare function dictGetString(dict: PdfDictValue, key: string): string | undefined;
|
|
50
|
+
/**
|
|
51
|
+
* Decode PDF string bytes to a JavaScript string.
|
|
52
|
+
* Handles UTF-16BE (BOM = FEFF), UTF-8 (BOM = EFBBBF), and PDFDocEncoding (Latin-1 based, with some remapped code points).
|
|
53
|
+
*/
|
|
54
|
+
export declare function decodePdfStringBytes(bytes: Uint8Array): string;
|
|
55
|
+
/**
|
|
56
|
+
* Parse a single PDF object from the tokenizer.
|
|
57
|
+
*
|
|
58
|
+
* Handles all PDF object types including dictionaries (with possible streams),
|
|
59
|
+
* arrays, strings, numbers, names, booleans, null, and indirect references.
|
|
60
|
+
*/
|
|
61
|
+
export declare function parseObject(tokenizer: PdfTokenizer): PdfObject;
|
|
62
|
+
/**
|
|
63
|
+
* Parse a PDF object given the first token has already been consumed.
|
|
64
|
+
*/
|
|
65
|
+
export declare function parseObjectFromToken(tokenizer: PdfTokenizer, token: Token): PdfObject;
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF object parser.
|
|
3
|
+
*
|
|
4
|
+
* Parses PDF tokens into typed PDF objects: dictionaries, arrays, strings,
|
|
5
|
+
* numbers, booleans, names, null, indirect references, and streams.
|
|
6
|
+
*
|
|
7
|
+
* @see PDF Reference 1.7, Chapter 3 - Objects
|
|
8
|
+
*/
|
|
9
|
+
import { TokenType } from "./pdf-tokenizer.js";
|
|
10
|
+
import { PdfStructureError } from "../errors.js";
|
|
11
|
+
// =============================================================================
|
|
12
|
+
// Type Guards
|
|
13
|
+
// =============================================================================
|
|
14
|
+
/**
 * Type guard for indirect references.
 *
 * @param obj - Any parsed PDF value (or undefined)
 * @returns true only for non-null objects carrying `type === "ref"`
 */
export function isPdfRef(obj) {
    if (obj === null || typeof obj !== "object") {
        return false;
    }
    return "type" in obj && obj.type === "ref";
}
|
|
17
|
+
/**
 * Type guard for stream objects.
 *
 * @param obj - Any parsed PDF value (or undefined)
 * @returns true only for non-null objects carrying `type === "stream"`
 */
export function isPdfStream(obj) {
    if (obj === null || typeof obj !== "object") {
        return false;
    }
    return "type" in obj && obj.type === "stream";
}
|
|
20
|
+
/**
 * Type guard for dictionaries, which this parser represents as `Map` instances.
 *
 * @param obj - Any parsed PDF value (or undefined)
 * @returns true when `obj` is a Map
 */
export function isPdfDict(obj) {
    const isMap = obj instanceof Map;
    return isMap;
}
|
|
23
|
+
/**
 * Type guard for PDF arrays, which this parser represents as plain JS arrays.
 *
 * @param obj - Any parsed PDF value (or undefined)
 * @returns true when `obj` is an Array
 */
export function isPdfArray(obj) {
    const isList = Array.isArray(obj);
    return isList;
}
|
|
26
|
+
// =============================================================================
|
|
27
|
+
// Dictionary Helpers
|
|
28
|
+
// =============================================================================
|
|
29
|
+
/**
 * Look up `key` and return the entry only when it is a JS string.
 * (The parser in this file returns name tokens as plain strings.)
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The string entry, or undefined when absent or of another type
 */
export function dictGetName(dict, key) {
    const entry = dict.get(key);
    if (typeof entry !== "string") {
        return undefined;
    }
    return entry;
}
|
|
34
|
+
/**
 * Look up `key` and return the entry only when it is a JS number.
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The numeric entry, or undefined when absent or of another type
 */
export function dictGetNumber(dict, key) {
    const entry = dict.get(key);
    if (typeof entry !== "number") {
        return undefined;
    }
    return entry;
}
|
|
39
|
+
/**
 * Look up `key` and return the entry only when it is a JS boolean.
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The boolean entry, or undefined when absent or of another type
 */
export function dictGetBool(dict, key) {
    const entry = dict.get(key);
    if (typeof entry !== "boolean") {
        return undefined;
    }
    return entry;
}
|
|
44
|
+
/**
 * Look up `key` and return the entry only when `isPdfDict` accepts it.
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The nested dictionary, or undefined when absent or of another type
 */
export function dictGetDict(dict, key) {
    const entry = dict.get(key);
    if (!isPdfDict(entry)) {
        return undefined;
    }
    return entry;
}
|
|
49
|
+
/**
 * Look up `key` and return the entry only when `isPdfArray` accepts it.
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The array entry, or undefined when absent or of another type
 */
export function dictGetArray(dict, key) {
    const entry = dict.get(key);
    if (!isPdfArray(entry)) {
        return undefined;
    }
    return entry;
}
|
|
54
|
+
/**
 * Look up `key` and return the entry only when `isPdfRef` accepts it.
 * The reference is returned as stored; it is not resolved.
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The indirect reference, or undefined when absent or of another type
 */
export function dictGetRef(dict, key) {
    const entry = dict.get(key);
    if (!isPdfRef(entry)) {
        return undefined;
    }
    return entry;
}
|
|
59
|
+
/**
 * Look up `key` and return the entry only when it is a `Uint8Array`
 * (the form in which the parser returns literal and hex strings).
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The raw bytes, or undefined when absent or of another type
 */
export function dictGetBytes(dict, key) {
    const entry = dict.get(key);
    if (!(entry instanceof Uint8Array)) {
        return undefined;
    }
    return entry;
}
|
|
64
|
+
/**
 * Look up `key` and return it as text.
 *
 * A JS string entry is returned unchanged; a `Uint8Array` entry is decoded
 * with `decodePdfStringBytes`. Anything else yields undefined.
 *
 * @param dict - Dictionary to read from
 * @param key - Entry key
 * @returns The text value, or undefined when absent or of another type
 */
export function dictGetString(dict, key) {
    const entry = dict.get(key);
    return typeof entry === "string"
        ? entry
        : entry instanceof Uint8Array
            ? decodePdfStringBytes(entry)
            : undefined;
}
|
|
75
|
+
/**
 * Decode PDF text-string bytes to a JavaScript string.
 *
 * The encoding is selected from the leading byte-order mark:
 * - `FE FF`    -> UTF-16BE
 * - `FF FE`    -> UTF-16LE (not sanctioned by the PDF specification, but
 *                 written by some producers; accepted for robustness)
 * - `EF BB BF` -> UTF-8
 * - otherwise  -> PDFDocEncoding (via `decodePdfDocEncoding`)
 *
 * The BOM itself is never part of the result.
 *
 * @param bytes - Raw string bytes
 * @returns The decoded text
 */
export function decodePdfStringBytes(bytes) {
    if (bytes.length >= 2) {
        if (bytes[0] === 0xfe && bytes[1] === 0xff) {
            return decodeUtf16Bytes(bytes, 2, false);
        }
        if (bytes[0] === 0xff && bytes[1] === 0xfe) {
            return decodeUtf16Bytes(bytes, 2, true);
        }
    }
    if (bytes.length >= 3 && bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
        return new TextDecoder("utf-8").decode(bytes.subarray(3));
    }
    // No BOM: PDFDocEncoding, which matches Latin-1 except for a set of
    // remapped code points handled by decodePdfDocEncoding.
    return decodePdfDocEncoding(bytes);
}
/**
 * Decode UTF-16 code units from `bytes`, beginning at byte index `start`.
 *
 * Valid surrogate pairs are combined into one code point; an unpaired
 * surrogate is emitted as-is. A trailing odd byte is ignored.
 *
 * @param bytes - Source bytes
 * @param start - Index of the first code unit (just past the BOM)
 * @param littleEndian - Byte order of each 16-bit unit
 * @returns The decoded text
 */
function decodeUtf16Bytes(bytes, start, littleEndian) {
    const unitAt = (i) => littleEndian
        ? bytes[i] | (bytes[i + 1] << 8)
        : (bytes[i] << 8) | bytes[i + 1];
    let result = "";
    for (let i = start; i + 1 < bytes.length; i += 2) {
        const code = unitAt(i);
        // High surrogate with room for a following unit: try to pair it.
        if (code >= 0xd800 && code <= 0xdbff && i + 3 < bytes.length) {
            const low = unitAt(i + 2);
            if (low >= 0xdc00 && low <= 0xdfff) {
                result += String.fromCodePoint(0x10000 + ((code - 0xd800) << 10) + (low - 0xdc00));
                i += 2; // the low surrogate has been consumed as well
                continue;
            }
        }
        result += String.fromCharCode(code);
    }
    return result;
}
|
|
107
|
+
/**
 * Decode bytes as PDFDocEncoding.
 *
 * Each byte is looked up in `PDF_DOC_ENCODING`; a hit yields the mapped
 * code point, a miss yields the character whose code equals the byte.
 *
 * @param bytes - Raw string bytes
 * @returns The decoded text
 */
function decodePdfDocEncoding(bytes) {
    const chars = [];
    for (let idx = 0; idx < bytes.length; idx += 1) {
        const byte = bytes[idx];
        const override = PDF_DOC_ENCODING[byte];
        chars.push(override === undefined ? String.fromCharCode(byte) : String.fromCodePoint(override));
    }
    return chars.join("");
}
|
|
117
|
+
/**
 * PDFDocEncoding code points that differ from Latin-1 (ISO 8859-1).
 *
 * Keys are byte values; values are Unicode code points. A byte without an
 * entry decodes to the code point equal to the byte value. The deviations
 * are 0x18-0x1F (spacing accents) and 0x80-0x9E plus 0xA0. Bytes 0x9F and
 * 0xAD are undefined in PDFDocEncoding and deliberately have no entry
 * (caron lives at 0x19, not 0xAD).
 * @see PDF Reference 1.7, Table D.2
 */
const PDF_DOC_ENCODING = {
    0x18: 0x02d8, // breve
    0x19: 0x02c7, // caron
    0x1a: 0x02c6, // circumflex
    0x1b: 0x02d9, // dot accent
    0x1c: 0x02dd, // hungarumlaut
    0x1d: 0x02db, // ogonek
    0x1e: 0x02da, // ring
    0x1f: 0x02dc, // tilde
    0x80: 0x2022, // bullet
    0x81: 0x2020, // dagger
    0x82: 0x2021, // double dagger
    0x83: 0x2026, // ellipsis
    0x84: 0x2014, // em dash
    0x85: 0x2013, // en dash
    0x86: 0x0192, // florin
    0x87: 0x2044, // fraction slash
    0x88: 0x2039, // single left guillemet
    0x89: 0x203a, // single right guillemet
    0x8a: 0x2212, // minus
    0x8b: 0x2030, // per mille
    0x8c: 0x201e, // double low-9 quote
    0x8d: 0x201c, // left double quote
    0x8e: 0x201d, // right double quote
    0x8f: 0x2018, // left single quote
    0x90: 0x2019, // right single quote
    0x91: 0x201a, // single low-9 quote
    0x92: 0x2122, // trademark
    0x93: 0xfb01, // fi ligature
    0x94: 0xfb02, // fl ligature
    0x95: 0x0141, // L with stroke
    0x96: 0x0152, // OE ligature
    0x97: 0x0160, // S with caron
    0x98: 0x0178, // Y with diaeresis
    0x99: 0x017d, // Z with caron
    0x9a: 0x0131, // dotless i
    0x9b: 0x0142, // l with stroke
    0x9c: 0x0153, // oe ligature
    0x9d: 0x0161, // s with caron
    0x9e: 0x017e, // z with caron
    0xa0: 0x20ac // euro
};
|
|
156
|
+
// =============================================================================
|
|
157
|
+
// PDF Object Parser
|
|
158
|
+
// =============================================================================
|
|
159
|
+
/**
 * Read the next token from `tokenizer` and parse the PDF object that
 * starts with it.
 *
 * This is a convenience wrapper over `parseObjectFromToken`, which does
 * the actual work for every object type.
 *
 * @param tokenizer - Token source positioned at the start of an object
 * @returns The parsed PDF object
 */
export function parseObject(tokenizer) {
    const first = tokenizer.next();
    const parsed = parseObjectFromToken(tokenizer, first);
    return parsed;
}
|
|
169
|
+
/**
 * Parse a PDF object whose first token has already been read.
 *
 * - Number: looks ahead for `N gen R` (indirect reference) or
 *   `N gen obj ...` (indirect object definition); otherwise the tokenizer
 *   is rewound and the plain number is returned.
 * - Literal / hex string: the token's raw bytes (empty array if absent).
 * - Name: the name as a string. Boolean / null: the JS equivalent.
 * - `<<` / `[`: delegated to `parseDictionary` / `parseArray`.
 * - EOF: throws.
 * - Anything else (e.g. stray keywords): the token's `strValue`, or null
 *   when it has none, so the caller can decide what to do.
 *
 * @param tokenizer - Token source, positioned just after `token`
 * @param token - The already-consumed first token
 * @returns The parsed PDF object
 * @throws {PdfStructureError} When `token` is EOF
 */
export function parseObjectFromToken(tokenizer, token) {
    const kind = token.type;
    if (kind === TokenType.Number) {
        return parseNumberOrIndirect(tokenizer, token.numValue);
    }
    if (kind === TokenType.LiteralString || kind === TokenType.HexString) {
        return token.rawBytes ?? new Uint8Array(0);
    }
    if (kind === TokenType.Name) {
        return token.strValue;
    }
    if (kind === TokenType.Boolean) {
        return token.boolValue;
    }
    if (kind === TokenType.Null) {
        return null;
    }
    if (kind === TokenType.DictBegin) {
        return parseDictionary(tokenizer);
    }
    if (kind === TokenType.ArrayBegin) {
        return parseArray(tokenizer);
    }
    if (kind === TokenType.EOF) {
        throw new PdfStructureError("Unexpected end of input while parsing PDF object");
    }
    // Unexpected token in object context: hand its text back to the caller.
    return token.strValue ?? null;
}
/**
 * Resolve a leading number into a plain number, an indirect reference, or
 * the body of an indirect object definition.
 */
function parseNumberOrIndirect(tokenizer, num) {
    const rewindTo = tokenizer.pos;
    const second = tokenizer.next();
    if (second.type === TokenType.Number) {
        const gen = second.numValue;
        const third = tokenizer.next();
        if (third.type === TokenType.Keyword) {
            if (third.strValue === "R") {
                return { type: "ref", objNum: num, gen };
            }
            if (third.strValue === "obj") {
                return parseIndirectObjectBody(tokenizer);
            }
        }
    }
    // Neither `N gen R` nor `N gen obj`: undo the lookahead.
    tokenizer.pos = rewindTo;
    return num;
}
/**
 * Parse what follows `N gen obj`: the object itself, an optional stream
 * payload when the object is a dictionary, and a trailing `endobj` if present.
 */
function parseIndirectObjectBody(tokenizer) {
    const body = parseObject(tokenizer);
    if (isPdfDict(body)) {
        tokenizer.skipWhitespaceAndComments();
        const beforeKeyword = tokenizer.pos;
        const keyword = tokenizer.next();
        if (keyword.type === TokenType.Keyword && keyword.strValue === "stream") {
            // A non-numeric /Length is passed on as -1.
            const declaredLength = dictGetNumber(body, "Length") ?? -1;
            const data = tokenizer.readStreamContent(declaredLength);
            const trailer = tokenizer.next();
            if (!(trailer.type === TokenType.Keyword && trailer.strValue === "endobj")) {
                // Tolerate a missing endobj: leave the token for the caller.
                tokenizer.pos = trailer.offset;
            }
            return { type: "stream", dict: body, data };
        }
        // Plain dictionary, not a stream: un-read the peeked token.
        tokenizer.pos = beforeKeyword;
    }
    tokenizer.skipWhitespaceAndComments();
    const beforeEnd = tokenizer.pos;
    const closing = tokenizer.next();
    if (closing.type !== TokenType.Keyword || closing.strValue !== "endobj") {
        tokenizer.pos = beforeEnd;
    }
    return body;
}
|
|
246
|
+
/**
 * Parse the entries of a PDF dictionary; the opening `<<` must already
 * have been consumed.
 *
 * Tokens are read as key/value pairs until `>>`. A token in key position
 * that is not a name is dropped and parsing continues with the next token
 * (tolerance for malformed files). A repeated key keeps the last value.
 *
 * @param tokenizer - Token source positioned just after `<<`
 * @returns A Map from name to parsed value
 * @throws {PdfStructureError} When input ends before `>>`
 */
function parseDictionary(tokenizer) {
    const entries = new Map();
    for (let keyTok = tokenizer.next(); keyTok.type !== TokenType.DictEnd; keyTok = tokenizer.next()) {
        if (keyTok.type === TokenType.EOF) {
            throw new PdfStructureError("Unexpected EOF in dictionary");
        }
        if (keyTok.type === TokenType.Name) {
            entries.set(keyTok.strValue, parseObject(tokenizer));
        }
        // Any other token type in key position is skipped.
    }
    return entries;
}
|
|
269
|
+
/**
 * Parse the elements of a PDF array; the opening `[` must already have
 * been consumed.
 *
 * Each token up to `]` starts one element, parsed by `parseObjectFromToken`.
 *
 * @param tokenizer - Token source positioned just after `[`
 * @returns The parsed elements in order
 * @throws {PdfStructureError} When input ends before `]`
 */
function parseArray(tokenizer) {
    const items = [];
    let tok = tokenizer.next();
    while (tok.type !== TokenType.ArrayEnd) {
        if (tok.type === TokenType.EOF) {
            throw new PdfStructureError("Unexpected EOF in array");
        }
        items.push(parseObjectFromToken(tokenizer, tok));
        tok = tokenizer.next();
    }
    return items;
}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF reader — public API.
|
|
3
|
+
*
|
|
4
|
+
* Provides a high-level, zero-dependency interface for reading PDF files.
|
|
5
|
+
* Supports:
|
|
6
|
+
* - Text extraction with multilingual support (WinAnsi, MacRoman, CJK via
|
|
7
|
+
* ToUnicode CMap, Identity-H/V, Symbol, ZapfDingbats)
|
|
8
|
+
* - Image extraction (JPEG, JPEG2000, raw/Flate, CCITT, JBIG2)
|
|
9
|
+
* - Annotation extraction (links, comments, highlights, stamps, etc.)
|
|
10
|
+
* - Form field extraction (AcroForm: text inputs, checkboxes, radio buttons, dropdowns)
|
|
11
|
+
* - Metadata reading (Info dictionary + XMP)
|
|
12
|
+
* - Encrypted PDFs:
|
|
13
|
+
* - RC4 (40-bit and 128-bit) — tested via roundtrip
|
|
14
|
+
* - AES-128 (V=4, R=4) — implemented, requires external test fixtures
|
|
15
|
+
* - AES-256 (V=5, R=5) — implemented, requires external test fixtures
|
|
16
|
+
* - Cross-reference tables and streams (PDF 1.5+)
|
|
17
|
+
* - Incremental updates and xref recovery
|
|
18
|
+
*
|
|
19
|
+
* @example Basic text extraction:
|
|
20
|
+
* ```typescript
|
|
21
|
+
* import { readPdf } from "excelts/pdf";
|
|
22
|
+
*
|
|
23
|
+
* const pdf = readPdf(pdfBytes);
|
|
24
|
+
* console.log(pdf.text); // All text from all pages
|
|
25
|
+
* console.log(pdf.pages[0].text); // Text from page 1
|
|
26
|
+
* ```
|
|
27
|
+
*
|
|
28
|
+
* @example Image extraction:
|
|
29
|
+
* ```typescript
|
|
30
|
+
* const pdf = readPdf(pdfBytes);
|
|
31
|
+
* for (const image of pdf.pages[0].images) {
|
|
32
|
+
* console.log(image.format, image.width, image.height);
|
|
33
|
+
* fs.writeFileSync(`image.${image.format}`, image.data);
|
|
34
|
+
* }
|
|
35
|
+
* ```
|
|
36
|
+
*
|
|
37
|
+
* @example Metadata:
|
|
38
|
+
* ```typescript
|
|
39
|
+
* const pdf = readPdf(pdfBytes);
|
|
40
|
+
* console.log(pdf.metadata.title);
|
|
41
|
+
* console.log(pdf.metadata.author);
|
|
42
|
+
* console.log(pdf.metadata.pageCount);
|
|
43
|
+
* ```
|
|
44
|
+
*
|
|
45
|
+
* @example Encrypted PDF:
|
|
46
|
+
* ```typescript
|
|
47
|
+
* const pdf = readPdf(pdfBytes, { password: "secret" });
|
|
48
|
+
* ```
|
|
49
|
+
*/
|
|
50
|
+
import type { TextLine } from "./text-reconstruction.js";
|
|
51
|
+
import type { TextFragment } from "./content-interpreter.js";
|
|
52
|
+
import type { ExtractedImage } from "./image-extractor.js";
|
|
53
|
+
import type { PdfAnnotation } from "./annotation-extractor.js";
|
|
54
|
+
import type { PdfFormField } from "./form-extractor.js";
|
|
55
|
+
import type { PdfMetadata } from "./metadata-reader.js";
|
|
56
|
+
/**
|
|
57
|
+
* Options for reading a PDF.
|
|
58
|
+
*/
|
|
59
|
+
export interface ReadPdfOptions {
|
|
60
|
+
/**
|
|
61
|
+
* Password for encrypted PDFs.
|
|
62
|
+
* Can be either the user password or owner password.
|
|
63
|
+
* @default ""
|
|
64
|
+
*/
|
|
65
|
+
password?: string;
|
|
66
|
+
/**
|
|
67
|
+
* Which pages to extract (1-based).
|
|
68
|
+
* If omitted, all pages are extracted.
|
|
69
|
+
* @example [1, 3, 5] — extract pages 1, 3, and 5
|
|
70
|
+
*/
|
|
71
|
+
pages?: number[];
|
|
72
|
+
/**
|
|
73
|
+
* Whether to extract text.
|
|
74
|
+
* @default true
|
|
75
|
+
*/
|
|
76
|
+
extractText?: boolean;
|
|
77
|
+
/**
|
|
78
|
+
* Whether to extract images.
|
|
79
|
+
* @default true
|
|
80
|
+
*/
|
|
81
|
+
extractImages?: boolean;
|
|
82
|
+
/**
|
|
83
|
+
* Whether to extract metadata.
|
|
84
|
+
* @default true
|
|
85
|
+
*/
|
|
86
|
+
extractMetadata?: boolean;
|
|
87
|
+
/**
|
|
88
|
+
* Whether to extract annotations (links, comments, highlights, etc.).
|
|
89
|
+
* @default true
|
|
90
|
+
*/
|
|
91
|
+
extractAnnotations?: boolean;
|
|
92
|
+
/**
|
|
93
|
+
* Whether to extract form fields (AcroForm: text inputs, checkboxes, dropdowns, etc.).
|
|
94
|
+
* @default true
|
|
95
|
+
*/
|
|
96
|
+
extractFormFields?: boolean;
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* A single page from a read PDF.
|
|
100
|
+
*/
|
|
101
|
+
export interface ReadPdfPage {
|
|
102
|
+
/** 1-based page number */
|
|
103
|
+
pageNumber: number;
|
|
104
|
+
/** Extracted text content */
|
|
105
|
+
text: string;
|
|
106
|
+
/** Structured text lines with position information */
|
|
107
|
+
textLines: TextLine[];
|
|
108
|
+
/** Raw text fragments with exact positions */
|
|
109
|
+
textFragments: TextFragment[];
|
|
110
|
+
/** Extracted images */
|
|
111
|
+
images: ExtractedImage[];
|
|
112
|
+
/** Extracted annotations (links, comments, highlights, etc.) */
|
|
113
|
+
annotations: PdfAnnotation[];
|
|
114
|
+
/** Page width in points */
|
|
115
|
+
width: number;
|
|
116
|
+
/** Page height in points */
|
|
117
|
+
height: number;
|
|
118
|
+
/** Warnings encountered during extraction (non-fatal errors) */
|
|
119
|
+
warnings: string[];
|
|
120
|
+
}
|
|
121
|
+
/**
|
|
122
|
+
* Result of reading a PDF.
|
|
123
|
+
*/
|
|
124
|
+
export interface ReadPdfResult {
|
|
125
|
+
/** All text from all pages concatenated */
|
|
126
|
+
text: string;
|
|
127
|
+
/** Per-page results */
|
|
128
|
+
pages: ReadPdfPage[];
|
|
129
|
+
/** Document metadata */
|
|
130
|
+
metadata: PdfMetadata;
|
|
131
|
+
/** Form fields extracted from AcroForm (document-level) */
|
|
132
|
+
formFields: PdfFormField[];
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Read a PDF file and extract text, images, annotations, form fields, and metadata.
|
|
136
|
+
*
|
|
137
|
+
* @param data - Raw PDF file bytes
|
|
138
|
+
* @param options - Extraction options
|
|
139
|
+
* @returns Extracted content
|
|
140
|
+
* @throws {PdfStructureError} If the PDF structure is invalid
|
|
141
|
+
* @throws {PdfError} If decryption fails (wrong password)
|
|
142
|
+
*/
|
|
143
|
+
export declare function readPdf(data: Uint8Array, options?: ReadPdfOptions): ReadPdfResult;
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF reader — public API.
|
|
3
|
+
*
|
|
4
|
+
* Provides a high-level, zero-dependency interface for reading PDF files.
|
|
5
|
+
* Supports:
|
|
6
|
+
* - Text extraction with multilingual support (WinAnsi, MacRoman, CJK via
|
|
7
|
+
* ToUnicode CMap, Identity-H/V, Symbol, ZapfDingbats)
|
|
8
|
+
* - Image extraction (JPEG, JPEG2000, raw/Flate, CCITT, JBIG2)
|
|
9
|
+
* - Annotation extraction (links, comments, highlights, stamps, etc.)
|
|
10
|
+
* - Form field extraction (AcroForm: text inputs, checkboxes, radio buttons, dropdowns)
|
|
11
|
+
* - Metadata reading (Info dictionary + XMP)
|
|
12
|
+
* - Encrypted PDFs:
|
|
13
|
+
* - RC4 (40-bit and 128-bit) — tested via roundtrip
|
|
14
|
+
* - AES-128 (V=4, R=4) — implemented, requires external test fixtures
|
|
15
|
+
* - AES-256 (V=5, R=5) — implemented, requires external test fixtures
|
|
16
|
+
* - Cross-reference tables and streams (PDF 1.5+)
|
|
17
|
+
* - Incremental updates and xref recovery
|
|
18
|
+
*
|
|
19
|
+
* @example Basic text extraction:
|
|
20
|
+
* ```typescript
|
|
21
|
+
* import { readPdf } from "excelts/pdf";
|
|
22
|
+
*
|
|
23
|
+
* const pdf = readPdf(pdfBytes);
|
|
24
|
+
* console.log(pdf.text); // All text from all pages
|
|
25
|
+
* console.log(pdf.pages[0].text); // Text from page 1
|
|
26
|
+
* ```
|
|
27
|
+
*
|
|
28
|
+
* @example Image extraction:
|
|
29
|
+
* ```typescript
|
|
30
|
+
* const pdf = readPdf(pdfBytes);
|
|
31
|
+
* for (const image of pdf.pages[0].images) {
|
|
32
|
+
* console.log(image.format, image.width, image.height);
|
|
33
|
+
* fs.writeFileSync(`image.${image.format}`, image.data);
|
|
34
|
+
* }
|
|
35
|
+
* ```
|
|
36
|
+
*
|
|
37
|
+
* @example Metadata:
|
|
38
|
+
* ```typescript
|
|
39
|
+
* const pdf = readPdf(pdfBytes);
|
|
40
|
+
* console.log(pdf.metadata.title);
|
|
41
|
+
* console.log(pdf.metadata.author);
|
|
42
|
+
* console.log(pdf.metadata.pageCount);
|
|
43
|
+
* ```
|
|
44
|
+
*
|
|
45
|
+
* @example Encrypted PDF:
|
|
46
|
+
* ```typescript
|
|
47
|
+
* const pdf = readPdf(pdfBytes, { password: "secret" });
|
|
48
|
+
* ```
|
|
49
|
+
*/
|
|
50
|
+
import { PdfDocument } from "./pdf-document.js";
|
|
51
|
+
import { initDecryption, isEncrypted } from "./pdf-decrypt.js";
|
|
52
|
+
import { extractTextFromPage } from "./content-interpreter.js";
|
|
53
|
+
import { reconstructText, reconstructTextLines } from "./text-reconstruction.js";
|
|
54
|
+
import { extractImagesFromPage } from "./image-extractor.js";
|
|
55
|
+
import { extractAnnotationsFromPage } from "./annotation-extractor.js";
|
|
56
|
+
import { extractFormFields } from "./form-extractor.js";
|
|
57
|
+
import { extractMetadata } from "./metadata-reader.js";
|
|
58
|
+
import { PdfStructureError } from "../errors.js";
|
|
59
|
+
// =============================================================================
|
|
60
|
+
// Public API
|
|
61
|
+
// =============================================================================
|
|
62
|
+
/**
|
|
63
|
+
* Read a PDF file and extract text, images, annotations, form fields, and metadata.
|
|
64
|
+
*
|
|
65
|
+
* @param data - Raw PDF file bytes
|
|
66
|
+
* @param options - Extraction options
|
|
67
|
+
* @returns Extracted content
|
|
68
|
+
* @throws {PdfStructureError} If the PDF structure is invalid
|
|
69
|
+
* @throws {PdfError} If decryption fails (wrong password)
|
|
70
|
+
*/
|
|
71
|
+
export function readPdf(data, options) {
|
|
72
|
+
const opts = {
|
|
73
|
+
password: options?.password ?? "",
|
|
74
|
+
pages: options?.pages,
|
|
75
|
+
extractText: options?.extractText ?? true,
|
|
76
|
+
extractImages: options?.extractImages ?? true,
|
|
77
|
+
extractMetadata: options?.extractMetadata ?? true,
|
|
78
|
+
extractAnnotations: options?.extractAnnotations ?? true,
|
|
79
|
+
extractFormFields: options?.extractFormFields ?? true
|
|
80
|
+
};
|
|
81
|
+
// Parse document structure
|
|
82
|
+
const doc = new PdfDocument(data);
|
|
83
|
+
// Handle encryption
|
|
84
|
+
if (isEncrypted(doc)) {
|
|
85
|
+
const success = initDecryption(doc, opts.password);
|
|
86
|
+
if (!success) {
|
|
87
|
+
throw new PdfStructureError("Failed to decrypt PDF: incorrect password");
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
// Extract metadata
|
|
91
|
+
const metadata = opts.extractMetadata ? extractMetadata(doc) : createEmptyMetadata();
|
|
92
|
+
// Get pages (with object identity for correct decryption)
|
|
93
|
+
const pagesInfo = doc.getPagesWithObjInfo();
|
|
94
|
+
const pageIndicesToProcess = opts.pages
|
|
95
|
+
? opts.pages.map(p => p - 1).filter(p => p >= 0 && p < pagesInfo.length)
|
|
96
|
+
: Array.from({ length: pagesInfo.length }, (_, i) => i);
|
|
97
|
+
// Process each page
|
|
98
|
+
const pages = [];
|
|
99
|
+
for (const pageIdx of pageIndicesToProcess) {
|
|
100
|
+
const { dict: pageDict } = pagesInfo[pageIdx];
|
|
101
|
+
const pageNumber = pageIdx + 1;
|
|
102
|
+
const warnings = [];
|
|
103
|
+
// Extract text
|
|
104
|
+
let text = "";
|
|
105
|
+
let textLines = [];
|
|
106
|
+
let textFragments = [];
|
|
107
|
+
if (opts.extractText) {
|
|
108
|
+
try {
|
|
109
|
+
textFragments = extractTextFromPage(pageDict, doc);
|
|
110
|
+
text = reconstructText(textFragments);
|
|
111
|
+
textLines = reconstructTextLines(textFragments);
|
|
112
|
+
}
|
|
113
|
+
catch (err) {
|
|
114
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
115
|
+
warnings.push(`Text extraction failed on page ${pageNumber}: ${msg}`);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
// Extract images
|
|
119
|
+
let images = [];
|
|
120
|
+
if (opts.extractImages) {
|
|
121
|
+
try {
|
|
122
|
+
images = extractImagesFromPage(pageDict, doc);
|
|
123
|
+
}
|
|
124
|
+
catch (err) {
|
|
125
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
126
|
+
warnings.push(`Image extraction failed on page ${pageNumber}: ${msg}`);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
// Extract annotations
|
|
130
|
+
let annotations = [];
|
|
131
|
+
if (opts.extractAnnotations) {
|
|
132
|
+
try {
|
|
133
|
+
annotations = extractAnnotationsFromPage(pageDict, doc);
|
|
134
|
+
}
|
|
135
|
+
catch (err) {
|
|
136
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
137
|
+
warnings.push(`Annotation extraction failed on page ${pageNumber}: ${msg}`);
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
// Get page dimensions
|
|
141
|
+
const { width, height } = getPageDimensions(pageDict, doc);
|
|
142
|
+
pages.push({
|
|
143
|
+
pageNumber,
|
|
144
|
+
text,
|
|
145
|
+
textLines,
|
|
146
|
+
textFragments,
|
|
147
|
+
images,
|
|
148
|
+
annotations,
|
|
149
|
+
width,
|
|
150
|
+
height,
|
|
151
|
+
warnings
|
|
152
|
+
});
|
|
153
|
+
}
|
|
154
|
+
// Concatenate all page text
|
|
155
|
+
const allText = pages.map(p => p.text).join("\n\n");
|
|
156
|
+
// Update page count in metadata
|
|
157
|
+
if (opts.extractMetadata) {
|
|
158
|
+
metadata.pageCount = pagesInfo.length;
|
|
159
|
+
}
|
|
160
|
+
// Extract form fields (document-level, not per-page)
|
|
161
|
+
let formFields = [];
|
|
162
|
+
if (opts.extractFormFields) {
|
|
163
|
+
try {
|
|
164
|
+
formFields = extractFormFields(doc);
|
|
165
|
+
}
|
|
166
|
+
catch {
|
|
167
|
+
// Non-fatal — just return empty
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
return {
|
|
171
|
+
text: allText,
|
|
172
|
+
pages,
|
|
173
|
+
metadata,
|
|
174
|
+
formFields
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
// =============================================================================
|
|
178
|
+
// Helpers
|
|
179
|
+
// =============================================================================
|
|
180
|
+
/**
 * Resolve the size of a page, falling back to US Letter when the document
 * does not provide one.
 *
 * @param pageDict - Page dictionary to resolve the box for
 * @param doc - Document exposing `resolvePageBox(pageDict)`
 * @returns Whatever `doc.resolvePageBox` returns, or `{ width: 612, height: 792 }`
 *   when that result is `null` or `undefined`
 */
function getPageDimensions(pageDict, doc) {
    const resolvedBox = doc.resolvePageBox(pageDict);
    if (resolvedBox === null || resolvedBox === undefined) {
        // Default: US Letter
        return { width: 612, height: 792 };
    }
    return resolvedBox;
}
|
|
183
|
+
/**
 * Build a blank metadata record: empty strings for the textual fields, `null`
 * for the date, page-size and XMP fields, a zero page count, `encrypted: false`,
 * and a fresh empty `custom` object on every call.
 *
 * @returns A new metadata object with every field set to its empty value
 */
function createEmptyMetadata() {
    const textFieldNames = ["title", "author", "subject", "keywords", "creator", "producer"];
    const emptyMetadata = {};
    for (const fieldName of textFieldNames) {
        emptyMetadata[fieldName] = "";
    }
    emptyMetadata.creationDate = null;
    emptyMetadata.modDate = null;
    emptyMetadata.pdfVersion = "";
    emptyMetadata.pageCount = 0;
    emptyMetadata.encrypted = false;
    emptyMetadata.pageSize = null;
    emptyMetadata.xmpXml = null;
    // New object per call so callers never share a mutable `custom` map.
    emptyMetadata.custom = {};
    return emptyMetadata;
}
|