@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF document parser.
|
|
3
|
+
*
|
|
4
|
+
* Handles the high-level PDF file structure:
|
|
5
|
+
* - Locating startxref
|
|
6
|
+
* - Parsing cross-reference tables (traditional and stream-based)
|
|
7
|
+
* - Reading trailer dictionaries
|
|
8
|
+
* - Resolving indirect object references
|
|
9
|
+
* - Handling incremental updates
|
|
10
|
+
*
|
|
11
|
+
* @see PDF Reference 1.7, §3.4 - File Structure
|
|
12
|
+
*/
|
|
13
|
+
import type { PdfObject, PdfDictValue, PdfRef, PdfStream } from "./pdf-parser.js";
|
|
14
|
+
/** Result of resolving an object with its object/generation numbers for decryption */
|
|
15
|
+
interface ResolvedObject {
|
|
16
|
+
/** The resolved PDF object */
|
|
17
|
+
obj: PdfObject | null;
|
|
18
|
+
/** The object number */
|
|
19
|
+
objNum: number;
|
|
20
|
+
/** The generation number */
|
|
21
|
+
gen: number;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Parsed PDF document with lazy object resolution.
|
|
25
|
+
*
|
|
26
|
+
* Reads the cross-reference table and trailer on construction,
|
|
27
|
+
* then resolves individual objects on demand with caching.
|
|
28
|
+
*/
|
|
29
|
+
export declare class PdfDocument {
|
|
30
|
+
private tokenizer;
|
|
31
|
+
private xref;
|
|
32
|
+
private cache;
|
|
33
|
+
readonly trailer: PdfDictValue;
|
|
34
|
+
/** Encryption handler (set externally after decryption is initialized) */
|
|
35
|
+
decryptFn: ((data: Uint8Array, objNum: number, gen: number) => Uint8Array) | null;
|
|
36
|
+
constructor(data: Uint8Array);
|
|
37
|
+
/** Get the underlying raw data */
|
|
38
|
+
get data(): Uint8Array;
|
|
39
|
+
private parseFileStructure;
|
|
40
|
+
/**
|
|
41
|
+
* Find the startxref offset by scanning backward from EOF.
|
|
42
|
+
*/
|
|
43
|
+
private findStartxref;
|
|
44
|
+
/**
|
|
45
|
+
* Parse the xref chain starting at the given offset.
|
|
46
|
+
* Follows /Prev links for incremental updates.
|
|
47
|
+
* Returns the merged trailer dictionary.
|
|
48
|
+
*/
|
|
49
|
+
private parseXrefChain;
|
|
50
|
+
/**
|
|
51
|
+
* Parse a traditional xref table and its trailer.
|
|
52
|
+
*/
|
|
53
|
+
private parseTraditionalXref;
|
|
54
|
+
/**
|
|
55
|
+
* Parse a cross-reference stream (PDF 1.5+).
|
|
56
|
+
*/
|
|
57
|
+
private parseXrefStream;
|
|
58
|
+
/**
|
|
59
|
+
* Reconstruct the xref table by scanning the entire file for `N N obj` patterns.
|
|
60
|
+
* This is a fallback for corrupted or broken PDFs where the normal xref parsing fails.
|
|
61
|
+
*
|
|
62
|
+
* @returns A synthetic trailer dictionary
|
|
63
|
+
*/
|
|
64
|
+
private reconstructXref;
|
|
65
|
+
/**
|
|
66
|
+
* Merge trailer entries from an older trailer into the current one.
|
|
67
|
+
* Only adds keys that don't already exist.
|
|
68
|
+
*/
|
|
69
|
+
private mergeTrailer;
|
|
70
|
+
/**
|
|
71
|
+
* Resolve a PDF object by its object number and generation.
|
|
72
|
+
* Returns null if the object doesn't exist.
|
|
73
|
+
*/
|
|
74
|
+
resolve(objNum: number, gen?: number): PdfObject | null;
|
|
75
|
+
/**
|
|
76
|
+
* Resolve a PDF object and return it along with its object/generation numbers.
|
|
77
|
+
* Useful for tracking which object a value came from (for decryption).
|
|
78
|
+
*
|
|
79
|
+
* @param objNum - The object number to resolve
|
|
80
|
+
* @param gen - The generation number (default 0)
|
|
81
|
+
* @returns The resolved object with its objNum and gen for decryption context
|
|
82
|
+
*/
|
|
83
|
+
resolveWithObjNum(objNum: number, gen?: number): ResolvedObject;
|
|
84
|
+
/**
|
|
85
|
+
* Dereference a PdfRef to its actual object value.
|
|
86
|
+
* If the input is not a PdfRef, returns it as-is.
|
|
87
|
+
*/
|
|
88
|
+
deref(obj: PdfObject | null | undefined): PdfObject | null;
|
|
89
|
+
/**
|
|
90
|
+
* Dereference a PdfRef and assert it's a dictionary.
|
|
91
|
+
*/
|
|
92
|
+
derefDict(obj: PdfObject | null | undefined): PdfDictValue | null;
|
|
93
|
+
/**
|
|
94
|
+
* Dereference a PdfRef and get the stream (returned without its objNum/gen).
|
|
95
|
+
* Use derefStreamWithObjNum when those are needed for correct per-object decryption.
|
|
96
|
+
*/
|
|
97
|
+
derefStream(obj: PdfObject | null | undefined): PdfStream | null;
|
|
98
|
+
/**
|
|
99
|
+
* Dereference a PdfRef and get the stream with its object number and generation.
|
|
100
|
+
* Returns null if the object is not a stream.
|
|
101
|
+
* The objNum/gen are needed for correct per-object decryption (V1-V4).
|
|
102
|
+
*/
|
|
103
|
+
derefStreamWithObjNum(obj: PdfObject | null | undefined): {
|
|
104
|
+
stream: PdfStream;
|
|
105
|
+
objNum: number;
|
|
106
|
+
gen: number;
|
|
107
|
+
} | null;
|
|
108
|
+
/**
|
|
109
|
+
* Get decoded stream data from a stream object.
|
|
110
|
+
* Applies filter chain decoding and decryption.
|
|
111
|
+
*
|
|
112
|
+
* When objNum/gen are not provided (default 0), decryption may not
|
|
113
|
+
* produce correct results. Use {@link resolveWithObjNum} to obtain
|
|
114
|
+
* the correct objNum/gen for the stream's containing object.
|
|
115
|
+
*/
|
|
116
|
+
getStreamData(stream: PdfStream, objNum?: number, gen?: number): Uint8Array;
|
|
117
|
+
/**
|
|
118
|
+
* Decrypt a string value (bytes) if encryption is active.
|
|
119
|
+
*/
|
|
120
|
+
decryptString(bytes: Uint8Array, objNum: number, gen: number): Uint8Array;
|
|
121
|
+
/**
|
|
122
|
+
* Decode a PDF string to a JS string, with optional decryption.
|
|
123
|
+
*/
|
|
124
|
+
decodeString(bytes: Uint8Array, objNum?: number, gen?: number): string;
|
|
125
|
+
/**
|
|
126
|
+
* Recursively decrypt all string values (Uint8Array) within a parsed PDF object.
|
|
127
|
+
* PDF spec requires all strings in an encrypted document to be decrypted using
|
|
128
|
+
* the per-object key derived from the containing object's objNum/gen.
|
|
129
|
+
* Streams are NOT decrypted here — they are decrypted in getStreamData().
|
|
130
|
+
*/
|
|
131
|
+
private decryptObjectStrings;
|
|
132
|
+
/**
|
|
133
|
+
* Get the catalog dictionary (the root of the document structure).
|
|
134
|
+
*/
|
|
135
|
+
getCatalog(): PdfDictValue;
|
|
136
|
+
/**
|
|
137
|
+
* Get the pages array from the page tree.
|
|
138
|
+
* Returns an array of page dictionaries in order.
|
|
139
|
+
*/
|
|
140
|
+
getPages(): PdfDictValue[];
|
|
141
|
+
/**
|
|
142
|
+
* Get pages with their object numbers (needed for correct decryption of
|
|
143
|
+
* inline streams within page objects).
|
|
144
|
+
*/
|
|
145
|
+
getPagesWithObjInfo(): Array<{
|
|
146
|
+
dict: PdfDictValue;
|
|
147
|
+
objNum: number;
|
|
148
|
+
gen: number;
|
|
149
|
+
}>;
|
|
150
|
+
/**
|
|
151
|
+
* Recursively collect page dictionaries from the page tree.
|
|
152
|
+
* Uses a visited set to prevent infinite recursion on cyclic page trees.
|
|
153
|
+
*/
|
|
154
|
+
private collectPages;
|
|
155
|
+
/**
|
|
156
|
+
* Get the object number for a given object reference.
|
|
157
|
+
* Useful for tracking which object a value came from (for decryption).
|
|
158
|
+
*/
|
|
159
|
+
getObjNumForRef(ref: PdfRef): number;
|
|
160
|
+
/**
|
|
161
|
+
* Parse an object definition at the given byte offset.
|
|
162
|
+
*/
|
|
163
|
+
private parseObjectAt;
|
|
164
|
+
/**
|
|
165
|
+
* Parse a compressed object from an object stream.
|
|
166
|
+
* @param objStmNum - The object number of the object stream
|
|
167
|
+
* @param index - The index of the object within the stream
|
|
168
|
+
*/
|
|
169
|
+
private parseCompressedObject;
|
|
170
|
+
/**
|
|
171
|
+
* Parse all objects from an object stream.
|
|
172
|
+
* @returns Map of object number → object value
|
|
173
|
+
*/
|
|
174
|
+
private parseObjectStream;
|
|
175
|
+
/**
|
|
176
|
+
* Resolve a page's bounding box (MediaBox/CropBox) with indirect ref resolution
|
|
177
|
+
* and parent inheritance. Returns `{ width, height }` or null if no box found.
|
|
178
|
+
*
|
|
179
|
+
* This is a shared helper so callers don't duplicate box resolution logic.
|
|
180
|
+
*/
|
|
181
|
+
resolvePageBox(pageDict: PdfDictValue, visited?: Set<PdfDictValue>): {
|
|
182
|
+
width: number;
|
|
183
|
+
height: number;
|
|
184
|
+
} | null;
|
|
185
|
+
/**
|
|
186
|
+
* Resolve a page's Resources dictionary, inheriting from parent pages if needed.
|
|
187
|
+
* Protected against cyclic parent chains.
|
|
188
|
+
*/
|
|
189
|
+
resolvePageResources(pageDict: PdfDictValue, visited?: Set<PdfDictValue>): PdfDictValue;
|
|
190
|
+
}
|
|
191
|
+
export {};
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF object parser.
|
|
3
|
+
*
|
|
4
|
+
* Parses PDF tokens into typed PDF objects: dictionaries, arrays, strings,
|
|
5
|
+
* numbers, booleans, names, null, indirect references, and streams.
|
|
6
|
+
*
|
|
7
|
+
* @see PDF Reference 1.7, Chapter 3 - Objects
|
|
8
|
+
*/
|
|
9
|
+
import type { Token, PdfTokenizer } from "./pdf-tokenizer.js";
|
|
10
|
+
/** A PDF indirect object reference: `N gen R` */
|
|
11
|
+
export interface PdfRef {
|
|
12
|
+
readonly type: "ref";
|
|
13
|
+
readonly objNum: number;
|
|
14
|
+
readonly gen: number;
|
|
15
|
+
}
|
|
16
|
+
/** A PDF stream: dictionary + raw data bytes */
|
|
17
|
+
export interface PdfStream {
|
|
18
|
+
readonly type: "stream";
|
|
19
|
+
readonly dict: PdfDictValue;
|
|
20
|
+
readonly data: Uint8Array;
|
|
21
|
+
}
|
|
22
|
+
/** A PDF dictionary: key-value pairs where keys are names */
|
|
23
|
+
export type PdfDictValue = Map<string, PdfObject>;
|
|
24
|
+
/** A PDF array */
|
|
25
|
+
export type PdfArrayValue = PdfObject[];
|
|
26
|
+
/**
|
|
27
|
+
* Union type for all possible PDF object values.
|
|
28
|
+
*/
|
|
29
|
+
export type PdfObject = number | string | boolean | null | Uint8Array | PdfRef | PdfDictValue | PdfArrayValue | PdfStream;
|
|
30
|
+
export declare function isPdfRef(obj: PdfObject | undefined): obj is PdfRef;
|
|
31
|
+
export declare function isPdfStream(obj: PdfObject | undefined): obj is PdfStream;
|
|
32
|
+
export declare function isPdfDict(obj: PdfObject | undefined): obj is PdfDictValue;
|
|
33
|
+
export declare function isPdfArray(obj: PdfObject | undefined): obj is PdfArrayValue;
|
|
34
|
+
/** Get a name value (as a string) from a PDF dictionary */
|
|
35
|
+
export declare function dictGetName(dict: PdfDictValue, key: string): string | undefined;
|
|
36
|
+
/** Get a number value from a PDF dictionary */
|
|
37
|
+
export declare function dictGetNumber(dict: PdfDictValue, key: string): number | undefined;
|
|
38
|
+
/** Get a boolean value from a PDF dictionary */
|
|
39
|
+
export declare function dictGetBool(dict: PdfDictValue, key: string): boolean | undefined;
|
|
40
|
+
/** Get a dictionary value from a PDF dictionary */
|
|
41
|
+
export declare function dictGetDict(dict: PdfDictValue, key: string): PdfDictValue | undefined;
|
|
42
|
+
/** Get an array value from a PDF dictionary */
|
|
43
|
+
export declare function dictGetArray(dict: PdfDictValue, key: string): PdfArrayValue | undefined;
|
|
44
|
+
/** Get a ref from a PDF dictionary */
|
|
45
|
+
export declare function dictGetRef(dict: PdfDictValue, key: string): PdfRef | undefined;
|
|
46
|
+
/** Get bytes (string as Uint8Array) from a PDF dictionary */
|
|
47
|
+
export declare function dictGetBytes(dict: PdfDictValue, key: string): Uint8Array | undefined;
|
|
48
|
+
/** Get a string value that may be either a name (string) or bytes decoded as latin1 */
|
|
49
|
+
export declare function dictGetString(dict: PdfDictValue, key: string): string | undefined;
|
|
50
|
+
/**
|
|
51
|
+
* Decode PDF string bytes to a JavaScript string.
|
|
52
|
+
* Handles UTF-16BE (BOM = FEFF) and PDFDocEncoding (Latin-1 superset).
|
|
53
|
+
*/
|
|
54
|
+
export declare function decodePdfStringBytes(bytes: Uint8Array): string;
|
|
55
|
+
/**
|
|
56
|
+
* Parse a single PDF object from the tokenizer.
|
|
57
|
+
*
|
|
58
|
+
* Handles all PDF object types including dictionaries (with possible streams),
|
|
59
|
+
* arrays, strings, numbers, names, booleans, null, and indirect references.
|
|
60
|
+
*/
|
|
61
|
+
export declare function parseObject(tokenizer: PdfTokenizer): PdfObject;
|
|
62
|
+
/**
|
|
63
|
+
* Parse a PDF object given the first token has already been consumed.
|
|
64
|
+
*/
|
|
65
|
+
export declare function parseObjectFromToken(tokenizer: PdfTokenizer, token: Token): PdfObject;
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF reader — public API.
|
|
3
|
+
*
|
|
4
|
+
* Provides a high-level, zero-dependency interface for reading PDF files.
|
|
5
|
+
* Supports:
|
|
6
|
+
* - Text extraction with multilingual support (WinAnsi, MacRoman, CJK via
|
|
7
|
+
* ToUnicode CMap, Identity-H/V, Symbol, ZapfDingbats)
|
|
8
|
+
* - Image extraction (JPEG, JPEG2000, raw/Flate, CCITT, JBIG2)
|
|
9
|
+
* - Annotation extraction (links, comments, highlights, stamps, etc.)
|
|
10
|
+
* - Form field extraction (AcroForm: text inputs, checkboxes, radio buttons, dropdowns)
|
|
11
|
+
* - Metadata reading (Info dictionary + XMP)
|
|
12
|
+
* - Encrypted PDFs:
|
|
13
|
+
* - RC4 (40-bit and 128-bit) — tested via roundtrip
|
|
14
|
+
* - AES-128 (V=4, R=4) — implemented, requires external test fixtures
|
|
15
|
+
* - AES-256 (V=5, R=5) — implemented, requires external test fixtures
|
|
16
|
+
* - Cross-reference tables and streams (PDF 1.5+)
|
|
17
|
+
* - Incremental updates and xref recovery
|
|
18
|
+
*
|
|
19
|
+
* @example Basic text extraction:
|
|
20
|
+
* ```typescript
|
|
21
|
+
* import { readPdf } from "excelts/pdf";
|
|
22
|
+
*
|
|
23
|
+
* const pdf = readPdf(pdfBytes);
|
|
24
|
+
* console.log(pdf.text); // All text from all pages
|
|
25
|
+
* console.log(pdf.pages[0].text); // Text from page 1
|
|
26
|
+
* ```
|
|
27
|
+
*
|
|
28
|
+
* @example Image extraction:
|
|
29
|
+
* ```typescript
|
|
30
|
+
* const pdf = readPdf(pdfBytes);
|
|
31
|
+
* for (const image of pdf.pages[0].images) {
|
|
32
|
+
* console.log(image.format, image.width, image.height);
|
|
33
|
+
* fs.writeFileSync(`image.${image.format}`, image.data);
|
|
34
|
+
* }
|
|
35
|
+
* ```
|
|
36
|
+
*
|
|
37
|
+
* @example Metadata:
|
|
38
|
+
* ```typescript
|
|
39
|
+
* const pdf = readPdf(pdfBytes);
|
|
40
|
+
* console.log(pdf.metadata.title);
|
|
41
|
+
* console.log(pdf.metadata.author);
|
|
42
|
+
* console.log(pdf.metadata.pageCount);
|
|
43
|
+
* ```
|
|
44
|
+
*
|
|
45
|
+
* @example Encrypted PDF:
|
|
46
|
+
* ```typescript
|
|
47
|
+
* const pdf = readPdf(pdfBytes, { password: "secret" });
|
|
48
|
+
* ```
|
|
49
|
+
*/
|
|
50
|
+
import type { TextLine } from "./text-reconstruction.js";
|
|
51
|
+
import type { TextFragment } from "./content-interpreter.js";
|
|
52
|
+
import type { ExtractedImage } from "./image-extractor.js";
|
|
53
|
+
import type { PdfAnnotation } from "./annotation-extractor.js";
|
|
54
|
+
import type { PdfFormField } from "./form-extractor.js";
|
|
55
|
+
import type { PdfMetadata } from "./metadata-reader.js";
|
|
56
|
+
/**
|
|
57
|
+
* Options for reading a PDF.
|
|
58
|
+
*/
|
|
59
|
+
export interface ReadPdfOptions {
|
|
60
|
+
/**
|
|
61
|
+
* Password for encrypted PDFs.
|
|
62
|
+
* Can be either the user password or owner password.
|
|
63
|
+
* @default ""
|
|
64
|
+
*/
|
|
65
|
+
password?: string;
|
|
66
|
+
/**
|
|
67
|
+
* Which pages to extract (1-based).
|
|
68
|
+
* If omitted, all pages are extracted.
|
|
69
|
+
* @example [1, 3, 5] — extract pages 1, 3, and 5
|
|
70
|
+
*/
|
|
71
|
+
pages?: number[];
|
|
72
|
+
/**
|
|
73
|
+
* Whether to extract text.
|
|
74
|
+
* @default true
|
|
75
|
+
*/
|
|
76
|
+
extractText?: boolean;
|
|
77
|
+
/**
|
|
78
|
+
* Whether to extract images.
|
|
79
|
+
* @default true
|
|
80
|
+
*/
|
|
81
|
+
extractImages?: boolean;
|
|
82
|
+
/**
|
|
83
|
+
* Whether to extract metadata.
|
|
84
|
+
* @default true
|
|
85
|
+
*/
|
|
86
|
+
extractMetadata?: boolean;
|
|
87
|
+
/**
|
|
88
|
+
* Whether to extract annotations (links, comments, highlights, etc.).
|
|
89
|
+
* @default true
|
|
90
|
+
*/
|
|
91
|
+
extractAnnotations?: boolean;
|
|
92
|
+
/**
|
|
93
|
+
* Whether to extract form fields (AcroForm: text inputs, checkboxes, dropdowns, etc.).
|
|
94
|
+
* @default true
|
|
95
|
+
*/
|
|
96
|
+
extractFormFields?: boolean;
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* A single page from a read PDF.
|
|
100
|
+
*/
|
|
101
|
+
export interface ReadPdfPage {
|
|
102
|
+
/** 1-based page number */
|
|
103
|
+
pageNumber: number;
|
|
104
|
+
/** Extracted text content */
|
|
105
|
+
text: string;
|
|
106
|
+
/** Structured text lines with position information */
|
|
107
|
+
textLines: TextLine[];
|
|
108
|
+
/** Raw text fragments with exact positions */
|
|
109
|
+
textFragments: TextFragment[];
|
|
110
|
+
/** Extracted images */
|
|
111
|
+
images: ExtractedImage[];
|
|
112
|
+
/** Extracted annotations (links, comments, highlights, etc.) */
|
|
113
|
+
annotations: PdfAnnotation[];
|
|
114
|
+
/** Page width in points */
|
|
115
|
+
width: number;
|
|
116
|
+
/** Page height in points */
|
|
117
|
+
height: number;
|
|
118
|
+
/** Warnings encountered during extraction (non-fatal errors) */
|
|
119
|
+
warnings: string[];
|
|
120
|
+
}
|
|
121
|
+
/**
|
|
122
|
+
* Result of reading a PDF.
|
|
123
|
+
*/
|
|
124
|
+
export interface ReadPdfResult {
|
|
125
|
+
/** All text from all pages concatenated */
|
|
126
|
+
text: string;
|
|
127
|
+
/** Per-page results */
|
|
128
|
+
pages: ReadPdfPage[];
|
|
129
|
+
/** Document metadata */
|
|
130
|
+
metadata: PdfMetadata;
|
|
131
|
+
/** Form fields extracted from AcroForm (document-level) */
|
|
132
|
+
formFields: PdfFormField[];
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Read a PDF file and extract text, images, annotations, form fields, and metadata.
|
|
136
|
+
*
|
|
137
|
+
* @param data - Raw PDF file bytes
|
|
138
|
+
* @param options - Extraction options
|
|
139
|
+
* @returns Extracted content
|
|
140
|
+
* @throws {PdfStructureError} If the PDF structure is invalid
|
|
141
|
+
* @throws {PdfError} If decryption fails (wrong password)
|
|
142
|
+
*/
|
|
143
|
+
export declare function readPdf(data: Uint8Array, options?: ReadPdfOptions): ReadPdfResult;
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF tokenizer / lexer.
|
|
3
|
+
*
|
|
4
|
+
* Scans raw PDF bytes and produces a stream of typed tokens.
|
|
5
|
+
* Handles all PDF token types: numbers, strings (literal and hex),
|
|
6
|
+
* names, booleans, null, keywords, and delimiters.
|
|
7
|
+
*
|
|
8
|
+
* @see PDF Reference 1.7, §3.1 - Lexical Conventions
|
|
9
|
+
*/
|
|
10
|
+
export declare const enum TokenType {
|
|
11
|
+
/** Integer or real number */
|
|
12
|
+
Number = 0,
|
|
13
|
+
/** Literal string delimited by parentheses `(...)` */
|
|
14
|
+
LiteralString = 1,
|
|
15
|
+
/** Hex string delimited by angle brackets `<...>` */
|
|
16
|
+
HexString = 2,
|
|
17
|
+
/** Name object starting with `/` */
|
|
18
|
+
Name = 3,
|
|
19
|
+
/** Boolean `true` or `false` */
|
|
20
|
+
Boolean = 4,
|
|
21
|
+
/** The `null` keyword */
|
|
22
|
+
Null = 5,
|
|
23
|
+
/** Keywords: obj, endobj, stream, endstream, xref, trailer, startxref, R */
|
|
24
|
+
Keyword = 6,
|
|
25
|
+
/** `<<` dict begin */
|
|
26
|
+
DictBegin = 7,
|
|
27
|
+
/** `>>` dict end */
|
|
28
|
+
DictEnd = 8,
|
|
29
|
+
/** `[` array begin */
|
|
30
|
+
ArrayBegin = 9,
|
|
31
|
+
/** `]` array end */
|
|
32
|
+
ArrayEnd = 10,
|
|
33
|
+
/** End of input */
|
|
34
|
+
EOF = 11
|
|
35
|
+
}
|
|
36
|
+
export interface Token {
|
|
37
|
+
type: TokenType;
|
|
38
|
+
/** Numeric value (for Number tokens) */
|
|
39
|
+
numValue?: number;
|
|
40
|
+
/** String value (for String, Name, Keyword, Boolean tokens) */
|
|
41
|
+
strValue?: string;
|
|
42
|
+
/** Raw bytes (for LiteralString and HexString tokens) */
|
|
43
|
+
rawBytes?: Uint8Array;
|
|
44
|
+
/** Boolean value (for Boolean tokens) */
|
|
45
|
+
boolValue?: boolean;
|
|
46
|
+
/** Byte offset where this token starts */
|
|
47
|
+
offset: number;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Byte-level PDF tokenizer.
|
|
51
|
+
*
|
|
52
|
+
* Provides a `next()` method that returns the next token from the input.
|
|
53
|
+
* The tokenizer maintains a mutable position pointer that advances through
|
|
54
|
+
* the input bytes.
|
|
55
|
+
*/
|
|
56
|
+
export declare class PdfTokenizer {
|
|
57
|
+
private data;
|
|
58
|
+
pos: number;
|
|
59
|
+
constructor(data: Uint8Array, offset?: number);
|
|
60
|
+
/** Get current position */
|
|
61
|
+
get position(): number;
|
|
62
|
+
/** Set current position */
|
|
63
|
+
set position(offset: number);
|
|
64
|
+
/** Get the underlying data */
|
|
65
|
+
get bytes(): Uint8Array;
|
|
66
|
+
/** Peek at the byte at the current position without consuming it */
|
|
67
|
+
peek(): number;
|
|
68
|
+
/** Read the next token */
|
|
69
|
+
next(): Token;
|
|
70
|
+
skipWhitespaceAndComments(): void;
|
|
71
|
+
private readLiteralString;
|
|
72
|
+
private readHexString;
|
|
73
|
+
private readName;
|
|
74
|
+
private readNumber;
|
|
75
|
+
private readKeyword;
|
|
76
|
+
/**
|
|
77
|
+
* Search forward for a byte sequence starting from `from` (or the current position).
|
|
78
|
+
* Returns the offset where the sequence starts, or -1 if not found.
|
|
79
|
+
* Does NOT advance the position.
|
|
80
|
+
*/
|
|
81
|
+
findSequence(seq: Uint8Array, from?: number): number;
|
|
82
|
+
/**
|
|
83
|
+
* Search backward for a byte sequence starting from `from` (or end of data).
|
|
84
|
+
* Returns the offset where the sequence starts, or -1 if not found.
|
|
85
|
+
*/
|
|
86
|
+
findSequenceBackward(seq: Uint8Array, from?: number): number;
|
|
87
|
+
/**
|
|
88
|
+
* Read a line of text at the current position. Advances past the line ending.
|
|
89
|
+
*/
|
|
90
|
+
readLine(): string;
|
|
91
|
+
/**
|
|
92
|
+
* Extract a slice of the underlying data.
|
|
93
|
+
*/
|
|
94
|
+
slice(start: number, end: number): Uint8Array;
|
|
95
|
+
/**
|
|
96
|
+
* Read the stream content following a `stream` keyword.
|
|
97
|
+
* The tokenizer should be positioned right after the `stream` keyword.
|
|
98
|
+
* Returns the raw stream bytes (between stream\n and endstream).
|
|
99
|
+
*/
|
|
100
|
+
readStreamContent(length: number): Uint8Array;
|
|
101
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared utility functions for PDF reader modules.
|
|
3
|
+
*/
|
|
4
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
5
|
+
import type { PdfDictValue } from "./pdf-parser.js";
|
|
6
|
+
/**
|
|
7
|
+
* Safely extract a string value from a PDF dictionary entry.
|
|
8
|
+
* Handles both name strings and Uint8Array PDF strings (with BOM/encoding detection).
|
|
9
|
+
*
|
|
10
|
+
* @param dict - The PDF dictionary
|
|
11
|
+
* @param key - The key to look up
|
|
12
|
+
* @param doc - The PDF document for resolving indirect references
|
|
13
|
+
* @returns The string value, or empty string if not found or not a string
|
|
14
|
+
*/
|
|
15
|
+
export declare function getDictStringValue(dict: PdfDictValue, key: string, doc: PdfDocument): string;
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF stream filter decoder chain.
|
|
3
|
+
*
|
|
4
|
+
* Decodes PDF stream data by applying the appropriate filter(s)
|
|
5
|
+
* specified in the stream dictionary's /Filter entry.
|
|
6
|
+
*
|
|
7
|
+
* Supported filters:
|
|
8
|
+
* - /FlateDecode (zlib/deflate compression)
|
|
9
|
+
* - /ASCII85Decode (ASCII base-85 encoding)
|
|
10
|
+
* - /ASCIIHexDecode (ASCII hexadecimal encoding)
|
|
11
|
+
* - /LZWDecode (LZW compression)
|
|
12
|
+
* - /RunLengthDecode (run-length encoding)
|
|
13
|
+
*
|
|
14
|
+
* @see PDF Reference 1.7, §3.3 - Filters
|
|
15
|
+
*/
|
|
16
|
+
import type { PdfDictValue } from "./pdf-parser.js";
|
|
17
|
+
/**
|
|
18
|
+
* Decode stream data by applying the filter chain from the stream dictionary.
|
|
19
|
+
*/
|
|
20
|
+
export declare function decodeStreamFilters(data: Uint8Array, dict: PdfDictValue): Uint8Array;
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text reconstruction from positioned text fragments.
|
|
3
|
+
*
|
|
4
|
+
* Assembles raw text fragments extracted from PDF content streams into
|
|
5
|
+
* coherent, human-readable text with proper reading order, line breaks,
|
|
6
|
+
* and paragraph detection.
|
|
7
|
+
*
|
|
8
|
+
* Challenges addressed:
|
|
9
|
+
* - PDF text has no semantic structure (only "draw char at (x,y)")
|
|
10
|
+
* - Text fragments may be out of order
|
|
11
|
+
* - Word and line boundaries must be inferred from positions
|
|
12
|
+
* - Columns and tables need proper handling
|
|
13
|
+
* - Different fonts/sizes affect spacing thresholds
|
|
14
|
+
* - Multi-column layouts need column detection
|
|
15
|
+
* - RTL (Arabic, Hebrew) text needs right-to-left sorting
|
|
16
|
+
* - Vertical CJK text needs column-based grouping
|
|
17
|
+
*
|
|
18
|
+
* @see PDF Reference 1.7, Chapter 5 - Text
|
|
19
|
+
*/
|
|
20
|
+
import type { TextFragment } from "./content-interpreter.js";
|
|
21
|
+
/**
|
|
22
|
+
* Reconstruct readable text from positioned text fragments.
|
|
23
|
+
*
|
|
24
|
+
* @param fragments - Raw text fragments with positions from content stream
|
|
25
|
+
* @returns Reconstructed text with proper line breaks and spacing
|
|
26
|
+
*/
|
|
27
|
+
export declare function reconstructText(fragments: TextFragment[]): string;
|
|
28
|
+
/**
|
|
29
|
+
* Detailed text extraction result preserving position information.
|
|
30
|
+
*/
|
|
31
|
+
export interface TextLine {
|
|
32
|
+
/** The text content of this line */
|
|
33
|
+
text: string;
|
|
34
|
+
/** Y position (PDF coordinate, origin = bottom-left) */
|
|
35
|
+
y: number;
|
|
36
|
+
/** X position of the start of the line */
|
|
37
|
+
x: number;
|
|
38
|
+
/** Font size of the first fragment */
|
|
39
|
+
fontSize: number;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Extract text as structured lines.
|
|
43
|
+
*/
|
|
44
|
+
export declare function reconstructTextLines(fragments: TextFragment[]): TextLine[];
|
package/package.json
CHANGED