@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,191 @@
1
+ /**
2
+ * PDF document parser.
3
+ *
4
+ * Handles the high-level PDF file structure:
5
+ * - Locating startxref
6
+ * - Parsing cross-reference tables (traditional and stream-based)
7
+ * - Reading trailer dictionaries
8
+ * - Resolving indirect object references
9
+ * - Handling incremental updates
10
+ *
11
+ * @see PDF Reference 1.7, §3.4 - File Structure
12
+ */
13
+ import type { PdfObject, PdfDictValue, PdfRef, PdfStream } from "./pdf-parser.js";
14
+ /** Result of resolving an object with its object/generation numbers for decryption */
15
+ interface ResolvedObject {
16
+ /** The resolved PDF object */
17
+ obj: PdfObject | null;
18
+ /** The object number */
19
+ objNum: number;
20
+ /** The generation number */
21
+ gen: number;
22
+ }
23
+ /**
24
+ * Parsed PDF document with lazy object resolution.
25
+ *
26
+ * Reads the cross-reference table and trailer on construction,
27
+ * then resolves individual objects on demand with caching.
28
+ */
29
+ export declare class PdfDocument {
30
+ private tokenizer;
31
+ private xref;
32
+ private cache;
33
+ readonly trailer: PdfDictValue;
34
+ /** Encryption handler (set externally after decryption is initialized) */
35
+ decryptFn: ((data: Uint8Array, objNum: number, gen: number) => Uint8Array) | null;
36
+ constructor(data: Uint8Array);
37
+ /** Get the underlying raw data */
38
+ get data(): Uint8Array;
39
+ private parseFileStructure;
40
+ /**
41
+ * Find the startxref offset by scanning backward from EOF.
42
+ */
43
+ private findStartxref;
44
+ /**
45
+ * Parse the xref chain starting at the given offset.
46
+ * Follows /Prev links for incremental updates.
47
+ * Returns the merged trailer dictionary.
48
+ */
49
+ private parseXrefChain;
50
+ /**
51
+ * Parse a traditional xref table and its trailer.
52
+ */
53
+ private parseTraditionalXref;
54
+ /**
55
+ * Parse a cross-reference stream (PDF 1.5+).
56
+ */
57
+ private parseXrefStream;
58
+ /**
59
+ * Reconstruct the xref table by scanning the entire file for `N N obj` patterns.
60
+ * This is a fallback for corrupted or broken PDFs where the normal xref parsing fails.
61
+ *
62
+ * @returns A synthetic trailer dictionary
63
+ */
64
+ private reconstructXref;
65
+ /**
66
+ * Merge trailer entries from an older trailer into the current one.
67
+ * Only adds keys that don't already exist.
68
+ */
69
+ private mergeTrailer;
70
+ /**
71
+ * Resolve a PDF object by its object number and generation.
72
+ * Returns null if the object doesn't exist.
73
+ */
74
+ resolve(objNum: number, gen?: number): PdfObject | null;
75
+ /**
76
+ * Resolve a PDF object and return it along with its object/generation numbers.
77
+ * Useful for tracking which object a value came from (for decryption).
78
+ *
79
+ * @param objNum - The object number to resolve
80
+ * @param gen - The generation number (default 0)
81
+ * @returns The resolved object with its objNum and gen for decryption context
82
+ */
83
+ resolveWithObjNum(objNum: number, gen?: number): ResolvedObject;
84
+ /**
85
+ * Dereference a PdfRef to its actual object value.
86
+ * If the input is not a PdfRef, returns it as-is.
87
+ */
88
+ deref(obj: PdfObject | null | undefined): PdfObject | null;
89
+ /**
90
+ * Dereference a PdfRef and assert it's a dictionary.
91
+ */
92
+ derefDict(obj: PdfObject | null | undefined): PdfDictValue | null;
93
+ /**
94
+ * Dereference a PdfRef and get the stream only (no objNum/gen is returned).
95
+ * Use derefStreamWithObjNum when the objNum/gen are needed for per-object decryption.
96
+ */
97
+ derefStream(obj: PdfObject | null | undefined): PdfStream | null;
98
+ /**
99
+ * Dereference a PdfRef and get the stream with its object number and generation.
100
+ * Returns null if the object is not a stream.
101
+ * The objNum/gen are needed for correct per-object decryption (V1-V4).
102
+ */
103
+ derefStreamWithObjNum(obj: PdfObject | null | undefined): {
104
+ stream: PdfStream;
105
+ objNum: number;
106
+ gen: number;
107
+ } | null;
108
+ /**
109
+ * Get decoded stream data from a stream object.
110
+ * Applies filter chain decoding and decryption.
111
+ *
112
+ * When objNum/gen are not provided (default 0), decryption may not
113
+ * produce correct results. Use {@link resolveWithObjNum} to obtain
114
+ * the correct objNum/gen for the stream's containing object.
115
+ */
116
+ getStreamData(stream: PdfStream, objNum?: number, gen?: number): Uint8Array;
117
+ /**
118
+ * Decrypt a string value (bytes) if encryption is active.
119
+ */
120
+ decryptString(bytes: Uint8Array, objNum: number, gen: number): Uint8Array;
121
+ /**
122
+ * Decode a PDF string to a JS string, with optional decryption.
123
+ */
124
+ decodeString(bytes: Uint8Array, objNum?: number, gen?: number): string;
125
+ /**
126
+ * Recursively decrypt all string values (Uint8Array) within a parsed PDF object.
127
+ * PDF spec requires all strings in an encrypted document to be decrypted using
128
+ * the per-object key derived from the containing object's objNum/gen.
129
+ * Streams are NOT decrypted here — they are decrypted in getStreamData().
130
+ */
131
+ private decryptObjectStrings;
132
+ /**
133
+ * Get the catalog dictionary (the root of the document structure).
134
+ */
135
+ getCatalog(): PdfDictValue;
136
+ /**
137
+ * Get the pages array from the page tree.
138
+ * Returns an array of page dictionaries in order.
139
+ */
140
+ getPages(): PdfDictValue[];
141
+ /**
142
+ * Get pages with their object numbers (needed for correct decryption of
143
+ * inline streams within page objects).
144
+ */
145
+ getPagesWithObjInfo(): Array<{
146
+ dict: PdfDictValue;
147
+ objNum: number;
148
+ gen: number;
149
+ }>;
150
+ /**
151
+ * Recursively collect page dictionaries from the page tree.
152
+ * Uses a visited set to prevent infinite recursion on cyclic page trees.
153
+ */
154
+ private collectPages;
155
+ /**
156
+ * Get the object number for a given object reference.
157
+ * Useful for tracking which object a value came from (for decryption).
158
+ */
159
+ getObjNumForRef(ref: PdfRef): number;
160
+ /**
161
+ * Parse an object definition at the given byte offset.
162
+ */
163
+ private parseObjectAt;
164
+ /**
165
+ * Parse a compressed object from an object stream.
166
+ * @param objStmNum - The object number of the object stream
167
+ * @param index - The index of the object within the stream
168
+ */
169
+ private parseCompressedObject;
170
+ /**
171
+ * Parse all objects from an object stream.
172
+ * @returns Map of object number → object value
173
+ */
174
+ private parseObjectStream;
175
+ /**
176
+ * Resolve a page's bounding box (MediaBox/CropBox) with indirect ref resolution
177
+ * and parent inheritance. Returns `{ width, height }` or null if no box found.
178
+ *
179
+ * This is a shared helper so callers don't duplicate box resolution logic.
180
+ */
181
+ resolvePageBox(pageDict: PdfDictValue, visited?: Set<PdfDictValue>): {
182
+ width: number;
183
+ height: number;
184
+ } | null;
185
+ /**
186
+ * Resolve a page's Resources dictionary, inheriting from parent pages if needed.
187
+ * Protected against cyclic parent chains.
188
+ */
189
+ resolvePageResources(pageDict: PdfDictValue, visited?: Set<PdfDictValue>): PdfDictValue;
190
+ }
191
+ export {};
@@ -0,0 +1,65 @@
1
+ /**
2
+ * PDF object parser.
3
+ *
4
+ * Parses PDF tokens into typed PDF objects: dictionaries, arrays, strings,
5
+ * numbers, booleans, names, null, indirect references, and streams.
6
+ *
7
+ * @see PDF Reference 1.7, Chapter 3 - Objects
8
+ */
9
+ import type { Token, PdfTokenizer } from "./pdf-tokenizer.js";
10
+ /** A PDF indirect object reference: `N gen R` */
11
+ export interface PdfRef {
12
+ readonly type: "ref";
13
+ readonly objNum: number;
14
+ readonly gen: number;
15
+ }
16
+ /** A PDF stream: dictionary + raw data bytes */
17
+ export interface PdfStream {
18
+ readonly type: "stream";
19
+ readonly dict: PdfDictValue;
20
+ readonly data: Uint8Array;
21
+ }
22
+ /** A PDF dictionary: key-value pairs where keys are names */
23
+ export type PdfDictValue = Map<string, PdfObject>;
24
+ /** A PDF array */
25
+ export type PdfArrayValue = PdfObject[];
26
+ /**
27
+ * Union type for all possible PDF object values.
28
+ */
29
+ export type PdfObject = number | string | boolean | null | Uint8Array | PdfRef | PdfDictValue | PdfArrayValue | PdfStream;
30
+ export declare function isPdfRef(obj: PdfObject | undefined): obj is PdfRef;
31
+ export declare function isPdfStream(obj: PdfObject | undefined): obj is PdfStream;
32
+ export declare function isPdfDict(obj: PdfObject | undefined): obj is PdfDictValue;
33
+ export declare function isPdfArray(obj: PdfObject | undefined): obj is PdfArrayValue;
34
+ /** Get a string value from a PDF dictionary */
35
+ export declare function dictGetName(dict: PdfDictValue, key: string): string | undefined;
36
+ /** Get a number value from a PDF dictionary */
37
+ export declare function dictGetNumber(dict: PdfDictValue, key: string): number | undefined;
38
+ /** Get a boolean value from a PDF dictionary */
39
+ export declare function dictGetBool(dict: PdfDictValue, key: string): boolean | undefined;
40
+ /** Get a dictionary value from a PDF dictionary */
41
+ export declare function dictGetDict(dict: PdfDictValue, key: string): PdfDictValue | undefined;
42
+ /** Get an array value from a PDF dictionary */
43
+ export declare function dictGetArray(dict: PdfDictValue, key: string): PdfArrayValue | undefined;
44
+ /** Get a ref from a PDF dictionary */
45
+ export declare function dictGetRef(dict: PdfDictValue, key: string): PdfRef | undefined;
46
+ /** Get bytes (string as Uint8Array) from a PDF dictionary */
47
+ export declare function dictGetBytes(dict: PdfDictValue, key: string): Uint8Array | undefined;
48
+ /** Get a string value that may be either a name (string) or bytes decoded as latin1 */
49
+ export declare function dictGetString(dict: PdfDictValue, key: string): string | undefined;
50
+ /**
51
+ * Decode PDF string bytes to a JavaScript string.
52
+ * Handles UTF-16BE (BOM = FEFF) and PDFDocEncoding (Latin-1 superset).
53
+ */
54
+ export declare function decodePdfStringBytes(bytes: Uint8Array): string;
55
+ /**
56
+ * Parse a single PDF object from the tokenizer.
57
+ *
58
+ * Handles all PDF object types including dictionaries (with possible streams),
59
+ * arrays, strings, numbers, names, booleans, null, and indirect references.
60
+ */
61
+ export declare function parseObject(tokenizer: PdfTokenizer): PdfObject;
62
+ /**
63
+ * Parse a PDF object given the first token has already been consumed.
64
+ */
65
+ export declare function parseObjectFromToken(tokenizer: PdfTokenizer, token: Token): PdfObject;
@@ -0,0 +1,143 @@
1
+ /**
2
+ * PDF reader — public API.
3
+ *
4
+ * Provides a high-level, zero-dependency interface for reading PDF files.
5
+ * Supports:
6
+ * - Text extraction with multilingual support (WinAnsi, MacRoman, CJK via
7
+ * ToUnicode CMap, Identity-H/V, Symbol, ZapfDingbats)
8
+ * - Image extraction (JPEG, JPEG2000, raw/Flate, CCITT, JBIG2)
9
+ * - Annotation extraction (links, comments, highlights, stamps, etc.)
10
+ * - Form field extraction (AcroForm: text inputs, checkboxes, radio buttons, dropdowns)
11
+ * - Metadata reading (Info dictionary + XMP)
12
+ * - Encrypted PDFs:
13
+ * - RC4 (40-bit and 128-bit) — tested via roundtrip
14
+ * - AES-128 (V=4, R=4) — implemented, requires external test fixtures
15
+ * - AES-256 (V=5, R=5) — implemented, requires external test fixtures
16
+ * - Cross-reference tables and streams (PDF 1.5+)
17
+ * - Incremental updates and xref recovery
18
+ *
19
+ * @example Basic text extraction:
20
+ * ```typescript
21
+ * import { readPdf } from "excelts/pdf";
22
+ *
23
+ * const pdf = readPdf(pdfBytes);
24
+ * console.log(pdf.text); // All text from all pages
25
+ * console.log(pdf.pages[0].text); // Text from page 1
26
+ * ```
27
+ *
28
+ * @example Image extraction:
29
+ * ```typescript
30
+ * const pdf = readPdf(pdfBytes);
31
+ * for (const image of pdf.pages[0].images) {
32
+ * console.log(image.format, image.width, image.height);
33
+ * fs.writeFileSync(`image.${image.format}`, image.data);
34
+ * }
35
+ * ```
36
+ *
37
+ * @example Metadata:
38
+ * ```typescript
39
+ * const pdf = readPdf(pdfBytes);
40
+ * console.log(pdf.metadata.title);
41
+ * console.log(pdf.metadata.author);
42
+ * console.log(pdf.metadata.pageCount);
43
+ * ```
44
+ *
45
+ * @example Encrypted PDF:
46
+ * ```typescript
47
+ * const pdf = readPdf(pdfBytes, { password: "secret" });
48
+ * ```
49
+ */
50
+ import type { TextLine } from "./text-reconstruction.js";
51
+ import type { TextFragment } from "./content-interpreter.js";
52
+ import type { ExtractedImage } from "./image-extractor.js";
53
+ import type { PdfAnnotation } from "./annotation-extractor.js";
54
+ import type { PdfFormField } from "./form-extractor.js";
55
+ import type { PdfMetadata } from "./metadata-reader.js";
56
+ /**
57
+ * Options for reading a PDF.
58
+ */
59
+ export interface ReadPdfOptions {
60
+ /**
61
+ * Password for encrypted PDFs.
62
+ * Can be either the user password or owner password.
63
+ * @default ""
64
+ */
65
+ password?: string;
66
+ /**
67
+ * Which pages to extract (1-based).
68
+ * If omitted, all pages are extracted.
69
+ * @example [1, 3, 5] — extract pages 1, 3, and 5
70
+ */
71
+ pages?: number[];
72
+ /**
73
+ * Whether to extract text.
74
+ * @default true
75
+ */
76
+ extractText?: boolean;
77
+ /**
78
+ * Whether to extract images.
79
+ * @default true
80
+ */
81
+ extractImages?: boolean;
82
+ /**
83
+ * Whether to extract metadata.
84
+ * @default true
85
+ */
86
+ extractMetadata?: boolean;
87
+ /**
88
+ * Whether to extract annotations (links, comments, highlights, etc.).
89
+ * @default true
90
+ */
91
+ extractAnnotations?: boolean;
92
+ /**
93
+ * Whether to extract form fields (AcroForm: text inputs, checkboxes, dropdowns, etc.).
94
+ * @default true
95
+ */
96
+ extractFormFields?: boolean;
97
+ }
98
+ /**
99
+ * A single page from a read PDF.
100
+ */
101
+ export interface ReadPdfPage {
102
+ /** 1-based page number */
103
+ pageNumber: number;
104
+ /** Extracted text content */
105
+ text: string;
106
+ /** Structured text lines with position information */
107
+ textLines: TextLine[];
108
+ /** Raw text fragments with exact positions */
109
+ textFragments: TextFragment[];
110
+ /** Extracted images */
111
+ images: ExtractedImage[];
112
+ /** Extracted annotations (links, comments, highlights, etc.) */
113
+ annotations: PdfAnnotation[];
114
+ /** Page width in points */
115
+ width: number;
116
+ /** Page height in points */
117
+ height: number;
118
+ /** Warnings encountered during extraction (non-fatal errors) */
119
+ warnings: string[];
120
+ }
121
+ /**
122
+ * Result of reading a PDF.
123
+ */
124
+ export interface ReadPdfResult {
125
+ /** All text from all pages concatenated */
126
+ text: string;
127
+ /** Per-page results */
128
+ pages: ReadPdfPage[];
129
+ /** Document metadata */
130
+ metadata: PdfMetadata;
131
+ /** Form fields extracted from AcroForm (document-level) */
132
+ formFields: PdfFormField[];
133
+ }
134
+ /**
135
+ * Read a PDF file and extract text, images, annotations, form fields, and metadata.
136
+ *
137
+ * @param data - Raw PDF file bytes
138
+ * @param options - Extraction options
139
+ * @returns Extracted content
140
+ * @throws {PdfStructureError} If the PDF structure is invalid
141
+ * @throws {PdfError} If decryption fails (wrong password)
142
+ */
143
+ export declare function readPdf(data: Uint8Array, options?: ReadPdfOptions): ReadPdfResult;
@@ -0,0 +1,101 @@
1
+ /**
2
+ * PDF tokenizer / lexer.
3
+ *
4
+ * Scans raw PDF bytes and produces a stream of typed tokens.
5
+ * Handles all PDF token types: numbers, strings (literal and hex),
6
+ * names, booleans, null, keywords, and delimiters.
7
+ *
8
+ * @see PDF Reference 1.7, §3.1 - Lexical Conventions
9
+ */
10
+ export declare const enum TokenType {
11
+ /** Integer or real number */
12
+ Number = 0,
13
+ /** Literal string delimited by parentheses `(...)` */
14
+ LiteralString = 1,
15
+ /** Hex string delimited by angle brackets `<...>` */
16
+ HexString = 2,
17
+ /** Name object starting with `/` */
18
+ Name = 3,
19
+ /** Boolean `true` or `false` */
20
+ Boolean = 4,
21
+ /** The `null` keyword */
22
+ Null = 5,
23
+ /** Keywords: obj, endobj, stream, endstream, xref, trailer, startxref, R */
24
+ Keyword = 6,
25
+ /** `<<` dict begin */
26
+ DictBegin = 7,
27
+ /** `>>` dict end */
28
+ DictEnd = 8,
29
+ /** `[` array begin */
30
+ ArrayBegin = 9,
31
+ /** `]` array end */
32
+ ArrayEnd = 10,
33
+ /** End of input */
34
+ EOF = 11
35
+ }
36
+ export interface Token {
37
+ type: TokenType;
38
+ /** Numeric value (for Number tokens) */
39
+ numValue?: number;
40
+ /** String value (for String, Name, Keyword, Boolean tokens) */
41
+ strValue?: string;
42
+ /** Raw bytes (for LiteralString and HexString tokens) */
43
+ rawBytes?: Uint8Array;
44
+ /** Boolean value (for Boolean tokens) */
45
+ boolValue?: boolean;
46
+ /** Byte offset where this token starts */
47
+ offset: number;
48
+ }
49
+ /**
50
+ * Byte-level PDF tokenizer.
51
+ *
52
+ * Provides a `next()` method that returns the next token from the input.
53
+ * The tokenizer maintains a mutable position pointer that advances through
54
+ * the input bytes.
55
+ */
56
+ export declare class PdfTokenizer {
57
+ private data;
58
+ pos: number;
59
+ constructor(data: Uint8Array, offset?: number);
60
+ /** Get current position */
61
+ get position(): number;
62
+ /** Set current position */
63
+ set position(offset: number);
64
+ /** Get the underlying data */
65
+ get bytes(): Uint8Array;
66
+ /** Peek at the byte at the current position without consuming it */
67
+ peek(): number;
68
+ /** Read the next token */
69
+ next(): Token;
70
+ skipWhitespaceAndComments(): void;
71
+ private readLiteralString;
72
+ private readHexString;
73
+ private readName;
74
+ private readNumber;
75
+ private readKeyword;
76
+ /**
77
+ * Search forward for a byte sequence starting from `from` (or the current position).
78
+ * Returns the offset where the sequence starts, or -1 if not found.
79
+ * Does NOT advance the position.
80
+ */
81
+ findSequence(seq: Uint8Array, from?: number): number;
82
+ /**
83
+ * Search backward for a byte sequence starting from `from` (or end of data).
84
+ * Returns the offset where the sequence starts, or -1 if not found.
85
+ */
86
+ findSequenceBackward(seq: Uint8Array, from?: number): number;
87
+ /**
88
+ * Read a line of text at the current position. Advances past the line ending.
89
+ */
90
+ readLine(): string;
91
+ /**
92
+ * Extract a slice of the underlying data.
93
+ */
94
+ slice(start: number, end: number): Uint8Array;
95
+ /**
96
+ * Read the stream content following a `stream` keyword.
97
+ * The tokenizer should be positioned right after the `stream` keyword.
98
+ * Returns the raw stream bytes (between stream\n and endstream).
99
+ */
100
+ readStreamContent(length: number): Uint8Array;
101
+ }
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Shared utility functions for PDF reader modules.
3
+ */
4
+ import type { PdfDocument } from "./pdf-document.js";
5
+ import type { PdfDictValue } from "./pdf-parser.js";
6
+ /**
7
+ * Safely extract a string value from a PDF dictionary entry.
8
+ * Handles both name strings and Uint8Array PDF strings (with BOM/encoding detection).
9
+ *
10
+ * @param dict - The PDF dictionary
11
+ * @param key - The key to look up
12
+ * @param doc - The PDF document for resolving indirect references
13
+ * @returns The string value, or empty string if not found or not a string
14
+ */
15
+ export declare function getDictStringValue(dict: PdfDictValue, key: string, doc: PdfDocument): string;
@@ -0,0 +1,20 @@
1
+ /**
2
+ * PDF stream filter decoder chain.
3
+ *
4
+ * Decodes PDF stream data by applying the appropriate filter(s)
5
+ * specified in the stream dictionary's /Filter entry.
6
+ *
7
+ * Supported filters:
8
+ * - /FlateDecode (zlib/deflate compression)
9
+ * - /ASCII85Decode (ASCII base-85 encoding)
10
+ * - /ASCIIHexDecode (ASCII hexadecimal encoding)
11
+ * - /LZWDecode (LZW compression)
12
+ * - /RunLengthDecode (run-length encoding)
13
+ *
14
+ * @see PDF Reference 1.7, §3.3 - Filters
15
+ */
16
+ import type { PdfDictValue } from "./pdf-parser.js";
17
+ /**
18
+ * Decode stream data by applying the filter chain from the stream dictionary.
19
+ */
20
+ export declare function decodeStreamFilters(data: Uint8Array, dict: PdfDictValue): Uint8Array;
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Text reconstruction from positioned text fragments.
3
+ *
4
+ * Assembles raw text fragments extracted from PDF content streams into
5
+ * coherent, human-readable text with proper reading order, line breaks,
6
+ * and paragraph detection.
7
+ *
8
+ * Challenges addressed:
9
+ * - PDF text has no semantic structure (only "draw char at (x,y)")
10
+ * - Text fragments may be out of order
11
+ * - Word and line boundaries must be inferred from positions
12
+ * - Columns and tables need proper handling
13
+ * - Different fonts/sizes affect spacing thresholds
14
+ * - Multi-column layouts need column detection
15
+ * - RTL (Arabic, Hebrew) text needs right-to-left sorting
16
+ * - Vertical CJK text needs column-based grouping
17
+ *
18
+ * @see PDF Reference 1.7, Chapter 5 - Text
19
+ */
20
+ import type { TextFragment } from "./content-interpreter.js";
21
+ /**
22
+ * Reconstruct readable text from positioned text fragments.
23
+ *
24
+ * @param fragments - Raw text fragments with positions from content stream
25
+ * @returns Reconstructed text with proper line breaks and spacing
26
+ */
27
+ export declare function reconstructText(fragments: TextFragment[]): string;
28
+ /**
29
+ * Detailed text extraction result preserving position information.
30
+ */
31
+ export interface TextLine {
32
+ /** The text content of this line */
33
+ text: string;
34
+ /** Y position (PDF coordinate, origin = bottom-left) */
35
+ y: number;
36
+ /** X position of the start of the line */
37
+ x: number;
38
+ /** Font size of the first fragment */
39
+ fontSize: number;
40
+ }
41
+ /**
42
+ * Extract text as structured lines.
43
+ */
44
+ export declare function reconstructTextLines(fragments: TextFragment[]): TextLine[];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cj-tech-master/excelts",
3
- "version": "8.0.0",
3
+ "version": "8.1.0",
4
4
  "description": "Zero-dependency TypeScript toolkit — Excel (XLSX), PDF, CSV, Markdown, XML, ZIP/TAR, and streaming.",
5
5
  "type": "module",
6
6
  "main": "./dist/cjs/index.js",