@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -60,6 +60,8 @@ export declare class ZipDeflateFile {
|
|
|
60
60
|
private _dataQueue;
|
|
61
61
|
private _finalQueued;
|
|
62
62
|
private _pushChain;
|
|
63
|
+
private _inputBuf;
|
|
64
|
+
private _inputPos;
|
|
63
65
|
private _syncDeflater;
|
|
64
66
|
private _syncZlibReady;
|
|
65
67
|
readonly name: string;
|
|
@@ -159,6 +161,8 @@ export declare class ZipDeflateFile {
|
|
|
159
161
|
* memory growth when callers push data in a tight synchronous loop.
|
|
160
162
|
*/
|
|
161
163
|
push(data: Uint8Array, final?: boolean, callback?: (err?: Error | null) => void): Promise<void>;
|
|
164
|
+
/** Enqueue an async push through the _pushChain serialization. */
|
|
165
|
+
private _pushAsync;
|
|
162
166
|
/**
|
|
163
167
|
* Synchronous push path — compresses and emits data without any Promises.
|
|
164
168
|
*
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared cryptographic primitives for PDF encryption/decryption.
|
|
3
|
+
*
|
|
4
|
+
* Zero-dependency, pure JavaScript implementations of:
|
|
5
|
+
* - AES (128/256-bit) CBC encrypt and decrypt
|
|
6
|
+
* - SHA-256
|
|
7
|
+
* - MD5
|
|
8
|
+
* - RC4 (for reading legacy PDFs only)
|
|
9
|
+
*
|
|
10
|
+
* @see FIPS 197 — AES
|
|
11
|
+
* @see FIPS 180-4 — SHA-256
|
|
12
|
+
* @see RFC 1321 — MD5
|
|
13
|
+
*/
|
|
14
|
+
/**
|
|
15
|
+
* AES-CBC encryption with PKCS#7 padding.
|
|
16
|
+
* Supports AES-128 (16-byte key) and AES-256 (32-byte key).
|
|
17
|
+
*/
|
|
18
|
+
export declare function aesCbcEncrypt(plaintext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
|
|
19
|
+
/**
|
|
20
|
+
* AES-CBC decryption with PKCS#7 padding removal.
|
|
21
|
+
* Supports AES-128 (16-byte key) and AES-256 (32-byte key).
|
|
22
|
+
*/
|
|
23
|
+
export declare function aesCbcDecrypt(ciphertext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
|
|
24
|
+
/**
|
|
25
|
+
* AES-CBC decryption WITHOUT PKCS#7 padding removal.
|
|
26
|
+
* Used for key derivation in V=5 where the output length is known.
|
|
27
|
+
*/
|
|
28
|
+
export declare function aesCbcDecryptRaw(ciphertext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
|
|
29
|
+
/**
|
|
30
|
+
* AES-CBC encryption WITHOUT PKCS#7 padding.
|
|
31
|
+
* Used when the plaintext is already block-aligned (e.g., encrypting
|
|
32
|
+
* the 32-byte file encryption key in V=5).
|
|
33
|
+
*
|
|
34
|
+
* @throws if plaintext length is not a multiple of 16.
|
|
35
|
+
*/
|
|
36
|
+
export declare function aesCbcEncryptRaw(plaintext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
|
|
37
|
+
/**
|
|
38
|
+
* AES-ECB encryption of a single 16-byte block (no padding, no IV).
|
|
39
|
+
* Used for the /Perms value in V=5 encryption.
|
|
40
|
+
*/
|
|
41
|
+
export declare function aesEcbEncrypt(block: Uint8Array, key: Uint8Array): Uint8Array;
|
|
42
|
+
/**
|
|
43
|
+
* SHA-256 hash function.
|
|
44
|
+
* @returns 32-byte digest
|
|
45
|
+
*/
|
|
46
|
+
export declare function sha256(input: Uint8Array): Uint8Array;
|
|
47
|
+
/**
|
|
48
|
+
* MD5 hash function (RFC 1321).
|
|
49
|
+
* @returns 16-byte digest
|
|
50
|
+
*/
|
|
51
|
+
export declare function md5(input: Uint8Array): Uint8Array;
|
|
52
|
+
/**
|
|
53
|
+
* RC4 stream cipher.
|
|
54
|
+
* @deprecated Only used for reading legacy encrypted PDFs. Writer uses AES-256.
|
|
55
|
+
*/
|
|
56
|
+
export declare function rc4(key: Uint8Array, data: Uint8Array): Uint8Array;
|
|
57
|
+
/**
|
|
58
|
+
* Generate pseudo-random bytes.
|
|
59
|
+
* Uses Math.random — adequate for PDF IVs but not cryptographically secure.
|
|
60
|
+
*/
|
|
61
|
+
export declare function randomBytes(length: number): Uint8Array;
|
|
62
|
+
/**
|
|
63
|
+
* Concatenate multiple Uint8Arrays.
|
|
64
|
+
*/
|
|
65
|
+
export declare function concatArrays(...arrays: Uint8Array[]): Uint8Array;
|
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* PDF encryption support (Standard Security Handler,
|
|
2
|
+
* PDF encryption support (Standard Security Handler, V=5, R=5).
|
|
3
3
|
*
|
|
4
|
-
* Implements
|
|
4
|
+
* Implements AES-256 encryption compatible with PDF 2.0 (ISO 32000-2:2020).
|
|
5
5
|
* Supports:
|
|
6
6
|
* - User password (required to open the document)
|
|
7
7
|
* - Owner password (grants full access)
|
|
8
8
|
* - Permission flags (print, copy, modify, etc.)
|
|
9
9
|
*
|
|
10
|
-
*
|
|
10
|
+
* The file encryption key (FEK) is a random 256-bit key.
|
|
11
|
+
* All streams and strings are encrypted using AES-256-CBC with a random
|
|
12
|
+
* 16-byte IV prepended to each encrypted value.
|
|
13
|
+
*
|
|
14
|
+
* @see ISO 32000-2:2020, §7.6 — Encryption
|
|
11
15
|
*/
|
|
12
16
|
/**
|
|
13
17
|
* PDF encryption options.
|
|
@@ -43,35 +47,34 @@ export interface PdfPermissions {
|
|
|
43
47
|
printHighQuality: boolean;
|
|
44
48
|
}
|
|
45
49
|
/**
|
|
46
|
-
* Encryption state used during PDF generation.
|
|
50
|
+
* Encryption state used during PDF generation (V=5, R=5, AES-256).
|
|
47
51
|
*/
|
|
48
52
|
export interface EncryptionState {
|
|
49
|
-
/**
|
|
53
|
+
/** 32-byte file encryption key */
|
|
50
54
|
encryptionKey: Uint8Array;
|
|
51
|
-
/** O value (32
|
|
55
|
+
/** 48-byte O value: hash(32) + validation salt(8) + key salt(8) */
|
|
52
56
|
oValue: Uint8Array;
|
|
53
|
-
/** U value (32
|
|
57
|
+
/** 48-byte U value: hash(32) + validation salt(8) + key salt(8) */
|
|
54
58
|
uValue: Uint8Array;
|
|
59
|
+
/** 32-byte encrypted owner key (OE) */
|
|
60
|
+
oeValue: Uint8Array;
|
|
61
|
+
/** 32-byte encrypted user key (UE) */
|
|
62
|
+
ueValue: Uint8Array;
|
|
63
|
+
/** 16-byte encrypted permissions (Perms) */
|
|
64
|
+
permsValue: Uint8Array;
|
|
55
65
|
/** Permissions integer (P value) */
|
|
56
66
|
permissions: number;
|
|
57
67
|
/** File identifier (16 bytes) */
|
|
58
68
|
fileId: Uint8Array;
|
|
59
69
|
}
|
|
60
70
|
/**
|
|
61
|
-
* Initialize encryption state
|
|
71
|
+
* Initialize encryption state for AES-256 (V=5, R=5).
|
|
62
72
|
*/
|
|
63
73
|
export declare function initEncryption(options: PdfEncryptionOptions): EncryptionState;
|
|
64
74
|
/**
|
|
65
|
-
* Encrypt
|
|
66
|
-
*
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
/**
|
|
70
|
-
* RC4 stream cipher implementation.
|
|
71
|
-
*/
|
|
72
|
-
export declare function rc4(key: Uint8Array, data: Uint8Array): Uint8Array;
|
|
73
|
-
/**
|
|
74
|
-
* MD5 hash implementation (RFC 1321).
|
|
75
|
-
* Returns 16-byte digest.
|
|
75
|
+
* Encrypt data for a PDF object using AES-256-CBC.
|
|
76
|
+
*
|
|
77
|
+
* For V=5/R=5, the file encryption key is used directly (no per-object key derivation).
|
|
78
|
+
* A random 16-byte IV is prepended to the ciphertext.
|
|
76
79
|
*/
|
|
77
|
-
export declare function
|
|
80
|
+
export declare function encryptData(data: Uint8Array, _objectNumber: number, _generation: number, encryptionKey: Uint8Array): Uint8Array;
|
|
@@ -1,20 +1,22 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* PDF file writer.
|
|
3
3
|
*
|
|
4
|
-
* Assembles a complete PDF document from indirect objects.
|
|
4
|
+
* Assembles a complete PDF 2.0 document from indirect objects.
|
|
5
5
|
* Handles the four sections of a PDF file:
|
|
6
|
-
* 1. Header (%PDF-
|
|
6
|
+
* 1. Header (%PDF-2.0)
|
|
7
7
|
* 2. Body (indirect objects)
|
|
8
8
|
* 3. Cross-reference table
|
|
9
9
|
* 4. Trailer (with document catalog reference)
|
|
10
10
|
*
|
|
11
|
-
*
|
|
11
|
+
* Encryption uses AES-256 (V=5, R=5) per ISO 32000-2:2020.
|
|
12
|
+
*
|
|
13
|
+
* @see ISO 32000-2:2020, Chapter 7.5 — File Structure
|
|
12
14
|
*/
|
|
13
15
|
import { PdfDict } from "./pdf-object.js";
|
|
14
16
|
import type { PdfContentStream } from "./pdf-stream.js";
|
|
15
17
|
import type { EncryptionState } from "./encryption.js";
|
|
16
18
|
/**
|
|
17
|
-
* Constructs a valid PDF
|
|
19
|
+
* Constructs a valid PDF 2.0 file from a set of indirect objects.
|
|
18
20
|
*
|
|
19
21
|
* Usage:
|
|
20
22
|
* 1. Allocate object numbers with allocObject()
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* PDF module for excelts.
|
|
3
3
|
*
|
|
4
|
-
* A full-featured, zero-dependency PDF engine.
|
|
4
|
+
* A full-featured, zero-dependency PDF engine for both writing and reading.
|
|
5
5
|
*
|
|
6
|
-
* @example Standalone:
|
|
6
|
+
* @example Standalone PDF generation:
|
|
7
7
|
* ```typescript
|
|
8
8
|
* import { pdf } from "excelts/pdf";
|
|
9
9
|
*
|
|
@@ -25,13 +25,34 @@
|
|
|
25
25
|
* const bytes = excelToPdf(workbook);
|
|
26
26
|
* ```
|
|
27
27
|
*
|
|
28
|
+
* @example Read PDF — extract text, images, and metadata:
|
|
29
|
+
* ```typescript
|
|
30
|
+
* import { readPdf } from "excelts/pdf";
|
|
31
|
+
*
|
|
32
|
+
* const result = readPdf(pdfBytes);
|
|
33
|
+
* console.log(result.text); // All text
|
|
34
|
+
* console.log(result.pages[0].text); // Page 1 text
|
|
35
|
+
* console.log(result.pages[0].images); // Page 1 images
|
|
36
|
+
* console.log(result.pages[0].annotations); // Page 1 annotations
|
|
37
|
+
* console.log(result.metadata.title); // Document title
|
|
38
|
+
* console.log(result.formFields); // Form fields
|
|
39
|
+
* ```
|
|
40
|
+
*
|
|
28
41
|
* @module pdf
|
|
29
42
|
*/
|
|
30
43
|
/** Standalone PDF generation — accepts plain arrays, sheet objects, or workbooks. */
|
|
31
44
|
export { pdf } from "./pdf.js";
|
|
32
45
|
/** Excel-to-PDF conversion — accepts an Excel Workbook instance. */
|
|
33
46
|
export { excelToPdf } from "./excel-bridge.js";
|
|
47
|
+
/** Read a PDF file and extract text, images, and metadata. */
|
|
48
|
+
export { readPdf } from "./reader/pdf-reader.js";
|
|
34
49
|
export type { PdfCell, PdfRow, PdfColumn, PdfSheet, PdfBook, PdfImage } from "./pdf.js";
|
|
35
50
|
export type { PdfExportOptions, PdfOrientation, PdfPageSize, PdfMargins, PdfColor, PageSizeName } from "./types.js";
|
|
36
51
|
export { PageSizes } from "./types.js";
|
|
52
|
+
export type { ReadPdfOptions, ReadPdfResult, ReadPdfPage } from "./reader/pdf-reader.js";
|
|
53
|
+
export type { PdfMetadata } from "./reader/metadata-reader.js";
|
|
54
|
+
export type { ExtractedImage } from "./reader/image-extractor.js";
|
|
55
|
+
export type { TextLine } from "./reader/text-reconstruction.js";
|
|
56
|
+
export type { PdfAnnotation, PdfRect } from "./reader/annotation-extractor.js";
|
|
57
|
+
export type { PdfFormField, PdfFormFieldType } from "./reader/form-extractor.js";
|
|
37
58
|
export { PdfError, PdfRenderError, PdfFontError, PdfStructureError, isPdfError } from "./errors.js";
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF annotation extractor.
|
|
3
|
+
*
|
|
4
|
+
* Extracts annotations from a PDF page's `/Annots` array.
|
|
5
|
+
* Supports all standard annotation subtypes defined in PDF Reference 1.7, §12.5.
|
|
6
|
+
*
|
|
7
|
+
* Common annotation types:
|
|
8
|
+
* - **Link** — Hyperlinks (URI, GoTo, GoToR)
|
|
9
|
+
* - **Text** — Sticky notes / comments
|
|
10
|
+
* - **FreeText** — Inline text annotations
|
|
11
|
+
* - **Highlight / Underline / StrikeOut / Squiggly** — Text markup
|
|
12
|
+
* - **Stamp** — Rubber stamp annotations
|
|
13
|
+
* - **Popup** — Associated popup windows
|
|
14
|
+
* - **Widget** — Form field widgets (handled separately by form-extractor)
|
|
15
|
+
*
|
|
16
|
+
* @see PDF Reference 1.7, §12.5 - Annotations
|
|
17
|
+
*/
|
|
18
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
19
|
+
import type { PdfDictValue } from "./pdf-parser.js";
|
|
20
|
+
/** Rectangle in PDF coordinate space [x1, y1, x2, y2] */
|
|
21
|
+
export interface PdfRect {
|
|
22
|
+
/** Left edge (x1) */
|
|
23
|
+
x1: number;
|
|
24
|
+
/** Bottom edge (y1) */
|
|
25
|
+
y1: number;
|
|
26
|
+
/** Right edge (x2) */
|
|
27
|
+
x2: number;
|
|
28
|
+
/** Top edge (y2) */
|
|
29
|
+
y2: number;
|
|
30
|
+
}
|
|
31
|
+
/** A PDF annotation extracted from a page. */
|
|
32
|
+
export interface PdfAnnotation {
|
|
33
|
+
/** Annotation subtype (e.g. "Link", "Text", "Highlight", "FreeText", "Stamp") */
|
|
34
|
+
subtype: string;
|
|
35
|
+
/** Bounding rectangle in page coordinates (points) */
|
|
36
|
+
rect: PdfRect;
|
|
37
|
+
/** Text content (/Contents entry) */
|
|
38
|
+
contents: string;
|
|
39
|
+
/** Author / title (/T entry) */
|
|
40
|
+
author: string;
|
|
41
|
+
/** Subject (/Subj entry) */
|
|
42
|
+
subject: string;
|
|
43
|
+
/** Modification date (/M entry) — raw PDF date string */
|
|
44
|
+
modifiedDate: string;
|
|
45
|
+
/** For Link annotations: the destination URI */
|
|
46
|
+
uri: string;
|
|
47
|
+
/** For Link annotations: named destination */
|
|
48
|
+
destination: string;
|
|
49
|
+
/** Annotation flags (/F entry) */
|
|
50
|
+
flags: number;
|
|
51
|
+
/** Color (/C entry) — array of 0-3 values in [0,1] */
|
|
52
|
+
color: number[];
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Extract annotations from a PDF page.
|
|
56
|
+
*
|
|
57
|
+
* Skips Widget annotations (form fields) — those are handled by the form extractor.
|
|
58
|
+
*
|
|
59
|
+
* @param pageDict - The page dictionary
|
|
60
|
+
* @param doc - The PDF document for resolving references
|
|
61
|
+
* @returns Array of extracted annotations
|
|
62
|
+
*/
|
|
63
|
+
export declare function extractAnnotationsFromPage(pageDict: PdfDictValue, doc: PdfDocument): PdfAnnotation[];
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CMap parser for PDF text extraction.
|
|
3
|
+
*
|
|
4
|
+
* Parses /ToUnicode CMap programs to build character code → Unicode mappings.
|
|
5
|
+
* This is essential for extracting text from PDFs that use CIDFonts or
|
|
6
|
+
* custom encodings.
|
|
7
|
+
*
|
|
8
|
+
* Supports:
|
|
9
|
+
* - beginbfchar / endbfchar (single character mappings)
|
|
10
|
+
* - beginbfrange / endbfrange (range mappings, including array form)
|
|
11
|
+
* - begincodespacerange / endcodespacerange
|
|
12
|
+
* - Multi-byte character codes (1-4 bytes)
|
|
13
|
+
* - UTF-16BE encoded target strings (including surrogate pairs)
|
|
14
|
+
*
|
|
15
|
+
* @see PDF Reference 1.7, §5.9 - ToUnicode CMaps
|
|
16
|
+
* @see Adobe Technical Note #5411 - CMap Resources
|
|
17
|
+
*/
|
|
18
|
+
/**
|
|
19
|
+
* A parsed CMap that maps character codes to Unicode strings.
|
|
20
|
+
*/
|
|
21
|
+
export declare class CMap {
|
|
22
|
+
private codeSpaceRanges;
|
|
23
|
+
private bfChars;
|
|
24
|
+
private bfRanges;
|
|
25
|
+
/** Number of bytes per character code (detected from codespace ranges) */
|
|
26
|
+
bytesPerCode: number;
|
|
27
|
+
constructor();
|
|
28
|
+
/**
|
|
29
|
+
* Look up the Unicode string for a character code.
|
|
30
|
+
* Uses binary search over sorted bfRanges for efficient lookup.
|
|
31
|
+
*/
|
|
32
|
+
lookup(code: number): string | undefined;
|
|
33
|
+
/**
|
|
34
|
+
* Add a code space range.
|
|
35
|
+
*/
|
|
36
|
+
addCodeSpaceRange(low: number, high: number, bytes: number): void;
|
|
37
|
+
/**
|
|
38
|
+
* Add a bfchar mapping.
|
|
39
|
+
*/
|
|
40
|
+
addBfChar(code: number, unicode: string): void;
|
|
41
|
+
/**
|
|
42
|
+
* Add a bfrange mapping.
|
|
43
|
+
*/
|
|
44
|
+
addBfRange(low: number, high: number, mapping: string | string[]): void;
|
|
45
|
+
/**
|
|
46
|
+
* Sort bfRanges by low value for binary search.
|
|
47
|
+
* Should be called after all ranges have been added.
|
|
48
|
+
*/
|
|
49
|
+
sortRanges(): void;
|
|
50
|
+
/**
|
|
51
|
+
* Determine the code length (in bytes) for a given first byte,
|
|
52
|
+
* using the codespace ranges. When multiple ranges match (e.g. a 1-byte
|
|
53
|
+
* range covering 0x00-0xFF and a 2-byte range whose first byte overlaps),
|
|
54
|
+
* returns the longest match per the PDF spec's greedy matching rule.
|
|
55
|
+
* Falls back to bytesPerCode if no range matches.
|
|
56
|
+
*/
|
|
57
|
+
getCodeLength(firstByte: number): number;
|
|
58
|
+
/**
|
|
59
|
+
* Check if this CMap has any mappings.
|
|
60
|
+
*/
|
|
61
|
+
get isEmpty(): boolean;
|
|
62
|
+
/**
|
|
63
|
+
* Check if this CMap has codespace ranges defined.
|
|
64
|
+
*/
|
|
65
|
+
get hasCodeSpaceRanges(): boolean;
|
|
66
|
+
}
|
|
67
|
+
/**
|
|
68
|
+
* Parse a CMap program (typically from a /ToUnicode stream).
|
|
69
|
+
*/
|
|
70
|
+
export declare function parseCMap(data: Uint8Array): CMap;
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF content stream interpreter for text extraction.
|
|
3
|
+
*
|
|
4
|
+
* Implements a full PDF graphics state machine that processes content stream
|
|
5
|
+
* operators to extract positioned text fragments. These fragments are then
|
|
6
|
+
* assembled into readable text by the text reconstruction module.
|
|
7
|
+
*
|
|
8
|
+
* Supported operator categories:
|
|
9
|
+
* - Text state: Tf, Tc, Tw, Tz, TL, Ts, Tr
|
|
10
|
+
* - Text positioning: Td, TD, Tm, T*
|
|
11
|
+
* - Text showing: Tj, TJ, ', "
|
|
12
|
+
* - Text objects: BT, ET
|
|
13
|
+
* - Graphics state: q, Q, cm, gs, i, M, ri, W, W*
|
|
14
|
+
* - Color: CS, cs, SC, sc, SCN, scn
|
|
15
|
+
* - Marked content: BDC, BMC, EMC, MP, DP
|
|
16
|
+
* - Type3 glyph: d0, d1
|
|
17
|
+
* - Shading: sh
|
|
18
|
+
* - Inline images: BI/ID/EI
|
|
19
|
+
* - XObject invocation: Do (for form XObjects containing text)
|
|
20
|
+
*
|
|
21
|
+
* @see PDF Reference 1.7, Chapter 5 - Text
|
|
22
|
+
* @see PDF Reference 1.7, Chapter 4 - Graphics
|
|
23
|
+
*/
|
|
24
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
25
|
+
import type { PdfDictValue } from "./pdf-parser.js";
|
|
26
|
+
/**
|
|
27
|
+
* A text fragment extracted from a PDF page.
|
|
28
|
+
* Contains the text string and its position in page coordinates.
|
|
29
|
+
*/
|
|
30
|
+
export interface TextFragment {
|
|
31
|
+
/** The extracted text */
|
|
32
|
+
text: string;
|
|
33
|
+
/** X position in page coordinates (points, origin = bottom-left) */
|
|
34
|
+
x: number;
|
|
35
|
+
/** Y position in page coordinates */
|
|
36
|
+
y: number;
|
|
37
|
+
/** Font size in points */
|
|
38
|
+
fontSize: number;
|
|
39
|
+
/** Font name */
|
|
40
|
+
fontName: string;
|
|
41
|
+
/** Width of the text in points */
|
|
42
|
+
width: number;
|
|
43
|
+
/** Character spacing */
|
|
44
|
+
charSpacing: number;
|
|
45
|
+
/** Word spacing */
|
|
46
|
+
wordSpacing: number;
|
|
47
|
+
/** Horizontal scaling factor (100 = normal) */
|
|
48
|
+
horizontalScaling: number;
|
|
49
|
+
/** Whether the text is vertical (WMode=1) */
|
|
50
|
+
isVertical: boolean;
|
|
51
|
+
/** Whether the text is right-to-left (Arabic, Hebrew, etc.) */
|
|
52
|
+
isRtl: boolean;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Extract text fragments from a page's content stream(s).
|
|
56
|
+
*/
|
|
57
|
+
export declare function extractTextFromPage(pageDict: PdfDictValue, doc: PdfDocument): TextFragment[];
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF font decoder for text extraction.
|
|
3
|
+
*
|
|
4
|
+
* Handles the mapping from character codes in content streams to Unicode
|
|
5
|
+
* strings. Supports all major PDF font types:
|
|
6
|
+
*
|
|
7
|
+
* - Type 1 fonts (standard 14 + custom with /Encoding)
|
|
8
|
+
* - TrueType fonts (with /Encoding and /ToUnicode)
|
|
9
|
+
* - Type 0 (CID) composite fonts (with /ToUnicode CMap)
|
|
10
|
+
* - Type 3 fonts (with /Encoding and /ToUnicode)
|
|
11
|
+
*
|
|
12
|
+
* @see PDF Reference 1.7, Chapter 5 - Text
|
|
13
|
+
* @see PDF Reference 1.7, §5.5 - Character Encoding
|
|
14
|
+
*/
|
|
15
|
+
import type { CMap } from "./cmap-parser.js";
|
|
16
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
17
|
+
import type { PdfDictValue } from "./pdf-parser.js";
|
|
18
|
+
/**
|
|
19
|
+
* A resolved font used for text extraction.
|
|
20
|
+
*/
|
|
21
|
+
export interface ResolvedFont {
|
|
22
|
+
/** Font name */
|
|
23
|
+
name: string;
|
|
24
|
+
/** Font subtype: Type1, TrueType, Type0, Type3, CIDFontType0, CIDFontType2, MMType1 */
|
|
25
|
+
subtype: string;
|
|
26
|
+
/** ToUnicode CMap (if available) */
|
|
27
|
+
toUnicode: CMap | null;
|
|
28
|
+
/** Encoding lookup: char code → unicode string */
|
|
29
|
+
encoding: Map<number, string>;
|
|
30
|
+
/** Number of bytes per character code (1 for simple fonts, 1-2 for CID fonts) */
|
|
31
|
+
bytesPerCode: number;
|
|
32
|
+
/** Base font name */
|
|
33
|
+
baseFontName: string;
|
|
34
|
+
/** Whether this is a symbolic font */
|
|
35
|
+
isSymbolic: boolean;
|
|
36
|
+
/** Character widths (code → width in thousandths of text space units) */
|
|
37
|
+
widths: Map<number, number>;
|
|
38
|
+
/** Default width */
|
|
39
|
+
defaultWidth: number;
|
|
40
|
+
/** Missing width for characters not in widths table */
|
|
41
|
+
missingWidth: number;
|
|
42
|
+
/** Whether the font uses Identity-H or Identity-V encoding (codes are Unicode code points) */
|
|
43
|
+
isIdentityEncoding: boolean;
|
|
44
|
+
/** Writing mode: 0 = horizontal, 1 = vertical */
|
|
45
|
+
wmode: number;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Resolve a PDF font dictionary into a ResolvedFont for text extraction.
|
|
49
|
+
*/
|
|
50
|
+
export declare function resolveFont(fontDict: PdfDictValue, doc: PdfDocument): ResolvedFont;
|
|
51
|
+
/**
|
|
52
|
+
* Decode character codes to Unicode text using a resolved font.
|
|
53
|
+
*/
|
|
54
|
+
export declare function decodeText(codes: Uint8Array, font: ResolvedFont): string;
|
|
55
|
+
/**
|
|
56
|
+
* Get the character width for a given code.
|
|
57
|
+
*/
|
|
58
|
+
export declare function getCharWidth(code: number, font: ResolvedFont): number;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF form field (AcroForm) extractor.
|
|
3
|
+
*
|
|
4
|
+
* Extracts interactive form fields from a PDF's `/AcroForm` dictionary.
|
|
5
|
+
* Supports all standard field types:
|
|
6
|
+
* - **Text** (`/Tx`) — Text input fields
|
|
7
|
+
* - **Button** (`/Btn`) — Checkboxes, radio buttons, push buttons
|
|
8
|
+
* - **Choice** (`/Ch`) — Dropdowns (combo boxes) and list boxes
|
|
9
|
+
* - **Signature** (`/Sig`) — Digital signature fields
|
|
10
|
+
*
|
|
11
|
+
* Handles field hierarchies (parent/child), inherited values, and default appearances.
|
|
12
|
+
*
|
|
13
|
+
* @see PDF Reference 1.7, §12.7 - Interactive Forms
|
|
14
|
+
*/
|
|
15
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
16
|
+
/** Type of form field. */
|
|
17
|
+
export type PdfFormFieldType = "text" | "checkbox" | "radio" | "dropdown" | "listbox" | "button" | "signature" | "unknown";
|
|
18
|
+
/** A single form field extracted from the PDF. */
|
|
19
|
+
export interface PdfFormField {
|
|
20
|
+
/** Fully qualified field name (e.g. "form1.address.city") */
|
|
21
|
+
name: string;
|
|
22
|
+
/** Field type */
|
|
23
|
+
type: PdfFormFieldType;
|
|
24
|
+
/** Current value of the field */
|
|
25
|
+
value: string;
|
|
26
|
+
/** Default value (/DV entry) */
|
|
27
|
+
defaultValue: string;
|
|
28
|
+
/** Whether the field is read-only */
|
|
29
|
+
readOnly: boolean;
|
|
30
|
+
/** Whether the field is required */
|
|
31
|
+
required: boolean;
|
|
32
|
+
/** For choice fields: the list of available options */
|
|
33
|
+
options: string[];
|
|
34
|
+
/** For checkboxes/radio buttons: the export value when checked */
|
|
35
|
+
exportValue: string;
|
|
36
|
+
/** Field flags (/Ff entry) — raw bit field */
|
|
37
|
+
flags: number;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Extract form fields from a PDF document.
|
|
41
|
+
*
|
|
42
|
+
* Reads the `/AcroForm` dictionary from the catalog and recursively
|
|
43
|
+
* traverses the field tree.
|
|
44
|
+
*
|
|
45
|
+
* @param doc - The PDF document
|
|
46
|
+
* @returns Array of extracted form fields
|
|
47
|
+
*/
|
|
48
|
+
export declare function extractFormFields(doc: PdfDocument): PdfFormField[];
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF image extraction.
|
|
3
|
+
*
|
|
4
|
+
* Extracts images from PDF pages including:
|
|
5
|
+
* - Inline images (BI/ID/EI operators)
|
|
6
|
+
* - XObject images (/Subtype /Image)
|
|
7
|
+
* - Images with various color spaces and filters
|
|
8
|
+
*
|
|
9
|
+
* Supported image formats:
|
|
10
|
+
* - JPEG (DCTDecode) — extracted as-is
|
|
11
|
+
* - JPEG2000 (JPXDecode) — extracted as-is
|
|
12
|
+
* - Raw/Flate-compressed pixel data — extracted with metadata
|
|
13
|
+
* - CCITT fax — extracted as-is
|
|
14
|
+
*
|
|
15
|
+
* @see PDF Reference 1.7, §4.8 - Images
|
|
16
|
+
*/
|
|
17
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
18
|
+
import type { PdfDictValue } from "./pdf-parser.js";
|
|
19
|
+
/**
|
|
20
|
+
* An extracted image from a PDF page.
|
|
21
|
+
*/
|
|
22
|
+
export interface ExtractedImage {
|
|
23
|
+
/** Image index within the page (0-based) */
|
|
24
|
+
index: number;
|
|
25
|
+
/** Image width in pixels */
|
|
26
|
+
width: number;
|
|
27
|
+
/** Image height in pixels */
|
|
28
|
+
height: number;
|
|
29
|
+
/** Bits per component */
|
|
30
|
+
bitsPerComponent: number;
|
|
31
|
+
/** Color space name */
|
|
32
|
+
colorSpace: string;
|
|
33
|
+
/** Number of color components (1=gray, 3=RGB, 4=CMYK) */
|
|
34
|
+
components: number;
|
|
35
|
+
/**
|
|
36
|
+
* Image data format:
|
|
37
|
+
* - "jpeg" — raw JPEG data (can be written directly as .jpg)
|
|
38
|
+
* - "jpx" — JPEG 2000 data
|
|
39
|
+
* - "raw" — raw pixel data (RGB/CMYK/Gray, decompressed)
|
|
40
|
+
* - "ccitt" — CCITT fax compressed data
|
|
41
|
+
*/
|
|
42
|
+
format: "jpeg" | "jpx" | "raw" | "ccitt" | "jbig2";
|
|
43
|
+
/** The image data */
|
|
44
|
+
data: Uint8Array;
|
|
45
|
+
/** Alpha mask data (if present) — same dimensions, 1 component, 8 bits */
|
|
46
|
+
alphaMask: Uint8Array | null;
|
|
47
|
+
/** Filter name from the original stream */
|
|
48
|
+
filter: string;
|
|
49
|
+
/** XObject name (if it was a named XObject) */
|
|
50
|
+
name: string;
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Extract all images from a PDF page.
|
|
54
|
+
*/
|
|
55
|
+
export declare function extractImagesFromPage(pageDict: PdfDictValue, doc: PdfDocument): ExtractedImage[];
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF metadata reader.
|
|
3
|
+
*
|
|
4
|
+
* Extracts document metadata from:
|
|
5
|
+
* 1. Info Dictionary (traditional metadata)
|
|
6
|
+
* - Title, Author, Subject, Keywords, Creator, Producer
|
|
7
|
+
* - CreationDate, ModDate
|
|
8
|
+
*
|
|
9
|
+
* 2. XMP Metadata Stream (XML-based, more comprehensive)
|
|
10
|
+
* - All of the above plus:
|
|
11
|
+
* - Dublin Core metadata, custom properties
|
|
12
|
+
*
|
|
13
|
+
* @see PDF Reference 1.7, §10.2 - Metadata
|
|
14
|
+
* @see XMP Specification Part 1
|
|
15
|
+
*/
|
|
16
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
17
|
+
/**
|
|
18
|
+
* PDF document metadata.
|
|
19
|
+
*/
|
|
20
|
+
export interface PdfMetadata {
|
|
21
|
+
/** Document title */
|
|
22
|
+
title: string;
|
|
23
|
+
/** Document author */
|
|
24
|
+
author: string;
|
|
25
|
+
/** Document subject */
|
|
26
|
+
subject: string;
|
|
27
|
+
/** Document keywords */
|
|
28
|
+
keywords: string;
|
|
29
|
+
/** Application that created the original document */
|
|
30
|
+
creator: string;
|
|
31
|
+
/** Application that produced the PDF */
|
|
32
|
+
producer: string;
|
|
33
|
+
/** Date the document was created */
|
|
34
|
+
creationDate: Date | null;
|
|
35
|
+
/** Date the document was last modified */
|
|
36
|
+
modDate: Date | null;
|
|
37
|
+
/** PDF version */
|
|
38
|
+
pdfVersion: string;
|
|
39
|
+
/** Number of pages */
|
|
40
|
+
pageCount: number;
|
|
41
|
+
/** Whether the document is encrypted */
|
|
42
|
+
encrypted: boolean;
|
|
43
|
+
/** Page size of the first page (in points) */
|
|
44
|
+
pageSize: {
|
|
45
|
+
width: number;
|
|
46
|
+
height: number;
|
|
47
|
+
} | null;
|
|
48
|
+
/** Raw XMP metadata XML (if available) */
|
|
49
|
+
xmpXml: string | null;
|
|
50
|
+
/** Additional custom metadata from Info dictionary */
|
|
51
|
+
custom: Record<string, string>;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Extract metadata from a PDF document.
|
|
55
|
+
*/
|
|
56
|
+
export declare function extractMetadata(doc: PdfDocument): PdfMetadata;
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF decryption for reading encrypted PDFs.
|
|
3
|
+
*
|
|
4
|
+
* Supports:
|
|
5
|
+
* - Standard Security Handler (V1/V2/V4/V5, R2/R3/R4/R5)
|
|
6
|
+
* - RC4 encryption (40-bit and 128-bit)
|
|
7
|
+
* - AES-128 encryption (PDF 1.6+)
|
|
8
|
+
* - AES-256 encryption (PDF 2.0, V=5, R=5)
|
|
9
|
+
*
|
|
10
|
+
* @see PDF Reference 1.7, §3.5 - Encryption
|
|
11
|
+
* @see PDF 2.0 (ISO 32000-2), §7.6 - Encryption
|
|
12
|
+
*/
|
|
13
|
+
import type { PdfDocument } from "./pdf-document.js";
|
|
14
|
+
/**
|
|
15
|
+
* Initialize decryption for a PDF document.
|
|
16
|
+
* Returns true if decryption was successfully initialized, false if
|
|
17
|
+
* the password was incorrect.
|
|
18
|
+
*
|
|
19
|
+
* @param doc - The PDF document
|
|
20
|
+
* @param password - User or owner password (empty string for no password)
|
|
21
|
+
*/
|
|
22
|
+
export declare function initDecryption(doc: PdfDocument, password?: string): boolean;
|
|
23
|
+
/**
|
|
24
|
+
* Check if the document is encrypted.
|
|
25
|
+
*/
|
|
26
|
+
export declare function isEncrypted(doc: PdfDocument): boolean;
|