@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -60,6 +60,8 @@ export declare class ZipDeflateFile {
60
60
  private _dataQueue;
61
61
  private _finalQueued;
62
62
  private _pushChain;
63
+ private _inputBuf;
64
+ private _inputPos;
63
65
  private _syncDeflater;
64
66
  private _syncZlibReady;
65
67
  readonly name: string;
@@ -159,6 +161,8 @@ export declare class ZipDeflateFile {
159
161
  * memory growth when callers push data in a tight synchronous loop.
160
162
  */
161
163
  push(data: Uint8Array, final?: boolean, callback?: (err?: Error | null) => void): Promise<void>;
164
+ /** Enqueue an async push through the _pushChain serialization. */
165
+ private _pushAsync;
162
166
  /**
163
167
  * Synchronous push path — compresses and emits data without any Promises.
164
168
  *
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Shared cryptographic primitives for PDF encryption/decryption.
3
+ *
4
+ * Zero-dependency, pure JavaScript implementations of:
5
+ * - AES (128/256-bit) CBC encrypt and decrypt
6
+ * - SHA-256
7
+ * - MD5
8
+ * - RC4 (for reading legacy PDFs only)
9
+ *
10
+ * @see FIPS 197 — AES
11
+ * @see FIPS 180-4 — SHA-256
12
+ * @see RFC 1321 — MD5
13
+ */
14
+ /**
15
+ * AES-CBC encryption with PKCS#7 padding.
16
+ * Supports AES-128 (16-byte key) and AES-256 (32-byte key).
17
+ */
18
+ export declare function aesCbcEncrypt(plaintext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
19
+ /**
20
+ * AES-CBC decryption with PKCS#7 padding removal.
21
+ * Supports AES-128 (16-byte key) and AES-256 (32-byte key).
22
+ */
23
+ export declare function aesCbcDecrypt(ciphertext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
24
+ /**
25
+ * AES-CBC decryption WITHOUT PKCS#7 padding removal.
26
+ * Used for key derivation in V=5 where the output length is known.
27
+ */
28
+ export declare function aesCbcDecryptRaw(ciphertext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
29
+ /**
30
+ * AES-CBC encryption WITHOUT PKCS#7 padding.
31
+ * Used when the plaintext is already block-aligned (e.g., encrypting
32
+ * the 32-byte file encryption key in V=5).
33
+ *
34
+ * @throws if plaintext length is not a multiple of 16.
35
+ */
36
+ export declare function aesCbcEncryptRaw(plaintext: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array;
37
+ /**
38
+ * AES-ECB encryption of a single 16-byte block (no padding, no IV).
39
+ * Used for the /Perms value in V=5 encryption.
40
+ */
41
+ export declare function aesEcbEncrypt(block: Uint8Array, key: Uint8Array): Uint8Array;
42
+ /**
43
+ * SHA-256 hash function.
44
+ * @returns 32-byte digest
45
+ */
46
+ export declare function sha256(input: Uint8Array): Uint8Array;
47
+ /**
48
+ * MD5 hash function (RFC 1321).
49
+ * @returns 16-byte digest
50
+ */
51
+ export declare function md5(input: Uint8Array): Uint8Array;
52
+ /**
53
+ * RC4 stream cipher.
54
+ * @deprecated Only used for reading legacy encrypted PDFs. Writer uses AES-256.
55
+ */
56
+ export declare function rc4(key: Uint8Array, data: Uint8Array): Uint8Array;
57
+ /**
58
+ * Generate pseudo-random bytes.
59
+ * Uses Math.random — adequate for PDF IVs but not cryptographically secure.
60
+ */
61
+ export declare function randomBytes(length: number): Uint8Array;
62
+ /**
63
+ * Concatenate multiple Uint8Arrays.
64
+ */
65
+ export declare function concatArrays(...arrays: Uint8Array[]): Uint8Array;
@@ -1,13 +1,17 @@
1
1
  /**
2
- * PDF encryption support (Standard Security Handler, Revision 3).
2
+ * PDF encryption support (Standard Security Handler, V=5, R=5).
3
3
  *
4
- * Implements RC4-128 encryption compatible with PDF 1.4.
4
+ * Implements AES-256 encryption compatible with PDF 2.0 (ISO 32000-2:2020).
5
5
  * Supports:
6
6
  * - User password (required to open the document)
7
7
  * - Owner password (grants full access)
8
8
  * - Permission flags (print, copy, modify, etc.)
9
9
  *
10
- * @see PDF Reference 1.7, §3.5 - Encryption
10
+ * The file encryption key (FEK) is a random 256-bit key.
11
+ * All streams and strings are encrypted using AES-256-CBC with a random
12
+ * 16-byte IV prepended to each encrypted value.
13
+ *
14
+ * @see ISO 32000-2:2020, §7.6 — Encryption
11
15
  */
12
16
  /**
13
17
  * PDF encryption options.
@@ -43,35 +47,34 @@ export interface PdfPermissions {
43
47
  printHighQuality: boolean;
44
48
  }
45
49
  /**
46
- * Encryption state used during PDF generation.
50
+ * Encryption state used during PDF generation (V=5, R=5, AES-256).
47
51
  */
48
52
  export interface EncryptionState {
49
- /** Encryption key (variable length, up to 16 bytes) */
53
+ /** 32-byte file encryption key */
50
54
  encryptionKey: Uint8Array;
51
- /** O value (32 bytes) for the Encrypt dictionary */
55
+ /** 48-byte O value: hash(32) + validation salt(8) + key salt(8) */
52
56
  oValue: Uint8Array;
53
- /** U value (32 bytes) for the Encrypt dictionary */
57
+ /** 48-byte U value: hash(32) + validation salt(8) + key salt(8) */
54
58
  uValue: Uint8Array;
59
+ /** 32-byte encrypted owner key (OE) */
60
+ oeValue: Uint8Array;
61
+ /** 32-byte encrypted user key (UE) */
62
+ ueValue: Uint8Array;
63
+ /** 16-byte encrypted permissions (Perms) */
64
+ permsValue: Uint8Array;
55
65
  /** Permissions integer (P value) */
56
66
  permissions: number;
57
67
  /** File identifier (16 bytes) */
58
68
  fileId: Uint8Array;
59
69
  }
60
70
  /**
61
- * Initialize encryption state from the given options.
71
+ * Initialize encryption state for AES-256 (V=5, R=5).
62
72
  */
63
73
  export declare function initEncryption(options: PdfEncryptionOptions): EncryptionState;
64
74
  /**
65
- * Encrypt a string or stream for a specific PDF object.
66
- * Per-object encryption key = MD5(encryptionKey + objectNumber + generation).
67
- */
68
- export declare function encryptData(data: Uint8Array, objectNumber: number, generation: number, encryptionKey: Uint8Array): Uint8Array;
69
- /**
70
- * RC4 stream cipher implementation.
71
- */
72
- export declare function rc4(key: Uint8Array, data: Uint8Array): Uint8Array;
73
- /**
74
- * MD5 hash implementation (RFC 1321).
75
- * Returns 16-byte digest.
75
+ * Encrypt data for a PDF object using AES-256-CBC.
76
+ *
77
+ * For V=5/R=5, the file encryption key is used directly (no per-object key derivation).
78
+ * A random 16-byte IV is prepended to the ciphertext.
76
79
  */
77
- export declare function md5(input: Uint8Array): Uint8Array;
80
+ export declare function encryptData(data: Uint8Array, _objectNumber: number, _generation: number, encryptionKey: Uint8Array): Uint8Array;
@@ -1,20 +1,22 @@
1
1
  /**
2
2
  * PDF file writer.
3
3
  *
4
- * Assembles a complete PDF document from indirect objects.
4
+ * Assembles a complete PDF 2.0 document from indirect objects.
5
5
  * Handles the four sections of a PDF file:
6
- * 1. Header (%PDF-1.4)
6
+ * 1. Header (%PDF-2.0)
7
7
  * 2. Body (indirect objects)
8
8
  * 3. Cross-reference table
9
9
  * 4. Trailer (with document catalog reference)
10
10
  *
11
- * @see PDF Reference 1.7, Chapter 3.4 - File Structure
11
+ * Encryption uses AES-256 (V=5, R=5) per ISO 32000-2:2020.
12
+ *
13
+ * @see ISO 32000-2:2020, Chapter 7.5 — File Structure
12
14
  */
13
15
  import { PdfDict } from "./pdf-object.js";
14
16
  import type { PdfContentStream } from "./pdf-stream.js";
15
17
  import type { EncryptionState } from "./encryption.js";
16
18
  /**
17
- * Constructs a valid PDF 1.4 file from a set of indirect objects.
19
+ * Constructs a valid PDF 2.0 file from a set of indirect objects.
18
20
  *
19
21
  * Usage:
20
22
  * 1. Allocate object numbers with allocObject()
@@ -1,9 +1,9 @@
1
1
  /**
2
2
  * PDF module for excelts.
3
3
  *
4
- * A full-featured, zero-dependency PDF engine.
4
+ * A full-featured, zero-dependency PDF engine for both writing and reading.
5
5
  *
6
- * @example Standalone:
6
+ * @example Standalone PDF generation:
7
7
  * ```typescript
8
8
  * import { pdf } from "excelts/pdf";
9
9
  *
@@ -25,13 +25,34 @@
25
25
  * const bytes = excelToPdf(workbook);
26
26
  * ```
27
27
  *
28
+ * @example Read PDF — extract text, images, and metadata:
29
+ * ```typescript
30
+ * import { readPdf } from "excelts/pdf";
31
+ *
32
+ * const result = readPdf(pdfBytes);
33
+ * console.log(result.text); // All text
34
+ * console.log(result.pages[0].text); // Page 1 text
35
+ * console.log(result.pages[0].images); // Page 1 images
36
+ * console.log(result.pages[0].annotations); // Page 1 annotations
37
+ * console.log(result.metadata.title); // Document title
38
+ * console.log(result.formFields); // Form fields
39
+ * ```
40
+ *
28
41
  * @module pdf
29
42
  */
30
43
  /** Standalone PDF generation — accepts plain arrays, sheet objects, or workbooks. */
31
44
  export { pdf } from "./pdf.js";
32
45
  /** Excel-to-PDF conversion — accepts an Excel Workbook instance. */
33
46
  export { excelToPdf } from "./excel-bridge.js";
47
+ /** Read a PDF file and extract text, images, and metadata. */
48
+ export { readPdf } from "./reader/pdf-reader.js";
34
49
  export type { PdfCell, PdfRow, PdfColumn, PdfSheet, PdfBook, PdfImage } from "./pdf.js";
35
50
  export type { PdfExportOptions, PdfOrientation, PdfPageSize, PdfMargins, PdfColor, PageSizeName } from "./types.js";
36
51
  export { PageSizes } from "./types.js";
52
+ export type { ReadPdfOptions, ReadPdfResult, ReadPdfPage } from "./reader/pdf-reader.js";
53
+ export type { PdfMetadata } from "./reader/metadata-reader.js";
54
+ export type { ExtractedImage } from "./reader/image-extractor.js";
55
+ export type { TextLine } from "./reader/text-reconstruction.js";
56
+ export type { PdfAnnotation, PdfRect } from "./reader/annotation-extractor.js";
57
+ export type { PdfFormField, PdfFormFieldType } from "./reader/form-extractor.js";
37
58
  export { PdfError, PdfRenderError, PdfFontError, PdfStructureError, isPdfError } from "./errors.js";
@@ -0,0 +1,63 @@
1
+ /**
2
+ * PDF annotation extractor.
3
+ *
4
+ * Extracts annotations from a PDF page's `/Annots` array.
5
+ * Supports all standard annotation subtypes defined in ISO 32000-1:2008 (PDF 1.7), §12.5.
6
+ *
7
+ * Common annotation types:
8
+ * - **Link** — Hyperlinks (URI, GoTo, GoToR)
9
+ * - **Text** — Sticky notes / comments
10
+ * - **FreeText** — Inline text annotations
11
+ * - **Highlight / Underline / StrikeOut / Squiggly** — Text markup
12
+ * - **Stamp** — Rubber stamp annotations
13
+ * - **Popup** — Associated popup windows
14
+ * - **Widget** — Form field widgets (handled separately by form-extractor)
15
+ *
16
+ * @see ISO 32000-1:2008 (PDF 1.7), §12.5 - Annotations
17
+ */
18
+ import type { PdfDocument } from "./pdf-document.js";
19
+ import type { PdfDictValue } from "./pdf-parser.js";
20
+ /** Rectangle in PDF coordinate space [x1, y1, x2, y2] */
21
+ export interface PdfRect {
22
+ /** Left edge (x1) */
23
+ x1: number;
24
+ /** Bottom edge (y1) */
25
+ y1: number;
26
+ /** Right edge (x2) */
27
+ x2: number;
28
+ /** Top edge (y2) */
29
+ y2: number;
30
+ }
31
+ /** A PDF annotation extracted from a page. */
32
+ export interface PdfAnnotation {
33
+ /** Annotation subtype (e.g. "Link", "Text", "Highlight", "FreeText", "Stamp") */
34
+ subtype: string;
35
+ /** Bounding rectangle in page coordinates (points) */
36
+ rect: PdfRect;
37
+ /** Text content (/Contents entry) */
38
+ contents: string;
39
+ /** Author / title (/T entry) */
40
+ author: string;
41
+ /** Subject (/Subj entry) */
42
+ subject: string;
43
+ /** Modification date (/M entry) — raw PDF date string */
44
+ modifiedDate: string;
45
+ /** For Link annotations: the destination URI */
46
+ uri: string;
47
+ /** For Link annotations: named destination */
48
+ destination: string;
49
+ /** Annotation flags (/F entry) */
50
+ flags: number;
51
+ /** Color (/C entry) — array of 0-3 values in [0,1] */
52
+ color: number[];
53
+ }
54
+ /**
55
+ * Extract annotations from a PDF page.
56
+ *
57
+ * Skips Widget annotations (form fields) — those are handled by the form extractor.
58
+ *
59
+ * @param pageDict - The page dictionary
60
+ * @param doc - The PDF document for resolving references
61
+ * @returns Array of extracted annotations
62
+ */
63
+ export declare function extractAnnotationsFromPage(pageDict: PdfDictValue, doc: PdfDocument): PdfAnnotation[];
@@ -0,0 +1,70 @@
1
+ /**
2
+ * CMap parser for PDF text extraction.
3
+ *
4
+ * Parses /ToUnicode CMap programs to build character code → Unicode mappings.
5
+ * This is essential for extracting text from PDFs that use CIDFonts or
6
+ * custom encodings.
7
+ *
8
+ * Supports:
9
+ * - beginbfchar / endbfchar (single character mappings)
10
+ * - beginbfrange / endbfrange (range mappings, including array form)
11
+ * - begincodespacerange / endcodespacerange
12
+ * - Multi-byte character codes (1-4 bytes)
13
+ * - UTF-16BE encoded target strings (including surrogate pairs)
14
+ *
15
+ * @see PDF Reference 1.7, §5.9 - ToUnicode CMaps
16
+ * @see Adobe Technical Note #5411 - CMap Resources
17
+ */
18
+ /**
19
+ * A parsed CMap that maps character codes to Unicode strings.
20
+ */
21
+ export declare class CMap {
22
+ private codeSpaceRanges;
23
+ private bfChars;
24
+ private bfRanges;
25
+ /** Number of bytes per character code (detected from codespace ranges) */
26
+ bytesPerCode: number;
27
+ constructor();
28
+ /**
29
+ * Look up the Unicode string for a character code.
30
+ * Uses binary search over sorted bfRanges for efficient lookup.
31
+ */
32
+ lookup(code: number): string | undefined;
33
+ /**
34
+ * Add a code space range.
35
+ */
36
+ addCodeSpaceRange(low: number, high: number, bytes: number): void;
37
+ /**
38
+ * Add a bfchar mapping.
39
+ */
40
+ addBfChar(code: number, unicode: string): void;
41
+ /**
42
+ * Add a bfrange mapping.
43
+ */
44
+ addBfRange(low: number, high: number, mapping: string | string[]): void;
45
+ /**
46
+ * Sort bfRanges by low value for binary search.
47
+ * Should be called after all ranges have been added.
48
+ */
49
+ sortRanges(): void;
50
+ /**
51
+ * Determine the code length (in bytes) for a given first byte,
52
+ * using the codespace ranges. When multiple ranges match (e.g. a 1-byte
53
+ * range covering 0x00-0xFF and a 2-byte range whose first byte overlaps),
54
+ * returns the longest match per the PDF spec's greedy matching rule.
55
+ * Falls back to bytesPerCode if no range matches.
56
+ */
57
+ getCodeLength(firstByte: number): number;
58
+ /**
59
+ * Check if this CMap has any mappings.
60
+ */
61
+ get isEmpty(): boolean;
62
+ /**
63
+ * Check if this CMap has codespace ranges defined.
64
+ */
65
+ get hasCodeSpaceRanges(): boolean;
66
+ }
67
+ /**
68
+ * Parse a CMap program (typically from a /ToUnicode stream).
69
+ */
70
+ export declare function parseCMap(data: Uint8Array): CMap;
@@ -0,0 +1,57 @@
1
+ /**
2
+ * PDF content stream interpreter for text extraction.
3
+ *
4
+ * Implements a full PDF graphics state machine that processes content stream
5
+ * operators to extract positioned text fragments. These fragments are then
6
+ * assembled into readable text by the text reconstruction module.
7
+ *
8
+ * Supported operator categories:
9
+ * - Text state: Tf, Tc, Tw, Tz, TL, Ts, Tr
10
+ * - Text positioning: Td, TD, Tm, T*
11
+ * - Text showing: Tj, TJ, ', "
12
+ * - Text objects: BT, ET
13
+ * - Graphics state: q, Q, cm, gs, i, M, ri, W, W*
14
+ * - Color: CS, cs, SC, sc, SCN, scn
15
+ * - Marked content: BDC, BMC, EMC, MP, DP
16
+ * - Type3 glyph: d0, d1
17
+ * - Shading: sh
18
+ * - Inline images: BI/ID/EI
19
+ * - XObject invocation: Do (for form XObjects containing text)
20
+ *
21
+ * @see PDF Reference 1.7, Chapter 5 - Text
22
+ * @see PDF Reference 1.7, Chapter 4 - Graphics
23
+ */
24
+ import type { PdfDocument } from "./pdf-document.js";
25
+ import type { PdfDictValue } from "./pdf-parser.js";
26
+ /**
27
+ * A text fragment extracted from a PDF page.
28
+ * Contains the text string and its position in page coordinates.
29
+ */
30
+ export interface TextFragment {
31
+ /** The extracted text */
32
+ text: string;
33
+ /** X position in page coordinates (points, origin = bottom-left) */
34
+ x: number;
35
+ /** Y position in page coordinates */
36
+ y: number;
37
+ /** Font size in points */
38
+ fontSize: number;
39
+ /** Font name */
40
+ fontName: string;
41
+ /** Width of the text in points */
42
+ width: number;
43
+ /** Character spacing */
44
+ charSpacing: number;
45
+ /** Word spacing */
46
+ wordSpacing: number;
47
+ /** Horizontal scaling factor (100 = normal) */
48
+ horizontalScaling: number;
49
+ /** Whether the text is vertical (WMode=1) */
50
+ isVertical: boolean;
51
+ /** Whether the text is right-to-left (Arabic, Hebrew, etc.) */
52
+ isRtl: boolean;
53
+ }
54
+ /**
55
+ * Extract text fragments from a page's content stream(s).
56
+ */
57
+ export declare function extractTextFromPage(pageDict: PdfDictValue, doc: PdfDocument): TextFragment[];
@@ -0,0 +1,58 @@
1
+ /**
2
+ * PDF font decoder for text extraction.
3
+ *
4
+ * Handles the mapping from character codes in content streams to Unicode
5
+ * strings. Supports all major PDF font types:
6
+ *
7
+ * - Type 1 fonts (standard 14 + custom with /Encoding)
8
+ * - TrueType fonts (with /Encoding and /ToUnicode)
9
+ * - Type 0 (CID) composite fonts (with /ToUnicode CMap)
10
+ * - Type 3 fonts (with /Encoding and /ToUnicode)
11
+ *
12
+ * @see PDF Reference 1.7, Chapter 5 - Text
13
+ * @see PDF Reference 1.7, §5.5 - Character Encoding
14
+ */
15
+ import type { CMap } from "./cmap-parser.js";
16
+ import type { PdfDocument } from "./pdf-document.js";
17
+ import type { PdfDictValue } from "./pdf-parser.js";
18
+ /**
19
+ * A resolved font used for text extraction.
20
+ */
21
+ export interface ResolvedFont {
22
+ /** Font name */
23
+ name: string;
24
+ /** Font subtype: Type1, TrueType, Type0, Type3, CIDFontType0, CIDFontType2, MMType1 */
25
+ subtype: string;
26
+ /** ToUnicode CMap (if available) */
27
+ toUnicode: CMap | null;
28
+ /** Encoding lookup: char code → unicode string */
29
+ encoding: Map<number, string>;
30
+ /** Number of bytes per character code (1 for simple fonts, 1-2 for CID fonts) */
31
+ bytesPerCode: number;
32
+ /** Base font name */
33
+ baseFontName: string;
34
+ /** Whether this is a symbolic font */
35
+ isSymbolic: boolean;
36
+ /** Character widths (code → width in thousandths of text space units) */
37
+ widths: Map<number, number>;
38
+ /** Default width */
39
+ defaultWidth: number;
40
+ /** Missing width for characters not in widths table */
41
+ missingWidth: number;
42
+ /** Whether the font uses Identity-H or Identity-V encoding (codes are Unicode code points) */
43
+ isIdentityEncoding: boolean;
44
+ /** Writing mode: 0 = horizontal, 1 = vertical */
45
+ wmode: number;
46
+ }
47
+ /**
48
+ * Resolve a PDF font dictionary into a ResolvedFont for text extraction.
49
+ */
50
+ export declare function resolveFont(fontDict: PdfDictValue, doc: PdfDocument): ResolvedFont;
51
+ /**
52
+ * Decode character codes to Unicode text using a resolved font.
53
+ */
54
+ export declare function decodeText(codes: Uint8Array, font: ResolvedFont): string;
55
+ /**
56
+ * Get the character width for a given code.
57
+ */
58
+ export declare function getCharWidth(code: number, font: ResolvedFont): number;
@@ -0,0 +1,48 @@
1
+ /**
2
+ * PDF form field (AcroForm) extractor.
3
+ *
4
+ * Extracts interactive form fields from a PDF's `/AcroForm` dictionary.
5
+ * Supports all standard field types:
6
+ * - **Text** (`/Tx`) — Text input fields
7
+ * - **Button** (`/Btn`) — Checkboxes, radio buttons, push buttons
8
+ * - **Choice** (`/Ch`) — Dropdowns (combo boxes) and list boxes
9
+ * - **Signature** (`/Sig`) — Digital signature fields
10
+ *
11
+ * Handles field hierarchies (parent/child), inherited values, and default appearances.
12
+ *
13
+ * @see ISO 32000-1:2008 (PDF 1.7), §12.7 - Interactive Forms
14
+ */
15
+ import type { PdfDocument } from "./pdf-document.js";
16
+ /** Type of form field. */
17
+ export type PdfFormFieldType = "text" | "checkbox" | "radio" | "dropdown" | "listbox" | "button" | "signature" | "unknown";
18
+ /** A single form field extracted from the PDF. */
19
+ export interface PdfFormField {
20
+ /** Fully qualified field name (e.g. "form1.address.city") */
21
+ name: string;
22
+ /** Field type */
23
+ type: PdfFormFieldType;
24
+ /** Current value of the field */
25
+ value: string;
26
+ /** Default value (/DV entry) */
27
+ defaultValue: string;
28
+ /** Whether the field is read-only */
29
+ readOnly: boolean;
30
+ /** Whether the field is required */
31
+ required: boolean;
32
+ /** For choice fields: the list of available options */
33
+ options: string[];
34
+ /** For checkboxes/radio buttons: the export value when checked */
35
+ exportValue: string;
36
+ /** Field flags (/Ff entry) — raw bit field */
37
+ flags: number;
38
+ }
39
+ /**
40
+ * Extract form fields from a PDF document.
41
+ *
42
+ * Reads the `/AcroForm` dictionary from the catalog and recursively
43
+ * traverses the field tree.
44
+ *
45
+ * @param doc - The PDF document
46
+ * @returns Array of extracted form fields
47
+ */
48
+ export declare function extractFormFields(doc: PdfDocument): PdfFormField[];
@@ -0,0 +1,55 @@
1
+ /**
2
+ * PDF image extraction.
3
+ *
4
+ * Extracts images from PDF pages including:
5
+ * - Inline images (BI/ID/EI operators)
6
+ * - XObject images (/Subtype /Image)
7
+ * - Images with various color spaces and filters
8
+ *
9
+ * Supported image formats:
10
+ * - JPEG (DCTDecode) — extracted as-is
11
+ * - JPEG2000 (JPXDecode) — extracted as-is
12
+ * - Raw/Flate-compressed pixel data — extracted with metadata
13
+ * - CCITT fax — extracted as-is
14
+ *
15
+ * @see PDF Reference 1.7, §4.8 - Images
16
+ */
17
+ import type { PdfDocument } from "./pdf-document.js";
18
+ import type { PdfDictValue } from "./pdf-parser.js";
19
+ /**
20
+ * An extracted image from a PDF page.
21
+ */
22
+ export interface ExtractedImage {
23
+ /** Image index within the page (0-based) */
24
+ index: number;
25
+ /** Image width in pixels */
26
+ width: number;
27
+ /** Image height in pixels */
28
+ height: number;
29
+ /** Bits per component */
30
+ bitsPerComponent: number;
31
+ /** Color space name */
32
+ colorSpace: string;
33
+ /** Number of color components (1=gray, 3=RGB, 4=CMYK) */
34
+ components: number;
35
+ /**
36
+ * Image data format:
37
+ * - "jpeg" — raw JPEG data (can be written directly as .jpg)
38
+ * - "jpx" — JPEG 2000 data
39
+ * - "raw" — raw pixel data (RGB/CMYK/Gray, decompressed)
40
+ * - "ccitt" / "jbig2" — CCITT fax / JBIG2 compressed data
41
+ */
42
+ format: "jpeg" | "jpx" | "raw" | "ccitt" | "jbig2";
43
+ /** The image data */
44
+ data: Uint8Array;
45
+ /** Alpha mask data (if present) — same dimensions, 1 component, 8 bits */
46
+ alphaMask: Uint8Array | null;
47
+ /** Filter name from the original stream */
48
+ filter: string;
49
+ /** XObject name (if it was a named XObject) */
50
+ name: string;
51
+ }
52
+ /**
53
+ * Extract all images from a PDF page.
54
+ */
55
+ export declare function extractImagesFromPage(pageDict: PdfDictValue, doc: PdfDocument): ExtractedImage[];
@@ -0,0 +1,56 @@
1
+ /**
2
+ * PDF metadata reader.
3
+ *
4
+ * Extracts document metadata from:
5
+ * 1. Info Dictionary (traditional metadata)
6
+ * - Title, Author, Subject, Keywords, Creator, Producer
7
+ * - CreationDate, ModDate
8
+ *
9
+ * 2. XMP Metadata Stream (XML-based, more comprehensive)
10
+ * - All of the above plus:
11
+ * - Dublin Core metadata, custom properties
12
+ *
13
+ * @see PDF Reference 1.7, §10.2 - Metadata
14
+ * @see XMP Specification Part 1
15
+ */
16
+ import type { PdfDocument } from "./pdf-document.js";
17
+ /**
18
+ * PDF document metadata.
19
+ */
20
+ export interface PdfMetadata {
21
+ /** Document title */
22
+ title: string;
23
+ /** Document author */
24
+ author: string;
25
+ /** Document subject */
26
+ subject: string;
27
+ /** Document keywords */
28
+ keywords: string;
29
+ /** Application that created the original document */
30
+ creator: string;
31
+ /** Application that produced the PDF */
32
+ producer: string;
33
+ /** Date the document was created */
34
+ creationDate: Date | null;
35
+ /** Date the document was last modified */
36
+ modDate: Date | null;
37
+ /** PDF version */
38
+ pdfVersion: string;
39
+ /** Number of pages */
40
+ pageCount: number;
41
+ /** Whether the document is encrypted */
42
+ encrypted: boolean;
43
+ /** Page size of the first page (in points) */
44
+ pageSize: {
45
+ width: number;
46
+ height: number;
47
+ } | null;
48
+ /** Raw XMP metadata XML (if available) */
49
+ xmpXml: string | null;
50
+ /** Additional custom metadata from Info dictionary */
51
+ custom: Record<string, string>;
52
+ }
53
+ /**
54
+ * Extract metadata from a PDF document.
55
+ */
56
+ export declare function extractMetadata(doc: PdfDocument): PdfMetadata;
@@ -0,0 +1,26 @@
1
+ /**
2
+ * PDF decryption for reading encrypted PDFs.
3
+ *
4
+ * Supports:
5
+ * - Standard Security Handler (V1/V2/V4/V5, R2/R3/R4/R5)
6
+ * - RC4 encryption (40-bit and 128-bit)
7
+ * - AES-128 encryption (PDF 1.6+)
8
+ * - AES-256 encryption (PDF 2.0, V=5, R=5)
9
+ *
10
+ * @see PDF Reference 1.7, §3.5 - Encryption
11
+ * @see PDF 2.0 (ISO 32000-2), §7.6 - Encryption
12
+ */
13
+ import type { PdfDocument } from "./pdf-document.js";
14
+ /**
15
+ * Initialize decryption for a PDF document.
16
+ * Returns true if decryption was successfully initialized, false if
17
+ * the password was incorrect.
18
+ *
19
+ * @param doc - The PDF document
20
+ * @param password - User or owner password (empty string for no password)
21
+ */
22
+ export declare function initDecryption(doc: PdfDocument, password?: string): boolean;
23
+ /**
24
+ * Check if the document is encrypted.
25
+ */
26
+ export declare function isEncrypted(doc: PdfDocument): boolean;