@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,301 @@
1
+ "use strict";
2
+ /**
3
+ * PDF object parser.
4
+ *
5
+ * Parses PDF tokens into typed PDF objects: dictionaries, arrays, strings,
6
+ * numbers, booleans, names, null, indirect references, and streams.
7
+ *
8
+ * @see PDF Reference 1.7, Chapter 3 - Objects
9
+ */
10
+ Object.defineProperty(exports, "__esModule", { value: true });
11
+ exports.isPdfRef = isPdfRef;
12
+ exports.isPdfStream = isPdfStream;
13
+ exports.isPdfDict = isPdfDict;
14
+ exports.isPdfArray = isPdfArray;
15
+ exports.dictGetName = dictGetName;
16
+ exports.dictGetNumber = dictGetNumber;
17
+ exports.dictGetBool = dictGetBool;
18
+ exports.dictGetDict = dictGetDict;
19
+ exports.dictGetArray = dictGetArray;
20
+ exports.dictGetRef = dictGetRef;
21
+ exports.dictGetBytes = dictGetBytes;
22
+ exports.dictGetString = dictGetString;
23
+ exports.decodePdfStringBytes = decodePdfStringBytes;
24
+ exports.parseObject = parseObject;
25
+ exports.parseObjectFromToken = parseObjectFromToken;
26
+ const errors_1 = require("../errors");
27
+ // =============================================================================
28
+ // Type Guards
29
+ // =============================================================================
30
/**
 * Type guard for indirect references.
 *
 * @param obj - Any parsed value.
 * @returns `true` when `obj` is a non-null object carrying `type === "ref"`.
 */
function isPdfRef(obj) {
    if (obj === null || typeof obj !== "object") {
        return false;
    }
    return "type" in obj && obj.type === "ref";
}
33
/**
 * Type guard for stream objects.
 *
 * @param obj - Any parsed value.
 * @returns `true` when `obj` is a non-null object carrying `type === "stream"`.
 */
function isPdfStream(obj) {
    if (obj === null || typeof obj !== "object") {
        return false;
    }
    return "type" in obj && obj.type === "stream";
}
36
/**
 * Type guard for dictionaries, which this parser represents as `Map` instances.
 *
 * @param obj - Any parsed value.
 * @returns `true` when `obj` is a `Map`.
 */
function isPdfDict(obj) {
    const isMap = obj instanceof Map;
    return isMap;
}
39
/**
 * Type guard for arrays, which this parser represents as plain JavaScript arrays.
 *
 * @param obj - Any parsed value.
 * @returns `true` when `obj` is an array.
 */
function isPdfArray(obj) {
    const isList = Array.isArray(obj);
    return isList;
}
42
+ // =============================================================================
43
+ // Dictionary Helpers
44
+ // =============================================================================
45
/**
 * Look up `key` and return the entry only when it is a JavaScript string
 * (the form in which the parser stores name tokens).
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The string value, or `undefined` when absent or of another type.
 */
function dictGetName(dict, key) {
    const entry = dict.get(key);
    if (typeof entry !== "string") {
        return undefined;
    }
    return entry;
}
50
/**
 * Look up `key` and return the entry only when it is a number.
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The numeric value, or `undefined` when absent or of another type.
 */
function dictGetNumber(dict, key) {
    const entry = dict.get(key);
    if (typeof entry !== "number") {
        return undefined;
    }
    return entry;
}
55
/**
 * Look up `key` and return the entry only when it is a boolean.
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The boolean value, or `undefined` when absent or of another type.
 */
function dictGetBool(dict, key) {
    const entry = dict.get(key);
    if (typeof entry !== "boolean") {
        return undefined;
    }
    return entry;
}
60
/**
 * Look up `key` and return the entry only when `isPdfDict` accepts it.
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The nested dictionary, or `undefined` when absent or of another type.
 */
function dictGetDict(dict, key) {
    const entry = dict.get(key);
    if (!isPdfDict(entry)) {
        return undefined;
    }
    return entry;
}
65
/**
 * Look up `key` and return the entry only when `isPdfArray` accepts it.
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The array value, or `undefined` when absent or of another type.
 */
function dictGetArray(dict, key) {
    const entry = dict.get(key);
    if (!isPdfArray(entry)) {
        return undefined;
    }
    return entry;
}
70
/**
 * Look up `key` and return the entry only when `isPdfRef` accepts it.
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The indirect reference, or `undefined` when absent or of another type.
 */
function dictGetRef(dict, key) {
    const entry = dict.get(key);
    if (!isPdfRef(entry)) {
        return undefined;
    }
    return entry;
}
75
/**
 * Look up `key` and return the entry only when it is a `Uint8Array`
 * (the form in which the parser stores literal and hex strings).
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The raw bytes, or `undefined` when absent or of another type.
 */
function dictGetBytes(dict, key) {
    const entry = dict.get(key);
    if (!(entry instanceof Uint8Array)) {
        return undefined;
    }
    return entry;
}
80
/**
 * Look up `key` and return it as text.
 *
 * A JavaScript string entry is returned unchanged; a `Uint8Array` entry is
 * decoded with `decodePdfStringBytes`. Any other entry type yields `undefined`.
 *
 * @param dict - Dictionary (`Map`) to read from.
 * @param key - Entry key.
 * @returns The textual value, or `undefined` when absent or of another type.
 */
function dictGetString(dict, key) {
    const entry = dict.get(key);
    const isText = typeof entry === "string";
    if (isText) {
        return entry;
    }
    return entry instanceof Uint8Array ? decodePdfStringBytes(entry) : undefined;
}
91
/**
 * Convert the raw bytes of a PDF string object into a JavaScript string.
 *
 * The encoding is picked from the leading bytes:
 * - `FE FF`    -> UTF-16BE (surrogate pairs are combined; a trailing odd byte is ignored)
 * - `EF BB BF` -> UTF-8, decoded with `TextDecoder`
 * - otherwise  -> handed to `decodePdfDocEncoding`
 *
 * @param bytes - Raw string bytes.
 * @returns The decoded text, with any byte-order mark removed.
 */
function decodePdfStringBytes(bytes) {
    const size = bytes.length;
    const hasUtf16Bom = size >= 2 && bytes[0] === 0xfe && bytes[1] === 0xff;
    if (hasUtf16Bom) {
        const pieces = [];
        let offset = 2;
        // Consume one 16-bit unit per step; a valid surrogate pair consumes two.
        while (offset + 1 < size) {
            const unit = (bytes[offset] << 8) | bytes[offset + 1];
            const isHighSurrogate = unit >= 0xd800 && unit <= 0xdbff;
            if (isHighSurrogate && offset + 3 < size) {
                const trail = (bytes[offset + 2] << 8) | bytes[offset + 3];
                if (trail >= 0xdc00 && trail <= 0xdfff) {
                    const codePoint = 0x10000 + ((unit - 0xd800) << 10) + (trail - 0xdc00);
                    pieces.push(String.fromCodePoint(codePoint));
                    offset += 4;
                    continue;
                }
            }
            // Ordinary BMP unit, or an unpaired surrogate kept as-is.
            pieces.push(String.fromCharCode(unit));
            offset += 2;
        }
        return pieces.join("");
    }
    const hasUtf8Bom = size >= 3 && bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf;
    if (hasUtf8Bom) {
        const payload = bytes.subarray(3);
        return new TextDecoder("utf-8").decode(payload);
    }
    // No BOM: treat as PDFDocEncoding.
    return decodePdfDocEncoding(bytes);
}
123
/**
 * Decode bytes as PDFDocEncoding.
 *
 * Each byte is looked up in `PDF_DOC_ENCODING`; bytes without an entry map to
 * the code unit of the same numeric value.
 *
 * @param bytes - Raw string bytes.
 * @returns The decoded text.
 */
function decodePdfDocEncoding(bytes) {
    const chars = Array.from(bytes, byte => {
        const override = PDF_DOC_ENCODING[byte];
        return override === undefined ? String.fromCharCode(byte) : String.fromCodePoint(override);
    });
    return chars.join("");
}
133
/**
 * Byte -> Unicode code point overrides for PDFDocEncoding.
 *
 * Only the codes where PDFDocEncoding differs from Latin-1 are listed:
 * the accent marks at 0x18-0x1F, the punctuation and letters at 0x80-0x9E,
 * and the Euro sign at 0xA0. Bytes without an entry are decoded by the
 * caller as the code unit of the same value.
 *
 * Codes 0x9F and 0xAD are undefined in PDFDocEncoding and are intentionally
 * absent (caron lives at 0x19, not 0xAD).
 *
 * @see PDF Reference 1.7, Table D.2
 */
const PDF_DOC_ENCODING = {
    0x18: 0x02d8, // ˘ breve
    0x19: 0x02c7, // ˇ caron
    0x1a: 0x02c6, // ˆ circumflex
    0x1b: 0x02d9, // ˙ dotaccent
    0x1c: 0x02dd, // ˝ hungarumlaut
    0x1d: 0x02db, // ˛ ogonek
    0x1e: 0x02da, // ˚ ring
    0x1f: 0x02dc, // ˜ tilde
    0x80: 0x2022, // • bullet
    0x81: 0x2020, // † dagger
    0x82: 0x2021, // ‡ daggerdbl
    0x83: 0x2026, // … ellipsis
    0x84: 0x2014, // — emdash
    0x85: 0x2013, // – endash
    0x86: 0x0192, // ƒ florin
    0x87: 0x2044, // ⁄ fraction
    0x88: 0x2039, // ‹ guilsinglleft
    0x89: 0x203a, // › guilsinglright
    0x8a: 0x2212, // − minus
    0x8b: 0x2030, // ‰ perthousand
    0x8c: 0x201e, // „ quotedblbase
    0x8d: 0x201c, // “ quotedblleft
    0x8e: 0x201d, // ” quotedblright
    0x8f: 0x2018, // ‘ quoteleft
    0x90: 0x2019, // ’ quoteright
    0x91: 0x201a, // ‚ quotesinglbase
    0x92: 0x2122, // ™ trademark
    0x93: 0xfb01, // ﬁ fi ligature
    0x94: 0xfb02, // ﬂ fl ligature
    0x95: 0x0141, // Ł Lslash
    0x96: 0x0152, // Œ OE
    0x97: 0x0160, // Š Scaron
    0x98: 0x0178, // Ÿ Ydieresis
    0x99: 0x017d, // Ž Zcaron
    0x9a: 0x0131, // ı dotlessi
    0x9b: 0x0142, // ł lslash
    0x9c: 0x0153, // œ oe
    0x9d: 0x0161, // š scaron
    0x9e: 0x017e, // ž zcaron
    0xa0: 0x20ac // € Euro
};
172
+ // =============================================================================
173
+ // PDF Object Parser
174
+ // =============================================================================
175
/**
 * Read the next token from `tokenizer` and parse the PDF object it starts.
 *
 * This is a thin entry point: all type dispatch happens in
 * `parseObjectFromToken`.
 *
 * @param tokenizer - Token source positioned at the start of an object.
 * @returns Whatever `parseObjectFromToken` returns for that token.
 */
function parseObject(tokenizer) {
    return parseObjectFromToken(tokenizer, tokenizer.next());
}
185
/**
 * Parse a PDF object whose first token the caller has already read.
 *
 * A number token needs lookahead because it may begin one of three forms:
 * - `N`              -> plain number (the tokenizer position is rewound)
 * - `N gen R`        -> indirect reference `{ type: "ref", objNum, gen }`
 * - `N gen obj ...`  -> indirect object; its body is returned, or a
 *                       `{ type: "stream", dict, data }` when a dictionary
 *                       body is followed by the `stream` keyword
 *
 * Other token types map directly: strings yield their raw bytes, names their
 * string value, booleans and null their JavaScript equivalents, `<<` and `[`
 * delegate to the dictionary and array parsers. Any remaining token yields
 * its `strValue`, or `null` when it has none.
 *
 * @param tokenizer - Token source, positioned just after `token`.
 * @param token - The already-consumed first token of the object.
 * @returns The parsed object.
 * @throws {PdfStructureError} If `token` is the end-of-input token.
 */
function parseObjectFromToken(tokenizer, token) {
    // Numeric literals below mirror the inlined TokenType enum values.
    const isKeyword = (tok, word) => tok.type === 6 /* TokenType.Keyword */ && tok.strValue === word;
    const kind = token.type;
    if (kind === 0 /* TokenType.Number */) {
        const objNum = token.numValue;
        const rewindPos = tokenizer.pos;
        const second = tokenizer.next();
        if (second.type !== 0 /* TokenType.Number */) {
            // A lone number: undo the lookahead.
            tokenizer.pos = rewindPos;
            return objNum;
        }
        const gen = second.numValue;
        const third = tokenizer.next();
        if (isKeyword(third, "R")) {
            return { type: "ref", objNum, gen };
        }
        if (!isKeyword(third, "obj")) {
            // Two numbers in a row that are not a ref or obj header: undo both lookaheads.
            tokenizer.pos = rewindPos;
            return objNum;
        }
        // Indirect object definition: N gen obj ... endobj
        const body = parseObject(tokenizer);
        if (isPdfDict(body)) {
            tokenizer.skipWhitespaceAndComments();
            const beforeStream = tokenizer.pos;
            if (isKeyword(tokenizer.next(), "stream")) {
                // -1 is passed when the dictionary has no direct numeric Length.
                const declaredLength = dictGetNumber(body, "Length") ?? -1;
                const data = tokenizer.readStreamContent(declaredLength);
                const trailer = tokenizer.next();
                if (!isKeyword(trailer, "endobj")) {
                    // Tolerate a missing endobj after endstream: step back to that token.
                    tokenizer.pos = trailer.offset;
                }
                return { type: "stream", dict: body, data };
            }
            // Plain dictionary, not a stream: undo the peek.
            tokenizer.pos = beforeStream;
        }
        tokenizer.skipWhitespaceAndComments();
        const beforeEnd = tokenizer.pos;
        if (!isKeyword(tokenizer.next(), "endobj")) {
            // endobj is optional here too: leave the unexpected token unread.
            tokenizer.pos = beforeEnd;
        }
        return body;
    }
    if (kind === 1 /* TokenType.LiteralString */ || kind === 2 /* TokenType.HexString */) {
        return token.rawBytes ?? new Uint8Array(0);
    }
    if (kind === 3 /* TokenType.Name */) {
        return token.strValue;
    }
    if (kind === 4 /* TokenType.Boolean */) {
        return token.boolValue;
    }
    if (kind === 5 /* TokenType.Null */) {
        return null;
    }
    if (kind === 7 /* TokenType.DictBegin */) {
        return parseDictionary(tokenizer);
    }
    if (kind === 9 /* TokenType.ArrayBegin */) {
        return parseArray(tokenizer);
    }
    if (kind === 11 /* TokenType.EOF */) {
        throw new errors_1.PdfStructureError("Unexpected end of input while parsing PDF object");
    }
    // Unexpected tokens (e.g. stray keywords) are surfaced to the caller as-is.
    return token.strValue ?? null;
}
262
/**
 * Parse a PDF dictionary (after the `<<` token has been consumed).
 *
 * Entries are read as key/value pairs until the closing `>>`. Malformed input
 * is tolerated in two ways:
 * - a token in key position that is not a name is skipped;
 * - a key immediately followed by `>>` (value missing) is stored with a
 *   `null` value and ends the dictionary, so the closing delimiter is not
 *   swallowed as if it were the value.
 *
 * @param tokenizer - Token source positioned just after `<<`.
 * @returns A `Map` from key name to parsed value.
 * @throws {PdfStructureError} If input ends before the dictionary is closed.
 */
function parseDictionary(tokenizer) {
    const dict = new Map();
    while (true) {
        const keyToken = tokenizer.next();
        if (keyToken.type === 8 /* TokenType.DictEnd */) {
            break;
        }
        if (keyToken.type === 11 /* TokenType.EOF */) {
            throw new errors_1.PdfStructureError("Unexpected EOF in dictionary");
        }
        if (keyToken.type !== 3 /* TokenType.Name */) {
            // Some malformed PDFs have non-name keys — skip and try again
            continue;
        }
        const key = keyToken.strValue;
        const valueToken = tokenizer.next();
        if (valueToken.type === 8 /* TokenType.DictEnd */) {
            // Key without a value: record it as null and close the dictionary here
            // instead of reading past its end.
            dict.set(key, null);
            break;
        }
        dict.set(key, parseObjectFromToken(tokenizer, valueToken));
    }
    return dict;
}
285
/**
 * Parse a PDF array (after the `[` token has been consumed).
 *
 * Tokens are pulled until the closing `]`; each one is handed to
 * `parseObjectFromToken`, which may consume further tokens for the element.
 *
 * @param tokenizer - Token source positioned just after `[`.
 * @returns The parsed elements in source order.
 * @throws {PdfStructureError} If input ends before the array is closed.
 */
function parseArray(tokenizer) {
    const items = [];
    for (let tok = tokenizer.next(); tok.type !== 10 /* TokenType.ArrayEnd */; tok = tokenizer.next()) {
        if (tok.type === 11 /* TokenType.EOF */) {
            throw new errors_1.PdfStructureError("Unexpected EOF in array");
        }
        items.push(parseObjectFromToken(tokenizer, tok));
    }
    return items;
}
@@ -0,0 +1,203 @@
1
+ "use strict";
2
+ /**
3
+ * PDF reader — public API.
4
+ *
5
+ * Provides a high-level, zero-dependency interface for reading PDF files.
6
+ * Supports:
7
+ * - Text extraction with multilingual support (WinAnsi, MacRoman, CJK via
8
+ * ToUnicode CMap, Identity-H/V, Symbol, ZapfDingbats)
9
+ * - Image extraction (JPEG, JPEG2000, raw/Flate, CCITT, JBIG2)
10
+ * - Annotation extraction (links, comments, highlights, stamps, etc.)
11
+ * - Form field extraction (AcroForm: text inputs, checkboxes, radio buttons, dropdowns)
12
+ * - Metadata reading (Info dictionary + XMP)
13
+ * - Encrypted PDFs:
14
+ * - RC4 (40-bit and 128-bit) — tested via roundtrip
15
+ * - AES-128 (V=4, R=4) — implemented, requires external test fixtures
16
+ * - AES-256 (V=5, R=5) — implemented, requires external test fixtures
17
+ * - Cross-reference tables and streams (PDF 1.5+)
18
+ * - Incremental updates and xref recovery
19
+ *
20
+ * @example Basic text extraction:
21
+ * ```typescript
22
+ * import { readPdf } from "excelts/pdf";
23
+ *
24
+ * const pdf = readPdf(pdfBytes);
25
+ * console.log(pdf.text); // All text from all pages
26
+ * console.log(pdf.pages[0].text); // Text from page 1
27
+ * ```
28
+ *
29
+ * @example Image extraction:
30
+ * ```typescript
31
+ * const pdf = readPdf(pdfBytes);
32
+ * for (const image of pdf.pages[0].images) {
33
+ * console.log(image.format, image.width, image.height);
34
+ * fs.writeFileSync(`image.${image.format}`, image.data);
35
+ * }
36
+ * ```
37
+ *
38
+ * @example Metadata:
39
+ * ```typescript
40
+ * const pdf = readPdf(pdfBytes);
41
+ * console.log(pdf.metadata.title);
42
+ * console.log(pdf.metadata.author);
43
+ * console.log(pdf.metadata.pageCount);
44
+ * ```
45
+ *
46
+ * @example Encrypted PDF:
47
+ * ```typescript
48
+ * const pdf = readPdf(pdfBytes, { password: "secret" });
49
+ * ```
50
+ */
51
+ Object.defineProperty(exports, "__esModule", { value: true });
52
+ exports.readPdf = readPdf;
53
+ const pdf_document_1 = require("./pdf-document");
54
+ const pdf_decrypt_1 = require("./pdf-decrypt");
55
+ const content_interpreter_1 = require("./content-interpreter");
56
+ const text_reconstruction_1 = require("./text-reconstruction");
57
+ const image_extractor_1 = require("./image-extractor");
58
+ const annotation_extractor_1 = require("./annotation-extractor");
59
+ const form_extractor_1 = require("./form-extractor");
60
+ const metadata_reader_1 = require("./metadata-reader");
61
+ const errors_1 = require("../errors");
62
+ // =============================================================================
63
+ // Public API
64
+ // =============================================================================
65
/**
 * Read a PDF file and extract text, images, annotations, form fields, and metadata.
 *
 * Per-page extraction is best-effort: a failure in text, image, or annotation
 * extraction is recorded in that page's `warnings` array and does not abort
 * the read. Form-field extraction failures are ignored and yield an empty list.
 *
 * @param data - Raw PDF file bytes
 * @param options - Extraction options. `password` defaults to `""`; every
 *   `extract*` flag defaults to `true`. `pages` is a list of 1-based page
 *   numbers; entries that are not integers or fall outside the document are
 *   ignored. When `pages` is omitted, all pages are processed.
 * @returns `{ text, pages, metadata, formFields }`, where `text` is the text of
 *   the processed pages joined by blank lines
 * @throws {PdfStructureError} If the PDF structure is invalid, or if the
 *   document is encrypted and decryption with the given password fails
 */
function readPdf(data, options) {
    const opts = {
        password: options?.password ?? "",
        pages: options?.pages,
        extractText: options?.extractText ?? true,
        extractImages: options?.extractImages ?? true,
        extractMetadata: options?.extractMetadata ?? true,
        extractAnnotations: options?.extractAnnotations ?? true,
        extractFormFields: options?.extractFormFields ?? true
    };
    // Normalizes anything thrown into a message for the per-page warnings.
    const describeError = (err) => (err instanceof Error ? err.message : String(err));
    // Parse document structure
    const doc = new pdf_document_1.PdfDocument(data);
    // Handle encryption
    if ((0, pdf_decrypt_1.isEncrypted)(doc)) {
        const success = (0, pdf_decrypt_1.initDecryption)(doc, opts.password);
        if (!success) {
            throw new errors_1.PdfStructureError("Failed to decrypt PDF: incorrect password");
        }
    }
    // Extract metadata
    const metadata = opts.extractMetadata ? (0, metadata_reader_1.extractMetadata)(doc) : createEmptyMetadata();
    // Get pages (with object identity for correct decryption)
    const pagesInfo = doc.getPagesWithObjInfo();
    // Convert requested 1-based page numbers to indices. Non-integers are dropped
    // as well as out-of-range values: a fractional index would otherwise look up
    // a missing entry in pagesInfo and crash on destructuring.
    const pageIndicesToProcess = opts.pages
        ? opts.pages
            .map(p => p - 1)
            .filter(p => Number.isInteger(p) && p >= 0 && p < pagesInfo.length)
        : Array.from({ length: pagesInfo.length }, (_, i) => i);
    // Process each page
    const pages = [];
    for (const pageIdx of pageIndicesToProcess) {
        const { dict: pageDict } = pagesInfo[pageIdx];
        const pageNumber = pageIdx + 1;
        const warnings = [];
        // Extract text
        let text = "";
        let textLines = [];
        let textFragments = [];
        if (opts.extractText) {
            try {
                textFragments = (0, content_interpreter_1.extractTextFromPage)(pageDict, doc);
                text = (0, text_reconstruction_1.reconstructText)(textFragments);
                textLines = (0, text_reconstruction_1.reconstructTextLines)(textFragments);
            }
            catch (err) {
                warnings.push(`Text extraction failed on page ${pageNumber}: ${describeError(err)}`);
            }
        }
        // Extract images
        let images = [];
        if (opts.extractImages) {
            try {
                images = (0, image_extractor_1.extractImagesFromPage)(pageDict, doc);
            }
            catch (err) {
                warnings.push(`Image extraction failed on page ${pageNumber}: ${describeError(err)}`);
            }
        }
        // Extract annotations
        let annotations = [];
        if (opts.extractAnnotations) {
            try {
                annotations = (0, annotation_extractor_1.extractAnnotationsFromPage)(pageDict, doc);
            }
            catch (err) {
                warnings.push(`Annotation extraction failed on page ${pageNumber}: ${describeError(err)}`);
            }
        }
        // Get page dimensions
        const { width, height } = getPageDimensions(pageDict, doc);
        pages.push({
            pageNumber,
            text,
            textLines,
            textFragments,
            images,
            annotations,
            width,
            height,
            warnings
        });
    }
    // Concatenate all page text
    const allText = pages.map(p => p.text).join("\n\n");
    // Report the document's total page count, not just the number of pages processed
    if (opts.extractMetadata) {
        metadata.pageCount = pagesInfo.length;
    }
    // Extract form fields (document-level, not per-page)
    let formFields = [];
    if (opts.extractFormFields) {
        try {
            formFields = (0, form_extractor_1.extractFormFields)(doc);
        }
        catch {
            // Non-fatal — just return empty
        }
    }
    return {
        text: allText,
        pages,
        metadata,
        formFields
    };
}
180
+ // =============================================================================
181
+ // Helpers
182
+ // =============================================================================
183
/**
 * Resolve a page's size via `doc.resolvePageBox`.
 *
 * @param pageDict - The page dictionary.
 * @param doc - Document object providing `resolvePageBox`.
 * @returns The resolved box, or 612 x 792 (US Letter) when the document
 *   returns `null` or `undefined`.
 */
function getPageDimensions(pageDict, doc) {
    const box = doc.resolvePageBox(pageDict);
    if (box === null || box === undefined) {
        return { width: 612, height: 792 }; // Default: US Letter
    }
    return box;
}
186
/**
 * Build a blank metadata record, used when metadata extraction is disabled.
 *
 * @returns A fresh object on every call: text fields are empty strings,
 *   optional fields are `null`, `pageCount` is 0, `encrypted` is `false`,
 *   and `custom` is a new empty object.
 */
function createEmptyMetadata() {
    const blank = "";
    const emptyMetadata = {
        // Info-dictionary style text fields
        title: blank,
        author: blank,
        subject: blank,
        keywords: blank,
        creator: blank,
        producer: blank,
        // Dates
        creationDate: null,
        modDate: null,
        // Document-level facts
        pdfVersion: blank,
        pageCount: 0,
        encrypted: false,
        pageSize: null,
        xmpXml: null,
        custom: {}
    };
    return emptyMetadata;
}