@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106):
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,358 @@
1
+ "use strict";
2
+ /**
3
+ * PDF form field (AcroForm) extractor.
4
+ *
5
+ * Extracts interactive form fields from a PDF's `/AcroForm` dictionary.
6
+ * Supports all standard field types:
7
+ * - **Text** (`/Tx`) — Text input fields
8
+ * - **Button** (`/Btn`) — Checkboxes, radio buttons, push buttons
9
+ * - **Choice** (`/Ch`) — Dropdowns (combo boxes) and list boxes
10
+ * - **Signature** (`/Sig`) — Digital signature fields
11
+ *
12
+ * Handles field hierarchies (parent/child), inherited values, and default appearances.
13
+ *
14
+ * @see PDF Reference 1.7, §12.7 - Interactive Forms
15
+ */
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.extractFormFields = extractFormFields;
18
+ const pdf_parser_1 = require("./pdf-parser");
19
+ const reader_utils_1 = require("./reader-utils");
20
+ // =============================================================================
21
+ // Public API
22
+ // =============================================================================
23
/**
 * Collect every AcroForm field reachable from the document catalog.
 *
 * Looks up `/AcroForm` on the catalog, then its `/Fields` array, and hands
 * each top-level entry to `collectFields` with an empty parent name and a
 * shared visited set.
 *
 * This is best-effort: a missing or mistyped entry, or any exception thrown
 * while walking the tree, yields an empty array rather than an error.
 *
 * @param doc - The PDF document
 * @returns Array of extracted form fields (empty when none can be read)
 */
function extractFormFields(doc) {
    try {
        const acroFormEntry = doc.getCatalog().get("AcroForm");
        const acroForm = acroFormEntry ? doc.derefDict(acroFormEntry) : null;
        const fieldsEntry = acroForm ? acroForm.get("Fields") : null;
        if (!fieldsEntry) {
            return [];
        }
        const rootFields = doc.deref(fieldsEntry);
        if (!(0, pdf_parser_1.isPdfArray)(rootFields)) {
            return [];
        }
        const collected = [];
        const seenObjects = new Set();
        for (const rootField of rootFields) {
            collectFields(rootField, doc, "", collected, seenObjects);
        }
        return collected;
    }
    catch {
        // Deliberate: malformed form data must not break the caller.
        return [];
    }
}
62
+ // =============================================================================
63
+ // Field Tree Traversal
64
+ // =============================================================================
65
/**
 * Bit masks for the `/Ff` field-flags integer (PDF spec §12.7.3).
 * The spec numbers flag bits from 1, so spec bit N corresponds to `1 << (N - 1)`.
 */
const FLAG_READ_ONLY = 0x1; // bit 1
const FLAG_REQUIRED = 0x2; // bit 2
// Button-specific
const FLAG_PUSHBUTTON = 0x10000; // bit 17
const FLAG_RADIO = 0x8000; // bit 16
// Choice-specific
const FLAG_COMBO = 0x20000; // bit 18
/** Cap on `/Parent` hops when resolving inherited entries, so malformed cyclic PDFs terminate. */
const MAX_INHERIT_DEPTH = 20;
75
/**
 * Walk one node of the field tree and append the fields found to `result`.
 *
 * - Indirect references are remembered by object number in `visited`, so a
 *   node reached twice (for example through a reference cycle) is processed once.
 * - The fully qualified name is `parentName` and the node's own `/T` joined
 *   by a dot; whichever part is empty is left out.
 * - If any kid carries `/T`, all kids are treated as child fields and are
 *   recursed into. Otherwise the kids are taken to be widgets of this field.
 * - A radio group (radio flag set, pushbutton flag clear) with widget kids is
 *   parsed by `parseRadioField`. Every other field goes through `parseField`,
 *   provided a field type can be resolved.
 *
 * @param fieldObj - Field dictionary or a reference to one
 * @param doc - The PDF document
 * @param parentName - Fully qualified name of the parent ("" at the root)
 * @param result - Output array, mutated in place
 * @param visited - Object numbers already processed, mutated in place
 */
function collectFields(fieldObj, doc, parentName, result, visited) {
    if ((0, pdf_parser_1.isPdfRef)(fieldObj)) {
        const objectNumber = fieldObj.objNum;
        if (visited.has(objectNumber)) {
            return;
        }
        visited.add(objectNumber);
    }
    const node = doc.derefDict(fieldObj);
    if (!node) {
        return;
    }
    const ownName = (0, reader_utils_1.getDictStringValue)(node, "T", doc);
    let qualifiedName = ownName;
    if (parentName) {
        qualifiedName = ownName ? `${parentName}.${ownName}` : parentName;
    }
    const kidsEntry = node.get("Kids");
    const kidList = kidsEntry ? doc.deref(kidsEntry) : undefined;
    if (kidsEntry && (0, pdf_parser_1.isPdfArray)(kidList)) {
        // A kid with its own /T is a field node; kids without /T are widgets.
        let kidsAreFields = false;
        for (const kid of kidList) {
            const kidNode = doc.derefDict(kid);
            if (kidNode && kidNode.has("T")) {
                kidsAreFields = true;
                break;
            }
        }
        if (kidsAreFields) {
            for (const kid of kidList) {
                collectFields(kid, doc, qualifiedName, result, visited);
            }
            return;
        }
        // Widget kids: radio groups keep their value on this node and their
        // export values on the kids, so they get a dedicated parser.
        if (resolveFieldType(node, doc) === "Btn") {
            const flags = resolveFieldFlags(node, doc);
            const isRadioGroup = (flags & FLAG_RADIO) !== 0 && (flags & FLAG_PUSHBUTTON) === 0;
            if (isRadioGroup) {
                const radio = parseRadioField(node, kidList, qualifiedName, flags, doc);
                if (radio) {
                    result.push(radio);
                }
                return;
            }
        }
    }
    // Terminal field: without a resolvable /FT this is not a real field.
    const fieldType = resolveFieldType(node, doc);
    if (!fieldType) {
        return;
    }
    const parsed = parseField(node, qualifiedName, fieldType, doc);
    if (parsed) {
        result.push(parsed);
    }
}
143
+ // =============================================================================
144
+ // Field Parsing
145
+ // =============================================================================
146
/**
 * Find the field type (`/FT`) for a field, looking up the `/Parent` chain
 * when the dictionary itself does not carry one.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document (used to dereference `/Parent`)
 * @param depth - Number of parent hops already taken; the walk stops once
 *   this reaches MAX_INHERIT_DEPTH
 * @returns The field type name, or `undefined` when none is found
 */
function resolveFieldType(dict, doc, depth = 0) {
    let node = dict;
    for (let level = depth;; level++) {
        const typeName = (0, pdf_parser_1.dictGetName)(node, "FT");
        if (typeName) {
            return typeName;
        }
        if (level >= MAX_INHERIT_DEPTH) {
            return undefined;
        }
        const parentEntry = node.get("Parent");
        const parentNode = parentEntry ? doc.derefDict(parentEntry) : null;
        if (!parentNode) {
            return undefined;
        }
        node = parentNode;
    }
}
167
/**
 * Find the field flags (`/Ff`) for a field, looking up the `/Parent` chain
 * when the dictionary itself does not carry them.
 *
 * An explicit `/Ff` on a node, including 0, ends the search.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document (used to dereference `/Parent`)
 * @param depth - Number of parent hops already taken; the walk stops once
 *   this reaches MAX_INHERIT_DEPTH
 * @returns The flags integer, or 0 when no `/Ff` is found
 */
function resolveFieldFlags(dict, doc, depth = 0) {
    let node = dict;
    for (let level = depth;; level++) {
        const flags = (0, pdf_parser_1.dictGetNumber)(node, "Ff");
        if (flags !== undefined) {
            return flags;
        }
        if (level >= MAX_INHERIT_DEPTH) {
            return 0;
        }
        const parentEntry = node.get("Parent");
        const parentNode = parentEntry ? doc.derefDict(parentEntry) : null;
        if (!parentNode) {
            return 0;
        }
        node = parentNode;
    }
}
187
/**
 * Build the field record for a terminal (non-radio-group) field.
 *
 * Flags and value are resolved with inheritance. `/DV` is read from the
 * given dictionary only. Options are filled in for choice fields (`Ch`).
 * The export value is filled in for checkboxes, meaning `Btn` fields with
 * neither the pushbutton nor the radio flag set.
 *
 * @param dict - The field dictionary
 * @param name - Fully qualified field name ("" becomes "(unnamed)")
 * @param ft - Resolved field type name
 * @param doc - The PDF document
 * @returns The field record
 */
function parseField(dict, name, ft, doc) {
    const flags = resolveFieldFlags(dict, doc);
    const currentValue = getFieldValue(dict, doc);
    const initialValue = (0, reader_utils_1.getDictStringValue)(dict, "DV", doc);
    const kind = classifyFieldType(ft, flags);
    const choices = ft === "Ch" ? parseChoiceOptions(dict, doc) : [];
    // Checkbox: a button that is neither a pushbutton nor a radio button.
    const isCheckbox = ft === "Btn" && (flags & (FLAG_PUSHBUTTON | FLAG_RADIO)) === 0;
    const onState = isCheckbox ? parseCheckboxExportValue(dict, doc) : "";
    return {
        name: name || "(unnamed)",
        type: kind,
        value: currentValue,
        defaultValue: initialValue,
        readOnly: (flags & FLAG_READ_ONLY) !== 0,
        required: (flags & FLAG_REQUIRED) !== 0,
        options: choices,
        exportValue: onState,
        flags
    };
}
213
/**
 * Build the field record for a radio-button group whose kids are widgets.
 *
 * The value and `/DV` are read via the group dictionary. The options are
 * the distinct keys of each kid's `/AP /N` dictionary other than "Off",
 * in first-seen order. Kids missing any of those dictionaries are skipped.
 *
 * @param parentDict - The radio group's field dictionary
 * @param kids - The group's `/Kids` array (widgets)
 * @param name - Fully qualified field name ("" becomes "(unnamed)")
 * @param ff - Resolved field flags
 * @param doc - The PDF document
 * @returns The field record, with `type` fixed to "radio"
 */
function parseRadioField(parentDict, kids, name, ff, doc) {
    const currentValue = getFieldValue(parentDict, doc);
    const initialValue = (0, reader_utils_1.getDictStringValue)(parentDict, "DV", doc);
    // A Set keeps insertion order and drops duplicates across widgets.
    const states = new Set();
    for (const kid of kids) {
        const widget = doc.derefDict(kid);
        const appearance = widget ? doc.derefDict(widget.get("AP")) : null;
        const normalStates = appearance ? doc.derefDict(appearance.get("N")) : null;
        if (!normalStates) {
            continue;
        }
        for (const state of normalStates.keys()) {
            if (state !== "Off") {
                states.add(state);
            }
        }
    }
    return {
        name: name || "(unnamed)",
        type: "radio",
        value: currentValue,
        defaultValue: initialValue,
        readOnly: (ff & FLAG_READ_ONLY) !== 0,
        required: (ff & FLAG_REQUIRED) !== 0,
        options: [...states],
        exportValue: "",
        flags: ff
    };
}
250
/**
 * Map a field type name plus flags to the field kind label.
 *
 * - `Tx` → "text", `Sig` → "signature"
 * - `Btn` → "button" if the pushbutton flag is set (checked first),
 *   "radio" if the radio flag is set, otherwise "checkbox"
 * - `Ch` → "dropdown" if the combo flag is set, otherwise "listbox"
 * - anything else → "unknown"
 *
 * @param ft - Field type name
 * @param ff - Field flags
 * @returns The field kind label
 */
function classifyFieldType(ft, ff) {
    if (ft === "Tx") {
        return "text";
    }
    if (ft === "Sig") {
        return "signature";
    }
    if (ft === "Btn") {
        if ((ff & FLAG_PUSHBUTTON) !== 0) {
            return "button";
        }
        return (ff & FLAG_RADIO) !== 0 ? "radio" : "checkbox";
    }
    if (ft === "Ch") {
        return (ff & FLAG_COMBO) !== 0 ? "dropdown" : "listbox";
    }
    return "unknown";
}
273
/**
 * Read the display labels of a choice field from its `/Opt` array.
 *
 * Each entry is either a single text value or an
 * `[exportValue, displayValue]` pair, in which case the display value is
 * used. Entries of any other shape are skipped.
 *
 * @param dict - The choice field dictionary
 * @param doc - The PDF document
 * @returns The option labels, in array order (empty when `/Opt` is absent
 *   or not an array)
 */
function parseChoiceOptions(dict, doc) {
    const optEntry = dict.get("Opt");
    if (!optEntry) {
        return [];
    }
    const entries = doc.deref(optEntry);
    if (!(0, pdf_parser_1.isPdfArray)(entries)) {
        return [];
    }
    const labels = [];
    // Appends `value` when it is textual; reports whether it did.
    const pushIfText = (value) => {
        if (typeof value === "string") {
            labels.push(value);
            return true;
        }
        if (value instanceof Uint8Array) {
            labels.push((0, pdf_parser_1.decodePdfStringBytes)(value));
            return true;
        }
        return false;
    };
    for (const entry of entries) {
        const item = doc.deref(entry);
        if (pushIfText(item)) {
            continue;
        }
        if ((0, pdf_parser_1.isPdfArray)(item) && item.length >= 2) {
            // [exportValue, displayValue] pair
            pushIfText(doc.deref(item[1]));
        }
    }
    return labels;
}
304
/**
 * Return the first key of `dict`'s `/AP /N` dictionary that is not "Off",
 * or `undefined` when either dictionary is missing or only "Off" is present.
 */
function findCheckboxOnState(dict, doc) {
    const apDict = doc.derefDict(dict.get("AP"));
    if (!apDict) {
        return undefined;
    }
    const nDict = doc.derefDict(apDict.get("N"));
    if (!nDict) {
        return undefined;
    }
    for (const key of nDict.keys()) {
        if (key !== "Off") {
            return key;
        }
    }
    return undefined;
}
/**
 * Determine a checkbox's export value (the name of its "on" state).
 *
 * The export value is the key in `/AP /N` that isn't "Off". It is looked up
 * on the field dictionary first. When the field keeps its widget
 * annotations as separate `/Kids`, so that the field dictionary has no
 * appearance of its own, the kids are searched in order and the first "on"
 * state found is used.
 *
 * @param dict - The checkbox field dictionary
 * @param doc - The PDF document
 * @returns The export value, or "Yes" (the conventional default) when no
 *   appearance state can be found
 */
function parseCheckboxExportValue(dict, doc) {
    const ownState = findCheckboxOnState(dict, doc);
    if (ownState !== undefined) {
        return ownState;
    }
    // The field and its widget(s) are separate objects: look at the kids' /AP.
    const kidsEntry = dict.get("Kids");
    const kids = kidsEntry ? doc.deref(kidsEntry) : undefined;
    if (kidsEntry && (0, pdf_parser_1.isPdfArray)(kids)) {
        for (const kid of kids) {
            const kidDict = doc.derefDict(kid);
            const kidState = kidDict ? findCheckboxOnState(kidDict, doc) : undefined;
            if (kidState !== undefined) {
                return kidState;
            }
        }
    }
    return "Yes";
}
321
+ // =============================================================================
322
+ // Value Extraction
323
+ // =============================================================================
324
/**
 * Get the field value (`/V` entry), looking up the `/Parent` chain when the
 * dictionary itself has none.
 *
 * Any `/V` that is not `undefined` ends the search and is converted by
 * `resolveValue`.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document
 * @param depth - Number of parent hops already taken; the walk stops once
 *   this reaches MAX_INHERIT_DEPTH
 * @returns The value as a string, or "" when no `/V` is found
 */
function getFieldValue(dict, doc, depth = 0) {
    let node = dict;
    let level = depth;
    while (true) {
        const raw = node.get("V");
        if (raw !== undefined) {
            return resolveValue(raw, doc);
        }
        if (level >= MAX_INHERIT_DEPTH) {
            return "";
        }
        const parentEntry = node.get("Parent");
        const parentNode = parentEntry ? doc.derefDict(parentEntry) : null;
        if (!parentNode) {
            return "";
        }
        node = parentNode;
        level += 1;
    }
}
343
/**
 * Convert a dereferenced scalar PDF value to a string.
 *
 * @returns The string form, or `undefined` when `value` is not a string,
 *   byte string, number or boolean
 */
function pdfScalarToString(value) {
    if (typeof value === "string") {
        return value;
    }
    if (value instanceof Uint8Array) {
        return (0, pdf_parser_1.decodePdfStringBytes)(value);
    }
    if (typeof value === "number") {
        return String(value);
    }
    if (typeof value === "boolean") {
        return value ? "true" : "false";
    }
    return undefined;
}
/**
 * Convert a `/V` entry to its string form.
 *
 * Strings are returned as-is, byte strings are decoded, and numbers and
 * booleans are stringified. An array value, as used by a multi-select list
 * box to hold several selected entries, is rendered as its scalar elements
 * joined with ", ". Previously an array collapsed to "" and the selection
 * was lost. Anything else (dictionaries, streams, null) yields "".
 *
 * @param val - The raw `/V` entry (may be an indirect reference)
 * @param doc - The PDF document
 * @returns The value as a string
 */
function resolveValue(val, doc) {
    const resolved = doc.deref(val);
    const scalar = pdfScalarToString(resolved);
    if (scalar !== undefined) {
        return scalar;
    }
    if ((0, pdf_parser_1.isPdfArray)(resolved)) {
        const parts = [];
        for (const element of resolved) {
            // One level only: nested arrays and dictionaries are skipped.
            const part = pdfScalarToString(doc.deref(element));
            if (part !== undefined) {
                parts.push(part);
            }
        }
        return parts.join(", ");
    }
    return "";
}
@@ -0,0 +1,223 @@
1
+ "use strict";
2
+ /**
3
+ * PDF image extraction.
4
+ *
5
+ * Extracts images from PDF pages including:
6
+ * - Inline images (BI/ID/EI operators)
7
+ * - XObject images (/Subtype /Image)
8
+ * - Images with various color spaces and filters
9
+ *
10
+ * Supported image formats:
11
+ * - JPEG (DCTDecode) — extracted as-is
12
+ * - JPEG2000 (JPXDecode) — extracted as-is
13
+ * - Raw/Flate-compressed pixel data — extracted with metadata
14
+ * - CCITT fax — extracted as-is
15
+ *
16
+ * @see PDF Reference 1.7, §4.8 - Images
17
+ */
18
+ Object.defineProperty(exports, "__esModule", { value: true });
19
+ exports.extractImagesFromPage = extractImagesFromPage;
20
+ const pdf_parser_1 = require("./pdf-parser");
21
+ // =============================================================================
22
+ // Public API
23
+ // =============================================================================
24
/**
 * Extract all image XObjects from a PDF page.
 *
 * Iterates the `/XObject` dictionary of the page's resolved resources,
 * keeps the streams whose `/Subtype` is `Image`, and converts each with
 * `extractImage`. The `index` given to an image is its position among the
 * images successfully extracted so far.
 *
 * @param pageDict - The page dictionary
 * @param doc - The PDF document
 * @returns The extracted images (empty when the page has no XObjects)
 */
function extractImagesFromPage(pageDict, doc) {
    const found = [];
    // Resource resolution (including inheritance) is delegated to the document.
    const xobjectEntry = doc.resolvePageResources(pageDict).get("XObject");
    const xobjects = xobjectEntry ? doc.derefDict(xobjectEntry) : null;
    if (!xobjects) {
        return found;
    }
    for (const [name, ref] of xobjects) {
        const resolved = doc.derefStreamWithObjNum(ref);
        if (!resolved) {
            continue;
        }
        const { stream, objNum, gen } = resolved;
        const streamDict = stream.dict;
        if ((0, pdf_parser_1.dictGetName)(streamDict, "Subtype") !== "Image") {
            continue;
        }
        const image = extractImage(name, stream, streamDict, doc, found.length, objNum, gen);
        if (image) {
            found.push(image);
        }
    }
    return found;
}
58
+ // =============================================================================
59
+ // Image Extraction
60
+ // =============================================================================
61
/** Maps the image's filter name to the `format` tag reported for it. */
const IMAGE_FORMAT_BY_FILTER = new Map([
    ["DCTDecode", "jpeg"],
    ["DCT", "jpeg"],
    ["JPXDecode", "jpx"],
    ["CCITTFaxDecode", "ccitt"],
    ["CCF", "ccitt"],
    ["JBIG2Decode", "jbig2"]
]);
/**
 * Build the image record for one image stream.
 *
 * Both the full dictionary keys and the abbreviated ones (`W`, `H`, `BPC`,
 * `IM`) are accepted. Image data is always obtained through
 * `doc.getStreamData`; only the reported `format` depends on the filter.
 *
 * Stencil masks (`/ImageMask true`) carry one 1-bit sample per pixel and
 * have no `/ColorSpace`. They are reported with `bitsPerComponent` 1
 * (unless the dictionary states otherwise), one component, and
 * "DeviceGray" as the nearest colour-space label, so that consumers can
 * size the sample data correctly. They were previously reported as 8-bit
 * DeviceRGB.
 *
 * @param name - Resource name of the image
 * @param stream - The image stream
 * @param dict - The stream dictionary
 * @param doc - The PDF document
 * @param index - Index to record on the image
 * @param objNum - Object number passed to `getStreamData` (default 0)
 * @param gen - Generation number passed to `getStreamData` (default 0)
 * @returns The image record, or `null` when width or height is 0 or missing
 */
function extractImage(name, stream, dict, doc, index, objNum = 0, gen = 0) {
    const width = (0, pdf_parser_1.dictGetNumber)(dict, "Width") ?? (0, pdf_parser_1.dictGetNumber)(dict, "W") ?? 0;
    const height = (0, pdf_parser_1.dictGetNumber)(dict, "Height") ?? (0, pdf_parser_1.dictGetNumber)(dict, "H") ?? 0;
    if (width === 0 || height === 0) {
        return null;
    }
    const maskEntry = dict.get("ImageMask") ?? dict.get("IM");
    const maskFlag = (0, pdf_parser_1.isPdfRef)(maskEntry) ? doc.deref(maskEntry) : maskEntry;
    const isStencilMask = maskFlag === true;
    const bpc = (0, pdf_parser_1.dictGetNumber)(dict, "BitsPerComponent") ??
        (0, pdf_parser_1.dictGetNumber)(dict, "BPC") ??
        (isStencilMask ? 1 : 8);
    // A stencil mask has no colour space of its own.
    const { colorSpace, components } = isStencilMask
        ? { colorSpace: "DeviceGray", components: 1 }
        : resolveColorSpace(dict, doc);
    const filter = getFilterName(dict);
    // Every format goes through getStreamData; the filter only selects the tag.
    const format = IMAGE_FORMAT_BY_FILTER.get(filter) ?? "raw";
    const data = doc.getStreamData(stream, objNum, gen);
    // Soft mask (alpha channel), when present
    let alphaMask = null;
    const smaskRef = dict.get("SMask");
    if (smaskRef) {
        const smaskResult = doc.derefStreamWithObjNum(smaskRef);
        if (smaskResult) {
            alphaMask = doc.getStreamData(smaskResult.stream, smaskResult.objNum, smaskResult.gen);
        }
    }
    return {
        index,
        width,
        height,
        bitsPerComponent: bpc,
        colorSpace,
        components,
        format,
        data,
        alphaMask,
        filter,
        name
    };
}
121
+ // =============================================================================
122
+ // Color Space Resolution
123
+ // =============================================================================
124
/**
 * Determine the colour-space name and component count of an image.
 *
 * Reads `/ColorSpace` (or the abbreviated `/CS`). The entry may be a name,
 * an array whose first element names the colour-space family, or an
 * indirect reference to either.
 *
 * - `ICCBased`: component count is `/N` of the profile stream (3 if the
 *   stream or `/N` is unavailable)
 * - `Indexed` / `I` and `Separation`: 1 component
 * - `DeviceN`: the length of the colorant-names array. That array is now
 *   dereferenced first, so an indirect names array no longer falls back to
 *   1 component.
 * - anything else is delegated to `colorSpaceInfo`
 *
 * @param dict - The image dictionary
 * @param doc - The PDF document
 * @returns `{ colorSpace, components }`; DeviceRGB with 3 components when
 *   the entry is missing or has an unexpected shape
 */
function resolveColorSpace(dict, doc) {
    const entry = dict.get("ColorSpace") ?? dict.get("CS");
    // Resolve an indirect colour space up front so both forms share one path.
    const cs = (0, pdf_parser_1.isPdfRef)(entry) ? doc.deref(entry) : entry;
    if (typeof cs === "string") {
        return colorSpaceInfo(cs);
    }
    if ((0, pdf_parser_1.isPdfArray)(cs) && cs.length > 0 && typeof cs[0] === "string") {
        const family = cs[0];
        switch (family) {
            case "ICCBased": {
                const profileStream = cs.length > 1 ? doc.derefStream(cs[1]) : null;
                const n = profileStream ? (0, pdf_parser_1.dictGetNumber)(profileStream.dict, "N") ?? 3 : 3;
                return { colorSpace: "ICCBased", components: n };
            }
            case "Indexed":
            case "I":
                return { colorSpace: "Indexed", components: 1 };
            case "Separation":
                return { colorSpace: "Separation", components: 1 };
            case "DeviceN": {
                const namesEntry = cs[1];
                const names = (0, pdf_parser_1.isPdfRef)(namesEntry) ? doc.deref(namesEntry) : namesEntry;
                return {
                    colorSpace: "DeviceN",
                    components: (0, pdf_parser_1.isPdfArray)(names) ? names.length : 1
                };
            }
            default:
                return colorSpaceInfo(family);
        }
    }
    return { colorSpace: "DeviceRGB", components: 3 };
}
170
/**
 * Describe a colour space identified by name alone.
 *
 * The name is passed through unchanged. The component count is 1 for the
 * gray family, 3 for the RGB family, 4 for CMYK, and 3 for any name not
 * listed.
 *
 * @param name - Colour-space name (full or abbreviated)
 * @returns `{ colorSpace, components }`
 */
function colorSpaceInfo(name) {
    const componentsByName = new Map([
        ["DeviceGray", 1],
        ["G", 1],
        ["CalGray", 1],
        ["DeviceRGB", 3],
        ["RGB", 3],
        ["CalRGB", 3],
        ["DeviceCMYK", 4],
        ["CMYK", 4]
    ]);
    return { colorSpace: name, components: componentsByName.get(name) ?? 3 };
}
187
+ // =============================================================================
188
+ // Helpers
189
+ // =============================================================================
190
/**
 * Pick the filter name that best identifies an image's encoding.
 *
 * Reads `/Filter` (or the abbreviated `/F`) from the stream dictionary:
 * - a single name is returned as-is;
 * - for a filter array, the last image-specific filter (DCT, JPX, CCITT,
 *   JBIG2) wins; if there is none, the first entry is returned when it is
 *   a name;
 * - otherwise the result is "".
 *
 * @param dict - The stream dictionary
 * @returns The chosen filter name, or "" when none applies
 */
function getFilterName(dict) {
    const entry = dict.get("Filter") ?? dict.get("F");
    if (typeof entry === "string") {
        return entry;
    }
    if (!(0, pdf_parser_1.isPdfArray)(entry) || !(entry.length > 0)) {
        return "";
    }
    const imageCodecs = new Set([
        "DCTDecode",
        "DCT",
        "JPXDecode",
        "CCITTFaxDecode",
        "CCF",
        "JBIG2Decode"
    ]);
    // Scan from the end so the last image-specific filter in the chain wins.
    let position = entry.length;
    while (position-- > 0) {
        const candidate = entry[position];
        if (typeof candidate === "string" && imageCodecs.has(candidate)) {
            return candidate;
        }
    }
    // No image-specific filter: fall back to the head of the chain.
    const head = entry[0];
    return typeof head === "string" ? head : "";
}