@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,48 @@
1
+ /**
2
+ * PDF form field (AcroForm) extractor.
3
+ *
4
+ * Extracts interactive form fields from a PDF's `/AcroForm` dictionary.
5
+ * Supports all standard field types:
6
+ * - **Text** (`/Tx`) — Text input fields
7
+ * - **Button** (`/Btn`) — Checkboxes, radio buttons, push buttons
8
+ * - **Choice** (`/Ch`) — Dropdowns (combo boxes) and list boxes
9
+ * - **Signature** (`/Sig`) — Digital signature fields
10
+ *
11
+ * Handles field hierarchies (parent/child), inherited values, and default appearances.
12
+ *
13
+ * @see PDF Reference 1.7, §12.7 - Interactive Forms
14
+ */
15
+ import type { PdfDocument } from "./pdf-document.js";
16
+ /** Kind of form field, derived from the field's /FT type and /Ff flags; "unknown" when /FT is not Tx, Btn, Ch or Sig. */
17
+ export type PdfFormFieldType = "text" | "checkbox" | "radio" | "dropdown" | "listbox" | "button" | "signature" | "unknown";
18
+ /** A single form field extracted from the PDF's /AcroForm field tree. */
19
+ export interface PdfFormField {
20
+ /** Fully qualified field name: partial /T names joined with "." (e.g. "form1.address.city"); "(unnamed)" when no name is available */
21
+ name: string;
22
+ /** Field type, classified from /FT and /Ff */
23
+ type: PdfFormFieldType;
24
+ /** Current value of the field (/V entry, taken from a parent field when the field itself has none); "" when absent or not representable as text */
25
+ value: string;
26
+ /** Default value (/DV entry) */
27
+ defaultValue: string;
28
+ /** Whether the field is read-only (bit 0x1 of /Ff) */
29
+ readOnly: boolean;
30
+ /** Whether the field is required (bit 0x2 of /Ff) */
31
+ required: boolean;
32
+ /** For choice fields: the display text of each /Opt entry. For radio groups with widget kids: the kids' /AP /N state names other than "Off". Empty otherwise. */
33
+ options: string[];
34
+ /** For checkboxes: the export value when checked (first /AP /N state other than "Off", defaulting to "Yes"). Always "" for other field types, including radio groups. */
35
+ exportValue: string;
36
+ /** Field flags (/Ff entry, taken from a parent field when absent) — raw bit field, 0 when not set anywhere */
37
+ flags: number;
38
+ }
39
+ /**
40
+ * Extract form fields from a PDF document.
41
+ *
42
+ * Reads the `/AcroForm` dictionary from the catalog and recursively
43
+ * traverses the field tree found under its `/Fields` array.
44
+ *
45
+ * @param doc - The PDF document
46
+ * @returns Array of extracted form fields; empty when the document has no usable `/AcroForm` or when reading it throws
47
+ */
48
+ export declare function extractFormFields(doc: PdfDocument): PdfFormField[];
@@ -0,0 +1,355 @@
1
+ /**
2
+ * PDF form field (AcroForm) extractor.
3
+ *
4
+ * Extracts interactive form fields from a PDF's `/AcroForm` dictionary.
5
+ * Supports all standard field types:
6
+ * - **Text** (`/Tx`) — Text input fields
7
+ * - **Button** (`/Btn`) — Checkboxes, radio buttons, push buttons
8
+ * - **Choice** (`/Ch`) — Dropdowns (combo boxes) and list boxes
9
+ * - **Signature** (`/Sig`) — Digital signature fields
10
+ *
11
+ * Handles field hierarchies (parent/child), inherited values, and default appearances.
12
+ *
13
+ * @see PDF Reference 1.7, §12.7 - Interactive Forms
14
+ */
15
+ import { isPdfArray, isPdfRef, dictGetName, dictGetNumber, decodePdfStringBytes } from "./pdf-parser.js";
16
+ import { getDictStringValue } from "./reader-utils.js";
17
+ // =============================================================================
18
+ // Public API
19
+ // =============================================================================
20
/**
 * Collect every AcroForm field of a PDF document.
 *
 * Looks up `/AcroForm` in the document catalog, dereferences its `/Fields`
 * array and walks each root entry through `collectFields`.
 *
 * Extraction is best-effort: a missing or malformed `/AcroForm`, or any
 * error thrown while walking the tree, produces an empty array.
 *
 * @param doc - The PDF document
 * @returns Array of extracted form fields (empty when nothing could be extracted)
 */
export function extractFormFields(doc) {
    try {
        const acroFormEntry = doc.getCatalog().get("AcroForm");
        const acroForm = acroFormEntry ? doc.derefDict(acroFormEntry) : undefined;
        const fieldsEntry = acroForm ? acroForm.get("Fields") : undefined;
        if (!fieldsEntry) {
            return [];
        }
        const rootFields = doc.deref(fieldsEntry);
        if (!isPdfArray(rootFields)) {
            return [];
        }
        const collected = [];
        // One set for the whole walk, so an object referenced twice is only visited once.
        const seenObjNums = new Set();
        for (const rootField of rootFields) {
            collectFields(rootField, doc, "", collected, seenObjNums);
        }
        return collected;
    }
    catch {
        // Deliberate best-effort: a malformed document yields "no fields" instead of an exception.
        return [];
    }
}
59
+ // =============================================================================
60
+ // Field Tree Traversal
61
+ // =============================================================================
62
/**
 * Bit masks for the field flags entry (/Ff), PDF spec §12.7.3.
 * Written as hex literals; each mask has exactly one bit set.
 */
const FLAG_READ_ONLY = 0x00001;
const FLAG_REQUIRED = 0x00002;
// Button-specific masks
const FLAG_PUSHBUTTON = 0x10000;
const FLAG_RADIO = 0x08000;
// Choice-specific mask
const FLAG_COMBO = 0x20000;
/** Upper bound on /Parent hops while resolving inherited entries, so malformed cyclic PDFs terminate */
const MAX_INHERIT_DEPTH = 20;
72
/**
 * Walk one node of the AcroForm field tree and append the fields found to `result`.
 *
 * @param fieldObj - Field node (usually an indirect reference)
 * @param doc - The PDF document used for dereferencing
 * @param parentName - Fully qualified name of the parent node ("" at the root)
 * @param result - Output array, mutated in place
 * @param visited - Object numbers already walked; guards against reference cycles
 */
function collectFields(fieldObj, doc, parentName, result, visited) {
    // Only indirect references carry an object number, so only they can be tracked.
    if (isPdfRef(fieldObj)) {
        const objNum = fieldObj.objNum;
        if (visited.has(objNum)) {
            return;
        }
        visited.add(objNum);
    }
    const fieldDict = doc.derefDict(fieldObj);
    if (!fieldDict) {
        return;
    }
    // Qualified name = parent name and own partial name (/T), joined by "." when both exist.
    const ownName = getDictStringValue(fieldDict, "T", doc);
    let qualifiedName;
    if (!parentName) {
        qualifiedName = ownName;
    }
    else if (ownName) {
        qualifiedName = `${parentName}.${ownName}`;
    }
    else {
        qualifiedName = parentName;
    }
    const kidsEntry = fieldDict.get("Kids");
    const kidList = kidsEntry ? doc.deref(kidsEntry) : undefined;
    if (kidsEntry && isPdfArray(kidList)) {
        // A kid carrying /T is a field node; kids without /T are widget annotations
        // of this field, in which case this node itself is the field.
        const kidsAreFields = kidList.some((kid) => {
            const kidDict = doc.derefDict(kid);
            return Boolean(kidDict && kidDict.has("T"));
        });
        if (kidsAreFields) {
            for (const kid of kidList) {
                collectFields(kid, doc, qualifiedName, result, visited);
            }
            return;
        }
        // Widget kids: a radio group keeps its value on the parent and its
        // export values on the kids, so it gets dedicated parsing.
        if (resolveFieldType(fieldDict, doc) === "Btn") {
            const flags = resolveFieldFlags(fieldDict, doc);
            const isRadioGroup = (flags & FLAG_RADIO) !== 0 && (flags & FLAG_PUSHBUTTON) === 0;
            if (isRadioGroup) {
                const radio = parseRadioField(fieldDict, kidList, qualifiedName, flags, doc);
                if (radio) {
                    result.push(radio);
                }
                return;
            }
        }
    }
    // Leaf field (or field whose kids are plain widgets).
    const fieldType = resolveFieldType(fieldDict, doc);
    if (!fieldType) {
        return; // No /FT anywhere up the chain: not a real field
    }
    const parsed = parseField(fieldDict, qualifiedName, fieldType, doc);
    if (parsed) {
        result.push(parsed);
    }
}
140
+ // =============================================================================
141
+ // Field Parsing
142
+ // =============================================================================
143
/**
 * Find the field type (/FT) of a field, climbing the /Parent chain when the
 * field does not define one itself.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document used for dereferencing parents
 * @param depth - Number of parent hops already taken (callers normally omit it)
 * @returns The /FT name, or undefined when none is found within MAX_INHERIT_DEPTH hops
 */
function resolveFieldType(dict, doc, depth = 0) {
    let current = dict;
    for (let level = depth;; level++) {
        const ft = dictGetName(current, "FT");
        if (ft) {
            return ft;
        }
        // The hop limit is what stops a cyclic /Parent chain.
        if (level >= MAX_INHERIT_DEPTH) {
            return undefined;
        }
        const parentEntry = current.get("Parent");
        const parentDict = parentEntry ? doc.derefDict(parentEntry) : null;
        if (!parentDict) {
            return undefined;
        }
        current = parentDict;
    }
}
164
/**
 * Find the field flags (/Ff) of a field, climbing the /Parent chain when the
 * field does not define them itself.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document used for dereferencing parents
 * @param depth - Number of parent hops already taken (callers normally omit it)
 * @returns The /Ff value, or 0 when none is found within MAX_INHERIT_DEPTH hops
 */
function resolveFieldFlags(dict, doc, depth = 0) {
    let current = dict;
    for (let level = depth;; level++) {
        const ff = dictGetNumber(current, "Ff");
        // An explicit 0 is a valid flags value, hence the undefined check.
        if (ff !== undefined) {
            return ff;
        }
        if (level >= MAX_INHERIT_DEPTH) {
            return 0;
        }
        const parentEntry = current.get("Parent");
        const parentDict = parentEntry ? doc.derefDict(parentEntry) : null;
        if (!parentDict) {
            return 0;
        }
        current = parentDict;
    }
}
184
/**
 * Build the public field record for a terminal (non-radio-group) field.
 *
 * @param dict - Field dictionary
 * @param name - Fully qualified field name ("" becomes "(unnamed)")
 * @param ft - Resolved /FT name of the field
 * @param doc - The PDF document used for dereferencing
 * @returns The extracted field record
 */
function parseField(dict, name, ft, doc) {
    const flags = resolveFieldFlags(dict, doc);
    const isSet = (mask) => (flags & mask) !== 0;
    const value = getFieldValue(dict, doc);
    const defaultValue = getDictStringValue(dict, "DV", doc);
    const type = classifyFieldType(ft, flags);
    // Only choice fields carry an option list.
    const options = ft === "Ch" ? parseChoiceOptions(dict, doc) : [];
    // A button that is neither push button nor radio is a checkbox; its export
    // value comes from the /AP /N state names.
    const isCheckbox = ft === "Btn" && !isSet(FLAG_PUSHBUTTON) && !isSet(FLAG_RADIO);
    const exportValue = isCheckbox ? parseCheckboxExportValue(dict, doc) : "";
    return {
        name: name || "(unnamed)",
        type,
        value,
        defaultValue,
        readOnly: isSet(FLAG_READ_ONLY),
        required: isSet(FLAG_REQUIRED),
        options,
        exportValue,
        flags
    };
}
210
/**
 * Build the field record for a radio group whose kids are widget annotations.
 *
 * The current and default values are read from the parent; the option list is
 * the set of /AP /N state names found on the kids, excluding "Off", in first-seen order.
 *
 * @param parentDict - Dictionary of the radio group field
 * @param kids - The group's /Kids array (widget annotations)
 * @param name - Fully qualified field name ("" becomes "(unnamed)")
 * @param ff - Resolved field flags of the group
 * @param doc - The PDF document used for dereferencing
 * @returns The extracted radio field record
 */
function parseRadioField(parentDict, kids, name, ff, doc) {
    const value = getFieldValue(parentDict, doc);
    const defaultValue = getDictStringValue(parentDict, "DV", doc);
    // A Set both de-duplicates and preserves insertion order.
    const states = new Set();
    for (const kid of kids) {
        const kidDict = doc.derefDict(kid);
        const appearance = kidDict ? doc.derefDict(kidDict.get("AP")) : null;
        const normalStates = appearance ? doc.derefDict(appearance.get("N")) : null;
        if (!normalStates) {
            continue;
        }
        // Keys of /AP /N are the possible states (e.g. "Choice1", "Off").
        for (const state of normalStates.keys()) {
            if (state !== "Off") {
                states.add(state);
            }
        }
    }
    return {
        name: name || "(unnamed)",
        type: "radio",
        value,
        defaultValue,
        readOnly: (ff & FLAG_READ_ONLY) !== 0,
        required: (ff & FLAG_REQUIRED) !== 0,
        options: [...states],
        exportValue: "",
        flags: ff
    };
}
247
/**
 * Map a /FT name plus field flags to the public field type label.
 *
 * @param ft - Field type name ("Tx", "Btn", "Ch", "Sig", ...)
 * @param ff - Field flags bit field
 * @returns One of "text", "button", "radio", "checkbox", "dropdown", "listbox", "signature", "unknown"
 */
function classifyFieldType(ft, ff) {
    const isSet = (mask) => (ff & mask) !== 0;
    if (ft === "Tx") {
        return "text";
    }
    if (ft === "Btn") {
        // The push-button flag is checked before the radio flag.
        if (isSet(FLAG_PUSHBUTTON)) {
            return "button";
        }
        return isSet(FLAG_RADIO) ? "radio" : "checkbox";
    }
    if (ft === "Ch") {
        return isSet(FLAG_COMBO) ? "dropdown" : "listbox";
    }
    if (ft === "Sig") {
        return "signature";
    }
    return "unknown";
}
270
/**
 * Read the option labels of a choice field from its /Opt array.
 *
 * Each entry is either a text value or an `[exportValue, displayValue]` pair;
 * for pairs the display value is used. Entries of any other shape are skipped.
 *
 * @param dict - Choice field dictionary
 * @param doc - The PDF document used for dereferencing
 * @returns Option labels in document order (empty when /Opt is missing or not an array)
 */
function parseChoiceOptions(dict, doc) {
    const optEntry = dict.get("Opt");
    if (!optEntry) {
        return [];
    }
    const optList = doc.deref(optEntry);
    if (!isPdfArray(optList)) {
        return [];
    }
    // Text arrives either as a JS string or as raw string bytes.
    const asText = (candidate) => {
        if (typeof candidate === "string") {
            return candidate;
        }
        if (candidate instanceof Uint8Array) {
            return decodePdfStringBytes(candidate);
        }
        return undefined;
    };
    const labels = [];
    for (const entry of optList) {
        const resolved = doc.deref(entry);
        let label = asText(resolved);
        if (label === undefined && isPdfArray(resolved) && resolved.length >= 2) {
            // [exportValue, displayValue] pair: keep the display half.
            label = asText(doc.deref(resolved[1]));
        }
        if (label !== undefined) {
            labels.push(label);
        }
    }
    return labels;
}
301
/**
 * Determine a checkbox's export ("on") value.
 *
 * The export value is the first state name in /AP /N that is not "Off".
 * When the appearance dictionaries are missing, or only "Off" exists,
 * "Yes" is returned as the default.
 *
 * @param dict - Checkbox field dictionary
 * @param doc - The PDF document used for dereferencing
 * @returns The export value
 */
function parseCheckboxExportValue(dict, doc) {
    const fallback = "Yes";
    const appearance = doc.derefDict(dict.get("AP"));
    const normalStates = appearance ? doc.derefDict(appearance.get("N")) : null;
    if (!normalStates) {
        return fallback;
    }
    const onState = Array.from(normalStates.keys()).find((state) => state !== "Off");
    return onState ?? fallback;
}
318
+ // =============================================================================
319
+ // Value Extraction
320
+ // =============================================================================
321
/**
 * Get a field's value (/V entry) as text, climbing the /Parent chain when the
 * field has no /V of its own.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document used for dereferencing
 * @param depth - Number of parent hops already taken (callers normally omit it)
 * @returns The value converted by `resolveValue`, or "" when no /V is found within MAX_INHERIT_DEPTH hops
 */
function getFieldValue(dict, doc, depth = 0) {
    let current = dict;
    for (let level = depth;; level++) {
        const raw = current.get("V");
        if (raw !== undefined) {
            return resolveValue(raw, doc);
        }
        // The hop limit is what stops a cyclic /Parent chain.
        if (level >= MAX_INHERIT_DEPTH) {
            return "";
        }
        const parentEntry = current.get("Parent");
        const parentDict = parentEntry ? doc.derefDict(parentEntry) : null;
        if (!parentDict) {
            return "";
        }
        current = parentDict;
    }
}
340
/**
 * Convert a raw /V value to text.
 *
 * Strings pass through, string bytes are decoded, numbers and booleans are
 * stringified; every other kind of value becomes "".
 *
 * @param val - Raw value (may be an indirect reference)
 * @param doc - The PDF document used for dereferencing
 * @returns Text form of the value
 */
function resolveValue(val, doc) {
    const resolved = doc.deref(val);
    if (resolved instanceof Uint8Array) {
        return decodePdfStringBytes(resolved);
    }
    switch (typeof resolved) {
        case "string":
            return resolved;
        case "number":
        case "boolean":
            return String(resolved);
        default:
            return "";
    }
}
@@ -0,0 +1,55 @@
1
+ /**
2
+ * PDF image extraction.
3
+ *
4
+ * Extracts images from PDF pages including:
5
+ * - Inline images (BI/ID/EI operators) — note: `extractImagesFromPage` reads only the page's /XObject resources
6
+ * - XObject images (/Subtype /Image)
7
+ * - Images with various color spaces and filters
8
+ *
9
+ * Supported image formats:
10
+ * - JPEG (DCTDecode) — extracted as-is
11
+ * - JPEG2000 (JPXDecode) — extracted as-is
12
+ * - Raw/Flate-compressed pixel data — extracted with metadata
13
+ * - CCITT fax and JBIG2 (JBIG2Decode) — extracted as-is
14
+ *
15
+ * @see PDF Reference 1.7, §4.8 - Images
16
+ */
17
+ import type { PdfDocument } from "./pdf-document.js";
18
+ import type { PdfDictValue } from "./pdf-parser.js";
19
+ /**
20
+ * An image XObject extracted from a PDF page's resources.
21
+ */
22
+ export interface ExtractedImage {
23
+ /** Image index within the page (0-based), counting only XObjects that produced an image */
24
+ index: number;
25
+ /** Image width in pixels (/Width) */
26
+ width: number;
27
+ /** Image height in pixels (/Height) */
28
+ height: number;
29
+ /** Bits per component (/BitsPerComponent, 8 when absent) */
30
+ bitsPerComponent: number;
31
+ /** Color space name ("DeviceRGB" when none can be determined) */
32
+ colorSpace: string;
33
+ /** Number of color components (1=gray, 3=RGB, 4=CMYK) */
34
+ components: number;
35
+ /**
36
+ * Image data format, chosen from the stream's filter:
37
+ * - "jpeg" — JPEG data (DCTDecode)
38
+ * - "jpx" — JPEG 2000 data (JPXDecode)
39
+ * - "raw" — pixel data of any stream without an image-specific filter
40
+ * - "ccitt" — CCITT fax compressed data (CCITTFaxDecode)
+ * - "jbig2" — JBIG2 compressed data (JBIG2Decode)
41
+ */
42
+ format: "jpeg" | "jpx" | "raw" | "ccitt" | "jbig2";
43
+ /** The image data as returned by the document's stream reader */
44
+ data: Uint8Array;
45
+ /** Data of the /SMask stream, or null when the image has none. NOTE(review): the extractor does not check that the mask matches the image dimensions or is 1 component / 8 bits — confirm before relying on it */
46
+ alphaMask: Uint8Array | null;
47
+ /** Filter name from the original stream ("" when none); for a filter chain, the last image-specific filter, otherwise the first filter */
48
+ filter: string;
49
+ /** XObject name (its key in the page's /XObject resource dictionary) */
50
+ name: string;
51
+ }
52
+ /**
53
+ * Extract the image XObjects (/Subtype /Image) listed in a PDF page's /XObject resources.
54
+ */
55
+ export declare function extractImagesFromPage(pageDict: PdfDictValue, doc: PdfDocument): ExtractedImage[];
@@ -0,0 +1,220 @@
1
+ /**
2
+ * PDF image extraction.
3
+ *
4
+ * Extracts images from PDF pages including:
5
+ * - Inline images (BI/ID/EI operators) — note: `extractImagesFromPage` reads only the page's /XObject resources
6
+ * - XObject images (/Subtype /Image)
7
+ * - Images with various color spaces and filters
8
+ *
9
+ * Supported image formats:
10
+ * - JPEG (DCTDecode) — extracted as-is
11
+ * - JPEG2000 (JPXDecode) — extracted as-is
12
+ * - Raw/Flate-compressed pixel data — extracted with metadata
13
+ * - CCITT fax and JBIG2 (JBIG2Decode) — extracted as-is
14
+ *
15
+ * @see PDF Reference 1.7, §4.8 - Images
16
+ */
17
+ import { isPdfRef, isPdfArray, dictGetName, dictGetNumber } from "./pdf-parser.js";
18
+ // =============================================================================
19
+ // Public API
20
+ // =============================================================================
21
/**
 * Extract the image XObjects of a PDF page.
 *
 * Walks the page's `/XObject` resource dictionary and keeps every stream whose
 * `/Subtype` is `Image` and that `extractImage` can turn into a record.
 *
 * @param pageDict - The page dictionary
 * @param doc - The PDF document used for resource resolution and dereferencing
 * @returns Extracted images, indexed from 0 in iteration order
 */
export function extractImagesFromPage(pageDict, doc) {
    // Resource lookup (including inheritance) is delegated to the document.
    const xobjectsEntry = doc.resolvePageResources(pageDict).get("XObject");
    const xobjDict = xobjectsEntry ? doc.derefDict(xobjectsEntry) : null;
    if (!xobjDict) {
        return [];
    }
    const images = [];
    for (const [name, ref] of xobjDict) {
        const resolved = doc.derefStreamWithObjNum(ref);
        if (!resolved) {
            continue;
        }
        const { stream, objNum, gen } = resolved;
        if (dictGetName(stream.dict, "Subtype") !== "Image") {
            continue;
        }
        // The next index always equals the number of images collected so far.
        const image = extractImage(name, stream, stream.dict, doc, images.length, objNum, gen);
        if (image) {
            images.push(image);
        }
    }
    return images;
}
55
+ // =============================================================================
56
+ // Image Extraction
57
+ // =============================================================================
58
/**
 * Build an image record from one image stream.
 *
 * @param name - XObject name of the image
 * @param stream - The image stream
 * @param dict - The stream dictionary
 * @param doc - The PDF document used for stream decoding and dereferencing
 * @param index - Index to assign to the image
 * @param objNum - Object number of the stream, passed on to `getStreamData` (default 0)
 * @param gen - Generation number of the stream, passed on to `getStreamData` (default 0)
 * @returns The image record, or null when width or height is 0 or missing
 */
function extractImage(name, stream, dict, doc, index, objNum = 0, gen = 0) {
    // Each entry is looked up under its full key first, then its abbreviated key.
    const width = dictGetNumber(dict, "Width") ?? dictGetNumber(dict, "W") ?? 0;
    const height = dictGetNumber(dict, "Height") ?? dictGetNumber(dict, "H") ?? 0;
    const bitsPerComponent = dictGetNumber(dict, "BitsPerComponent") ?? dictGetNumber(dict, "BPC") ?? 8;
    if (width === 0 || height === 0) {
        return null;
    }
    const { colorSpace, components } = resolveColorSpace(dict, doc);
    const filter = getFilterName(dict);
    // Image-specific filters determine the reported format; everything else is "raw".
    const formatByFilter = new Map([
        ["DCTDecode", "jpeg"],
        ["DCT", "jpeg"],
        ["JPXDecode", "jpx"],
        ["CCITTFaxDecode", "ccitt"],
        ["CCF", "ccitt"],
        ["JBIG2Decode", "jbig2"]
    ]);
    // Every format reads its bytes through the same document call.
    const data = doc.getStreamData(stream, objNum, gen);
    const format = formatByFilter.get(filter) ?? "raw";
    // Soft mask (alpha channel), when the image references one.
    let alphaMask = null;
    const smaskEntry = dict.get("SMask");
    const smask = smaskEntry ? doc.derefStreamWithObjNum(smaskEntry) : null;
    if (smask) {
        alphaMask = doc.getStreamData(smask.stream, smask.objNum, smask.gen);
    }
    return {
        index,
        width,
        height,
        bitsPerComponent,
        colorSpace,
        components,
        format,
        data,
        alphaMask,
        filter,
        name
    };
}
118
+ // =============================================================================
119
+ // Color Space Resolution
120
+ // =============================================================================
121
/**
 * Determine the color space name and component count of an image.
 *
 * Reads `/ColorSpace` (or the abbreviated `/CS`) from the image dictionary.
 * The entry may be a name, an array whose first element is the color space
 * family, or an indirect reference to either.
 *
 * Component counts for array color spaces:
 * - ICCBased: `/N` of the profile stream (3 when unavailable)
 * - Indexed, Separation: 1
 * - DeviceN: length of the names array (1 when it cannot be read)
 * - anything else: as reported by `colorSpaceInfo` for the family name
 *
 * @param dict - Image dictionary
 * @param doc - The PDF document used for dereferencing
 * @returns `{ colorSpace, components }`; DeviceRGB with 3 components when the entry is missing or unrecognized
 */
function resolveColorSpace(dict, doc) {
    let cs = dict.get("ColorSpace") ?? dict.get("CS");
    // Resolve an indirect entry up front so names and arrays share one code path.
    if (isPdfRef(cs)) {
        cs = doc.deref(cs);
    }
    if (typeof cs === "string") {
        return colorSpaceInfo(cs);
    }
    if (!isPdfArray(cs) || cs.length === 0 || typeof cs[0] !== "string") {
        return { colorSpace: "DeviceRGB", components: 3 };
    }
    const family = cs[0];
    if (family === "ICCBased") {
        // The component count lives in the ICC profile stream's dictionary.
        const profileStream = cs.length > 1 ? doc.derefStream(cs[1]) : null;
        const n = profileStream ? dictGetNumber(profileStream.dict, "N") ?? 3 : 3;
        return { colorSpace: "ICCBased", components: n };
    }
    if (family === "Indexed" || family === "I") {
        return { colorSpace: "Indexed", components: 1 };
    }
    if (family === "Separation") {
        return { colorSpace: "Separation", components: 1 };
    }
    if (family === "DeviceN") {
        // The names array may itself be an indirect reference, so dereference
        // it before counting; otherwise the count would silently fall back to 1.
        const names = cs.length > 1 ? doc.deref(cs[1]) : undefined;
        return { colorSpace: "DeviceN", components: isPdfArray(names) ? names.length : 1 };
    }
    return colorSpaceInfo(family);
}
167
/**
 * Component count for a color space given by name.
 *
 * Gray spaces ("DeviceGray", "G", "CalGray") have 1 component, CMYK spaces
 * ("DeviceCMYK", "CMYK") have 4; every other name, including the RGB spaces,
 * is reported with 3.
 *
 * @param name - Color space name (full or abbreviated)
 * @returns `{ colorSpace: name, components }`
 */
function colorSpaceInfo(name) {
    let components = 3;
    if (name === "DeviceGray" || name === "G" || name === "CalGray") {
        components = 1;
    }
    else if (name === "DeviceCMYK" || name === "CMYK") {
        components = 4;
    }
    return { colorSpace: name, components };
}
184
+ // =============================================================================
185
+ // Helpers
186
+ // =============================================================================
187
/**
 * Pick the filter name that best describes an image stream.
 *
 * Reads `/Filter` (or the abbreviated `/F`). A single name is returned as is.
 * For a filter array, the last image-specific filter (DCT, JPX, CCITT, JBIG2)
 * wins; when the array has none, its first element is returned if it is a name.
 *
 * @param dict - Stream dictionary
 * @returns The filter name, or "" when no usable filter entry exists
 */
function getFilterName(dict) {
    const entry = dict.get("Filter") ?? dict.get("F");
    if (typeof entry === "string") {
        return entry;
    }
    if (!isPdfArray(entry) || entry.length === 0) {
        return "";
    }
    const imageFilters = ["DCTDecode", "DCT", "JPXDecode", "CCITTFaxDecode", "CCF", "JBIG2Decode"];
    // Search a reversed copy so the first hit is the last image filter in the chain.
    const imageSpecific = [...entry]
        .reverse()
        .find((candidate) => typeof candidate === "string" && imageFilters.includes(candidate));
    if (imageSpecific !== undefined) {
        return imageSpecific;
    }
    // No image-specific filter: describe the stream by the first filter in the chain.
    const [head] = entry;
    return typeof head === "string" ? head : "";
}