@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PDF form field (AcroForm) extractor.
|
|
4
|
+
*
|
|
5
|
+
* Extracts interactive form fields from a PDF's `/AcroForm` dictionary.
|
|
6
|
+
* Supports all standard field types:
|
|
7
|
+
* - **Text** (`/Tx`) — Text input fields
|
|
8
|
+
* - **Button** (`/Btn`) — Checkboxes, radio buttons, push buttons
|
|
9
|
+
* - **Choice** (`/Ch`) — Dropdowns (combo boxes) and list boxes
|
|
10
|
+
* - **Signature** (`/Sig`) — Digital signature fields
|
|
11
|
+
*
|
|
12
|
+
* Handles field hierarchies (parent/child), inherited values, and default appearances.
|
|
13
|
+
*
|
|
14
|
+
* @see PDF Reference 1.7, §12.7 - Interactive Forms
|
|
15
|
+
*/
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.extractFormFields = extractFormFields;
|
|
18
|
+
const pdf_parser_1 = require("./pdf-parser");
|
|
19
|
+
const reader_utils_1 = require("./reader-utils");
|
|
20
|
+
// =============================================================================
|
|
21
|
+
// Public API
|
|
22
|
+
// =============================================================================
|
|
23
|
+
/**
 * Collect every interactive form field declared by a PDF document.
 *
 * Looks up `/AcroForm` on the catalog, resolves its `/Fields` array and
 * hands each top-level entry to `collectFields`, which walks the field tree.
 * Extraction is best-effort: a missing or non-array entry, or any exception
 * raised along the way, produces an empty array instead of an error.
 *
 * @param doc - The PDF document
 * @returns Array of extracted form fields (empty when there are none)
 */
function extractFormFields(doc) {
    try {
        const acroFormRef = doc.getCatalog().get("AcroForm");
        const acroForm = acroFormRef ? doc.derefDict(acroFormRef) : undefined;
        const fieldsRef = acroForm ? acroForm.get("Fields") : undefined;
        if (!fieldsRef) {
            return [];
        }
        const rootFields = doc.deref(fieldsRef);
        const { isPdfArray } = pdf_parser_1;
        if (!isPdfArray(rootFields)) {
            return [];
        }
        const collected = [];
        // A single set is shared by all top-level entries of the walk
        const seen = new Set();
        for (const rootField of rootFields) {
            collectFields(rootField, doc, "", collected, seen);
        }
        return collected;
    }
    catch {
        // Deliberate best-effort: malformed forms yield no fields
        return [];
    }
}
|
|
62
|
+
// =============================================================================
|
|
63
|
+
// Field Tree Traversal
|
|
64
|
+
// =============================================================================
|
|
65
|
+
/** Field flag (`/Ff`) bit masks from PDF spec §12.7.3 */
const FLAG_READ_ONLY = 0x1;
const FLAG_REQUIRED = 0x2;
// Button-specific
const FLAG_PUSHBUTTON = 0x10000;
const FLAG_RADIO = 0x8000;
// Choice-specific
const FLAG_COMBO = 0x20000;
/** Maximum number of `/Parent` hops followed when resolving inherited entries, so cycles in malformed PDFs terminate */
const MAX_INHERIT_DEPTH = 20;
|
|
75
|
+
/**
 * Walk one node of the field tree and append the fields it yields to `result`.
 *
 * A node whose `/Kids` contain at least one entry with `/T` is treated as an
 * intermediate node and every kid is walked recursively. Otherwise the node
 * itself is the field: radio groups go through `parseRadioField`, everything
 * else through `parseField`.
 *
 * @param fieldObj - Field dictionary, or a reference to one
 * @param doc - The PDF document
 * @param parentName - Fully qualified name of the parent ("" at the root)
 * @param result - Output array; mutated in place
 * @param visited - Object numbers already walked; mutated in place
 */
function collectFields(fieldObj, doc, parentName, result, visited) {
    const { isPdfRef, isPdfArray } = pdf_parser_1;
    // Only indirect references are tracked for cycle detection
    if (isPdfRef(fieldObj)) {
        const { objNum } = fieldObj;
        if (visited.has(objNum)) {
            return;
        }
        visited.add(objNum);
    }
    const dict = doc.derefDict(fieldObj);
    if (!dict) {
        return;
    }
    // Qualified name: parent and own /T joined by ".", skipping an empty part
    const { getDictStringValue } = reader_utils_1;
    const ownName = getDictStringValue(dict, "T", doc);
    let qualifiedName = ownName;
    if (parentName) {
        qualifiedName = ownName ? `${parentName}.${ownName}` : parentName;
    }
    const kidsRef = dict.get("Kids");
    const kids = kidsRef ? doc.deref(kidsRef) : undefined;
    if (kidsRef && isPdfArray(kids)) {
        // Kids with /T are field nodes; kids without /T are widget annotations of this field
        let kidsAreFields = false;
        for (const kid of kids) {
            const kidDict = doc.derefDict(kid);
            if (kidDict && kidDict.has("T")) {
                kidsAreFields = true;
                break;
            }
        }
        if (kidsAreFields) {
            for (const kid of kids) {
                collectFields(kid, doc, qualifiedName, result, visited);
            }
            return;
        }
        // Widget kids: a radio group keeps its value on the parent and its export values on the kids
        if (resolveFieldType(dict, doc) === "Btn") {
            const flags = resolveFieldFlags(dict, doc);
            const isRadioGroup = (flags & FLAG_RADIO) !== 0 && (flags & FLAG_PUSHBUTTON) === 0;
            if (isRadioGroup) {
                const radio = parseRadioField(dict, kids, qualifiedName, flags, doc);
                if (radio) {
                    result.push(radio);
                }
                return;
            }
        }
    }
    // Leaf field (or field with widget kids): needs a resolvable /FT to count as a field
    const fieldType = resolveFieldType(dict, doc);
    if (!fieldType) {
        return;
    }
    const leaf = parseField(dict, qualifiedName, fieldType, doc);
    if (leaf) {
        result.push(leaf);
    }
}
|
|
143
|
+
// =============================================================================
|
|
144
|
+
// Field Parsing
|
|
145
|
+
// =============================================================================
|
|
146
|
+
/**
 * Resolve the field type (`/FT`) of a field, following `/Parent` links
 * when the entry is not present on the dictionary itself.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document (used to dereference `/Parent`)
 * @param depth - Number of parent hops already taken
 * @returns The `/FT` name, or `undefined` when none is found within
 *          `MAX_INHERIT_DEPTH` hops
 */
function resolveFieldType(dict, doc, depth = 0) {
    const { dictGetName } = pdf_parser_1;
    let node = dict;
    for (let level = depth;; level++) {
        const fieldType = dictGetName(node, "FT");
        if (fieldType) {
            return fieldType;
        }
        // Hop budget exhausted: stop before looking at yet another parent
        if (level >= MAX_INHERIT_DEPTH) {
            return undefined;
        }
        const parentRef = node.get("Parent");
        const parentDict = parentRef ? doc.derefDict(parentRef) : undefined;
        if (!parentDict) {
            return undefined;
        }
        node = parentDict;
    }
}
|
|
167
|
+
/**
 * Resolve the field flags (`/Ff`) of a field, following `/Parent` links
 * when the entry is not present on the dictionary itself.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document (used to dereference `/Parent`)
 * @param depth - Number of parent hops already taken
 * @returns The `/Ff` value, or `0` when none is found within
 *          `MAX_INHERIT_DEPTH` hops
 */
function resolveFieldFlags(dict, doc, depth = 0) {
    const { dictGetNumber } = pdf_parser_1;
    let node = dict;
    for (let level = depth;; level++) {
        const flags = dictGetNumber(node, "Ff");
        // An explicit 0 is a real value and must stop the search
        if (flags !== undefined) {
            return flags;
        }
        if (level >= MAX_INHERIT_DEPTH) {
            return 0;
        }
        const parentRef = node.get("Parent");
        const parentDict = parentRef ? doc.derefDict(parentRef) : undefined;
        if (!parentDict) {
            return 0;
        }
        node = parentDict;
    }
}
|
|
187
|
+
/**
 * Build the extracted-field record for a non-radio-group field.
 *
 * @param dict - Field dictionary
 * @param name - Fully qualified field name (may be empty)
 * @param ft - Resolved `/FT` field type name
 * @param doc - The PDF document
 * @returns Field record; `options` is filled only for choice fields and
 *          `exportValue` only for checkboxes
 */
function parseField(dict, name, ft, doc) {
    const flags = resolveFieldFlags(dict, doc);
    const value = getFieldValue(dict, doc);
    const { getDictStringValue } = reader_utils_1;
    const defaultValue = getDictStringValue(dict, "DV", doc);
    const type = classifyFieldType(ft, flags);
    const options = ft === "Ch" ? parseChoiceOptions(dict, doc) : [];
    // A button that is neither a push button nor a radio button is a checkbox
    const isCheckbox = ft === "Btn" && (flags & (FLAG_PUSHBUTTON | FLAG_RADIO)) === 0;
    const exportValue = isCheckbox ? parseCheckboxExportValue(dict, doc) : "";
    return {
        name: name || "(unnamed)",
        type,
        value,
        defaultValue,
        readOnly: (flags & FLAG_READ_ONLY) !== 0,
        required: (flags & FLAG_REQUIRED) !== 0,
        options,
        exportValue,
        flags
    };
}
|
|
213
|
+
/**
 * Build the extracted-field record for a radio button group.
 *
 * The current and default values are read from the parent dictionary; the
 * selectable options are the distinct non-"Off" keys found in each kid's
 * `/AP` `/N` dictionary, in first-seen order.
 *
 * @param parentDict - Dictionary of the radio group field
 * @param kids - The group's `/Kids` entries (widgets)
 * @param name - Fully qualified field name (may be empty)
 * @param ff - Resolved field flags
 * @param doc - The PDF document
 * @returns Field record with `type: "radio"`
 */
function parseRadioField(parentDict, kids, name, ff, doc) {
    const value = getFieldValue(parentDict, doc);
    const { getDictStringValue } = reader_utils_1;
    const defaultValue = getDictStringValue(parentDict, "DV", doc);
    // A Set keeps insertion order and drops states repeated across kids
    const states = new Set();
    for (const kid of kids) {
        const kidDict = doc.derefDict(kid);
        const apDict = kidDict ? doc.derefDict(kidDict.get("AP")) : undefined;
        const normalDict = apDict ? doc.derefDict(apDict.get("N")) : undefined;
        if (!normalDict) {
            continue;
        }
        // Keys of /AP /N are the possible states (e.g. "Choice1", "Off")
        for (const state of normalDict.keys()) {
            if (state !== "Off") {
                states.add(state);
            }
        }
    }
    return {
        name: name || "(unnamed)",
        type: "radio",
        value,
        defaultValue,
        readOnly: (ff & FLAG_READ_ONLY) !== 0,
        required: (ff & FLAG_REQUIRED) !== 0,
        options: [...states],
        exportValue: "",
        flags: ff
    };
}
|
|
250
|
+
/**
 * Map a PDF field type name plus field flags to the extractor's type label.
 *
 * @param ft - `/FT` name ("Tx", "Btn", "Ch", "Sig", ...)
 * @param ff - Field flags
 * @returns "text", "button", "radio", "checkbox", "dropdown", "listbox",
 *          "signature", or "unknown" for any other `ft`
 */
function classifyFieldType(ft, ff) {
    const isSet = (mask) => (ff & mask) !== 0;
    if (ft === "Tx") {
        return "text";
    }
    if (ft === "Sig") {
        return "signature";
    }
    if (ft === "Btn") {
        // Push button wins over radio when both bits are set
        if (isSet(FLAG_PUSHBUTTON)) {
            return "button";
        }
        return isSet(FLAG_RADIO) ? "radio" : "checkbox";
    }
    if (ft === "Ch") {
        return isSet(FLAG_COMBO) ? "dropdown" : "listbox";
    }
    return "unknown";
}
|
|
273
|
+
/**
 * Read the display strings of a choice field's `/Opt` array.
 *
 * Each entry is either a text value, or an array whose second element is
 * the display text (`[exportValue, displayValue]`). Entries of any other
 * shape are skipped.
 *
 * @param dict - Choice field dictionary
 * @param doc - The PDF document
 * @returns Display strings in `/Opt` order (empty when `/Opt` is absent or
 *          not an array)
 */
function parseChoiceOptions(dict, doc) {
    const optRef = dict.get("Opt");
    if (!optRef) {
        return [];
    }
    const entries = doc.deref(optRef);
    const { isPdfArray, decodePdfStringBytes } = pdf_parser_1;
    if (!isPdfArray(entries)) {
        return [];
    }
    const labels = [];
    // Appends `candidate` when it is textual; reports whether it was
    const appendText = (candidate) => {
        if (typeof candidate === "string") {
            labels.push(candidate);
            return true;
        }
        if (candidate instanceof Uint8Array) {
            labels.push(decodePdfStringBytes(candidate));
            return true;
        }
        return false;
    };
    for (const entry of entries) {
        const resolved = doc.deref(entry);
        if (appendText(resolved)) {
            continue;
        }
        if (isPdfArray(resolved) && resolved.length >= 2) {
            // [exportValue, displayValue] pair: keep the display text
            appendText(doc.deref(resolved[1]));
        }
    }
    return labels;
}
|
|
304
|
+
/**
 * Determine a checkbox's export ("on") value.
 *
 * The export value is the first key of the `/AP` `/N` dictionary that is
 * not "Off". When `/AP` or `/N` is missing, or has no such key, "Yes" is
 * returned as the fallback.
 *
 * @param dict - Checkbox field dictionary
 * @param doc - The PDF document
 * @returns The export value name
 */
function parseCheckboxExportValue(dict, doc) {
    const apDict = doc.derefDict(dict.get("AP"));
    const normalDict = apDict ? doc.derefDict(apDict.get("N")) : undefined;
    if (normalDict) {
        for (const state of normalDict.keys()) {
            if (state !== "Off") {
                return state;
            }
        }
    }
    return "Yes";
}
|
|
321
|
+
// =============================================================================
|
|
322
|
+
// Value Extraction
|
|
323
|
+
// =============================================================================
|
|
324
|
+
/**
 * Get the field value (`/V`), following `/Parent` links when the entry is
 * not present on the dictionary itself.
 *
 * @param dict - Field dictionary to start from
 * @param doc - The PDF document
 * @param depth - Number of parent hops already taken
 * @returns The value converted by `resolveValue`, or "" when no `/V` is
 *          found within `MAX_INHERIT_DEPTH` hops
 */
function getFieldValue(dict, doc, depth = 0) {
    let node = dict;
    for (let level = depth;; level++) {
        const raw = node.get("V");
        if (raw !== undefined) {
            return resolveValue(raw, doc);
        }
        if (level >= MAX_INHERIT_DEPTH) {
            return "";
        }
        const parentRef = node.get("Parent");
        const parentDict = parentRef ? doc.derefDict(parentRef) : undefined;
        if (!parentDict) {
            return "";
        }
        node = parentDict;
    }
}
|
|
343
|
+
/**
 * Convert a raw `/V` entry to a string.
 *
 * @param val - Raw value (possibly an indirect reference)
 * @param doc - The PDF document (used to dereference `val`)
 * @returns Strings as-is, byte strings decoded, numbers via `String()`,
 *          booleans as "true"/"false", and "" for anything else
 */
function resolveValue(val, doc) {
    const resolved = doc.deref(val);
    if (resolved instanceof Uint8Array) {
        const { decodePdfStringBytes } = pdf_parser_1;
        return decodePdfStringBytes(resolved);
    }
    switch (typeof resolved) {
        case "string":
            return resolved;
        case "number":
            return String(resolved);
        case "boolean":
            return resolved ? "true" : "false";
        default:
            // Arrays, dictionaries, null, etc. have no string form here
            return "";
    }
}
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PDF image extraction.
|
|
4
|
+
*
|
|
5
|
+
* Extracts images from PDF pages including:
|
|
6
|
+
* - Inline images (BI/ID/EI operators)
|
|
7
|
+
* - XObject images (/Subtype /Image)
|
|
8
|
+
* - Images with various color spaces and filters
|
|
9
|
+
*
|
|
10
|
+
* Supported image formats:
|
|
11
|
+
* - JPEG (DCTDecode) — extracted as-is
|
|
12
|
+
* - JPEG2000 (JPXDecode) — extracted as-is
|
|
13
|
+
* - Raw/Flate-compressed pixel data — extracted with metadata
|
|
14
|
+
* - CCITT fax — extracted as-is
|
|
15
|
+
*
|
|
16
|
+
* @see PDF Reference 1.7, §4.8 - Images
|
|
17
|
+
*/
|
|
18
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
19
|
+
exports.extractImagesFromPage = extractImagesFromPage;
|
|
20
|
+
const pdf_parser_1 = require("./pdf-parser");
|
|
21
|
+
// =============================================================================
|
|
22
|
+
// Public API
|
|
23
|
+
// =============================================================================
|
|
24
|
+
/**
 * Extract all image XObjects referenced by a PDF page's resources.
 *
 * Iterates the `/XObject` resource dictionary, keeps the streams whose
 * `/Subtype` is `Image`, and converts each with `extractImage`. The
 * `index` given to each image is its position in the returned array.
 *
 * @param pageDict - The page dictionary
 * @param doc - The PDF document
 * @returns Extracted images (empty when the page has no `/XObject` resources)
 */
function extractImagesFromPage(pageDict, doc) {
    const images = [];
    // Resource lookup is delegated to the document
    const xobjectsRef = doc.resolvePageResources(pageDict).get("XObject");
    const xobjDict = xobjectsRef ? doc.derefDict(xobjectsRef) : undefined;
    if (!xobjDict) {
        return images;
    }
    for (const [name, ref] of xobjDict) {
        const resolved = doc.derefStreamWithObjNum(ref);
        if (!resolved) {
            continue;
        }
        const { stream } = resolved;
        const streamDict = stream.dict;
        const { dictGetName } = pdf_parser_1;
        if (dictGetName(streamDict, "Subtype") !== "Image") {
            continue;
        }
        // images.length is the next free index: it only grows on success
        const image = extractImage(name, stream, streamDict, doc, images.length, resolved.objNum, resolved.gen);
        if (image) {
            images.push(image);
        }
    }
    return images;
}
|
|
58
|
+
// =============================================================================
|
|
59
|
+
// Image Extraction
|
|
60
|
+
// =============================================================================
|
|
61
|
+
/**
 * Build the extracted-image record for one image stream.
 *
 * Both the full dictionary keys and their abbreviated forms (`W`, `H`,
 * `BPC`, `IM`, ...) are accepted. The pixel/codec data always comes from
 * `doc.getStreamData`; `format` only labels what that data is, based on
 * the image-specific filter reported by `getFilterName`.
 *
 * Stencil masks (`/ImageMask true`) are reported as 1 bit per component
 * with a single component: the PDF spec fixes their depth at 1 bit and
 * forbids a `/ColorSpace`, so the generic 8-bit / DeviceRGB defaults would
 * misdescribe the sample data. They are labelled "DeviceGray" as the
 * closest single-component space.
 *
 * @param name - Resource name of the image
 * @param stream - The image stream
 * @param dict - The stream's dictionary
 * @param doc - The PDF document
 * @param index - Position of this image among the page's extracted images
 * @param objNum - Object number of the stream (passed to `getStreamData`)
 * @param gen - Generation number of the stream (passed to `getStreamData`)
 * @returns The image record, or `null` when width or height is missing/zero
 */
function extractImage(name, stream, dict, doc, index, objNum = 0, gen = 0) {
    const { dictGetNumber, isPdfRef } = pdf_parser_1;
    const width = dictGetNumber(dict, "Width") ?? dictGetNumber(dict, "W") ?? 0;
    const height = dictGetNumber(dict, "Height") ?? dictGetNumber(dict, "H") ?? 0;
    if (width === 0 || height === 0) {
        return null;
    }
    const maskEntry = dict.get("ImageMask") ?? dict.get("IM");
    const isStencilMask = maskEntry === true || (isPdfRef(maskEntry) && doc.deref(maskEntry) === true);
    // Stencil masks are always 1 bit deep, even when /BitsPerComponent is omitted
    const bpc = isStencilMask
        ? 1
        : (dictGetNumber(dict, "BitsPerComponent") ?? dictGetNumber(dict, "BPC") ?? 8);
    // Stencil masks carry no /ColorSpace; describe them as one gray component
    const { colorSpace, components } = isStencilMask
        ? { colorSpace: "DeviceGray", components: 1 }
        : resolveColorSpace(dict, doc);
    // The image-specific filter decides the format label; anything else is raw samples
    const filter = getFilterName(dict);
    const formatByFilter = new Map([
        ["DCTDecode", "jpeg"],
        ["DCT", "jpeg"],
        ["JPXDecode", "jpx"],
        ["CCITTFaxDecode", "ccitt"],
        ["CCF", "ccitt"],
        ["JBIG2Decode", "jbig2"]
    ]);
    const format = formatByFilter.get(filter) ?? "raw";
    // getStreamData handles decryption and filter decoding for every format
    const data = doc.getStreamData(stream, objNum, gen);
    // Extract soft mask (alpha channel)
    let alphaMask = null;
    const smaskRef = dict.get("SMask");
    if (smaskRef) {
        const smask = doc.derefStreamWithObjNum(smaskRef);
        if (smask) {
            alphaMask = doc.getStreamData(smask.stream, smask.objNum, smask.gen);
        }
    }
    return {
        index,
        width,
        height,
        bitsPerComponent: bpc,
        colorSpace,
        components,
        format,
        data,
        alphaMask,
        filter,
        name
    };
}
|
|
121
|
+
// =============================================================================
|
|
122
|
+
// Color Space Resolution
|
|
123
|
+
// =============================================================================
|
|
124
|
+
/**
 * Determine the color space name and component count of an image.
 *
 * Reads `/ColorSpace` (or the abbreviated `/CS`), dereferencing it when it
 * is an indirect reference. Name values are classified by `colorSpaceInfo`;
 * array values are classified by their first element:
 * - `ICCBased`: component count is `/N` of the profile stream (3 if unavailable)
 * - `Indexed` / `I` and `Separation`: 1 component
 * - `DeviceN`: one component per colorant name; the names array is
 *   dereferenced first, so an indirect names array is counted correctly
 * - anything else: classified by name via `colorSpaceInfo`
 *
 * @param dict - Image dictionary
 * @param doc - The PDF document
 * @returns `{ colorSpace, components }`; DeviceRGB with 3 components when
 *          the entry is missing or has an unrecognized shape
 */
function resolveColorSpace(dict, doc) {
    const { isPdfArray, isPdfRef, dictGetNumber } = pdf_parser_1;
    const rawCs = dict.get("ColorSpace") ?? dict.get("CS");
    const cs = isPdfRef(rawCs) ? doc.deref(rawCs) : rawCs;
    if (typeof cs === "string") {
        return colorSpaceInfo(cs);
    }
    if (isPdfArray(cs) && cs.length > 0 && typeof cs[0] === "string") {
        const family = cs[0];
        switch (family) {
            case "ICCBased": {
                // ICC-based color space: the component count lives on the profile stream
                const profileStream = cs.length > 1 ? doc.derefStream(cs[1]) : undefined;
                const n = profileStream ? (dictGetNumber(profileStream.dict, "N") ?? 3) : 3;
                return { colorSpace: "ICCBased", components: n };
            }
            case "Indexed":
            case "I":
                return { colorSpace: "Indexed", components: 1 };
            case "Separation":
                return { colorSpace: "Separation", components: 1 };
            case "DeviceN": {
                // The colorant-names array may itself be an indirect object
                const rawNames = cs[1];
                const names = isPdfRef(rawNames) ? doc.deref(rawNames) : rawNames;
                return { colorSpace: "DeviceN", components: isPdfArray(names) ? names.length : 1 };
            }
            default:
                return colorSpaceInfo(family);
        }
    }
    return { colorSpace: "DeviceRGB", components: 3 };
}
|
|
170
|
+
/**
 * Describe a named color space.
 *
 * @param name - Color space name (full or abbreviated inline-image form)
 * @returns `{ colorSpace: name, components }` with 1 component for the gray
 *          spaces, 4 for CMYK, and 3 for RGB spaces and any unlisted name
 */
function colorSpaceInfo(name) {
    const componentsByName = new Map([
        ["DeviceGray", 1],
        ["G", 1],
        ["CalGray", 1],
        ["DeviceRGB", 3],
        ["RGB", 3],
        ["CalRGB", 3],
        ["DeviceCMYK", 4],
        ["CMYK", 4]
    ]);
    // Unlisted names fall back to three components
    return { colorSpace: name, components: componentsByName.get(name) ?? 3 };
}
|
|
187
|
+
// =============================================================================
|
|
188
|
+
// Helpers
|
|
189
|
+
// =============================================================================
|
|
190
|
+
/**
 * Get the filter name that best identifies an image stream's format.
 *
 * Reads `/Filter` (or the abbreviated `/F`). A single name is returned
 * as-is. For a filter array, the last image-specific filter (DCT, JPX,
 * CCITT, JBIG2) wins; if the chain has none, the first entry is returned
 * when it is a name.
 *
 * @param dict - Stream dictionary
 * @returns The filter name, or "" when there is no usable filter entry
 */
function getFilterName(dict) {
    const filter = dict.get("Filter") ?? dict.get("F");
    if (typeof filter === "string") {
        return filter;
    }
    const { isPdfArray } = pdf_parser_1;
    if (!isPdfArray(filter) || !(filter.length > 0)) {
        return "";
    }
    const imageSpecific = ["DCTDecode", "DCT", "JPXDecode", "CCITTFaxDecode", "CCF", "JBIG2Decode"];
    // Scan from the end so the last image-specific filter in the chain wins
    let position = filter.length;
    while (position-- > 0) {
        const candidate = filter[position];
        if (typeof candidate === "string" && imageSpecific.includes(candidate)) {
            return candidate;
        }
    }
    // No image-specific filter found: fall back to the first filter
    const first = filter[0];
    return typeof first === "string" ? first : "";
}
|