modern-pdf-lib 0.15.0 → 0.19.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +106 -7
- package/dist/batchOptimize-7U_kD3_j.mjs +392 -0
- package/dist/batchOptimize-xo6BXbGZ.cjs +427 -0
- package/dist/{bridge-C7U4E7St.mjs → bridge-DTH5LMAK.mjs} +3 -3
- package/dist/{bridge-DUcJFVsk.cjs → bridge-DYCQzxF7.cjs} +2 -2
- package/dist/browser.cjs +621 -0
- package/dist/browser.d.cts +190 -0
- package/dist/browser.d.cts.map +1 -0
- package/dist/browser.d.mts +190 -0
- package/dist/browser.d.mts.map +1 -0
- package/dist/browser.mjs +212 -0
- package/dist/cli/index.cjs +247 -0
- package/dist/cli/index.d.cts +1 -0
- package/dist/cli/index.d.mts +1 -0
- package/dist/cli/index.mjs +248 -0
- package/dist/compressionAnalysis-BBv4BkQP.d.mts +261 -0
- package/dist/compressionAnalysis-BBv4BkQP.d.mts.map +1 -0
- package/dist/compressionAnalysis-Bw2alOxt.mjs +1490 -0
- package/dist/compressionAnalysis-CtJ2X9l2.d.cts +261 -0
- package/dist/compressionAnalysis-CtJ2X9l2.d.cts.map +1 -0
- package/dist/compressionAnalysis-eXYyDsrh.cjs +1525 -0
- package/dist/create.cjs +35 -0
- package/dist/create.d.cts +3 -0
- package/dist/create.d.mts +3 -0
- package/dist/create.mjs +5 -0
- package/dist/deduplicateImages-B5lmzL9j.cjs +113 -0
- package/dist/deduplicateImages-BX3Zg8Qp.mjs +102 -0
- package/dist/{fflateAdapter-DX0VqT5k.mjs → fflateAdapter-CBQpGTlx.mjs} +2 -2
- package/dist/{fflateAdapter-AHC_S3cb.cjs → fflateAdapter-LTAeAhaD.cjs} +1 -1
- package/dist/fieldAppearance-C8PoLFSc.d.mts +136 -0
- package/dist/fieldAppearance-C8PoLFSc.d.mts.map +1 -0
- package/dist/fieldAppearance-CdiGFG5e.d.cts +136 -0
- package/dist/fieldAppearance-CdiGFG5e.d.cts.map +1 -0
- package/dist/fontEmbed-Dsu9fo4U.d.mts +636 -0
- package/dist/fontEmbed-Dsu9fo4U.d.mts.map +1 -0
- package/dist/fontEmbed-LID6yG6g.d.cts +636 -0
- package/dist/fontEmbed-LID6yG6g.d.cts.map +1 -0
- package/dist/{fontSubset-pFc8Dueu.cjs → fontSubset-5SLWMmEw.cjs} +1 -1
- package/dist/{fontSubset-ZpLoOZ2e.mjs → fontSubset-DWpduoY2.mjs} +2 -2
- package/dist/forms.cjs +13 -0
- package/dist/forms.d.cts +3 -0
- package/dist/forms.d.mts +3 -0
- package/dist/forms.mjs +3 -0
- package/dist/grayscaleDetect-C2m-eEXR.cjs +96 -0
- package/dist/grayscaleDetect-C6kFF3dk.mjs +84 -0
- package/dist/imageExtract-B6OvUEp-.mjs +155 -0
- package/dist/imageExtract-PxdBvpHj.cjs +166 -0
- package/dist/index-BtYOx5wh.d.mts +4904 -0
- package/dist/index-BtYOx5wh.d.mts.map +1 -0
- package/dist/index-bpktKzCA.d.cts +4904 -0
- package/dist/index-bpktKzCA.d.cts.map +1 -0
- package/dist/index.cjs +288 -25851
- package/dist/index.d.cts +7 -9151
- package/dist/index.d.mts +7 -9151
- package/dist/index.mjs +17 -25665
- package/dist/layout-BZ8tTeAk.mjs +438 -0
- package/dist/layout-Inbqegsk.cjs +563 -0
- package/dist/{libdeflateWasm-Enus0G1k.cjs → libdeflateWasm-BdiDEJOj.cjs} +2 -2
- package/dist/{libdeflateWasm-82loOtIV.mjs → libdeflateWasm-rLppXytE.mjs} +3 -3
- package/dist/loader-3u6Tw5T-.mjs +328 -0
- package/dist/loader-I4zdkoWc.cjs +393 -0
- package/dist/parse.cjs +24 -0
- package/dist/parse.d.cts +4 -0
- package/dist/parse.d.mts +4 -0
- package/dist/parse.mjs +7 -0
- package/dist/pdfCatalog-CYy4NXEY.cjs +173 -0
- package/dist/pdfCatalog-IImGcMbR.mjs +138 -0
- package/dist/pdfDocument-BSiQdNZq.d.cts +4640 -0
- package/dist/pdfDocument-BSiQdNZq.d.cts.map +1 -0
- package/dist/pdfDocument-DOg240g9.mjs +13685 -0
- package/dist/pdfDocument-Duf9LelM.cjs +14110 -0
- package/dist/pdfDocument-i6U5fQ91.d.mts +4640 -0
- package/dist/pdfDocument-i6U5fQ91.d.mts.map +1 -0
- package/dist/pdfForm-9gd40uz9.cjs +1796 -0
- package/dist/pdfForm-BiyNtYem.d.mts +905 -0
- package/dist/pdfForm-BiyNtYem.d.mts.map +1 -0
- package/dist/pdfForm-Cn-cVicP.mjs +1695 -0
- package/dist/pdfForm-SOXJ72LW.d.cts +905 -0
- package/dist/pdfForm-SOXJ72LW.d.cts.map +1 -0
- package/dist/{pdfCatalog-COKoYQ8C.cjs → pdfObjects-1veop1_d.cjs} +2 -172
- package/dist/{pdfCatalog-BB2Wnmud.mjs → pdfObjects-uEsWlfzU.mjs} +3 -138
- package/dist/{pdfPage-N1K2U3jI.mjs → pdfPage-BacMkrLe.mjs} +3024 -4
- package/dist/{pdfPage-DBfdinTR.cjs → pdfPage-CirlQRzJ.cjs} +3148 -104
- package/dist/{pngEmbed-gaJ9S2Dk.mjs → pngEmbed-BLj2zi-5.mjs} +3 -3
- package/dist/{pngEmbed-10m4CfBU.cjs → pngEmbed-D4X4ZN-3.cjs} +2 -2
- package/dist/src-BLWEEbd7.cjs +11852 -0
- package/dist/src-x0g7wiRq.mjs +11103 -0
- package/dist/streamDecode-Bs0_MT_Q.cjs +4607 -0
- package/dist/streamDecode-CWN-nfPJ.mjs +4596 -0
- package/package.json +33 -1
- package/dist/index.d.cts.map +0 -1
- package/dist/index.d.mts.map +0 -1
- package/dist/loader-1VJXLlMZ.mjs +0 -164
- package/dist/loader-CKlBOHma.cjs +0 -166
- package/dist/rolldown-runtime-95iHPtFO.mjs +0 -18
|
@@ -0,0 +1,1490 @@
|
|
|
1
|
+
import { i as PdfName, l as PdfStream, o as PdfNumber, r as PdfDict } from "./pdfObjects-uEsWlfzU.mjs";
|
|
2
|
+
import { a as isJpegWasmReady, r as encodeJpegWasm } from "./bridge-DTH5LMAK.mjs";
|
|
3
|
+
import { n as extractImages, t as decodeImageStream } from "./imageExtract-B6OvUEp-.mjs";
|
|
4
|
+
import { n as isGrayscaleImage } from "./grayscaleDetect-C6kFF3dk.mjs";
|
|
5
|
+
|
|
6
|
+
//#region src/parser/textExtractor.ts
|
|
7
|
+
/**
|
|
8
|
+
* @module parser/textExtractor
|
|
9
|
+
*
|
|
10
|
+
* Extract text content from parsed PDF content streams. Supports both
|
|
11
|
+
* simple text extraction (concatenated strings) and position-aware
|
|
12
|
+
* extraction that tracks the text matrix to compute x/y coordinates.
|
|
13
|
+
*
|
|
14
|
+
* Handles:
|
|
15
|
+
* - All PDF text-showing operators: `Tj`, `TJ`, `'`, `"`
|
|
16
|
+
* - Text-positioning operators: `Td`, `TD`, `Tm`, `T*`
|
|
17
|
+
* - Font selection: `Tf`
|
|
18
|
+
* - Graphics state: `q`/`Q`, `cm`
|
|
19
|
+
* - WinAnsiEncoding (standard single-byte)
|
|
20
|
+
* - Identity-H CID fonts with ToUnicode CMap
|
|
21
|
+
*
|
|
22
|
+
* Reference: PDF 1.7 spec, §9 (Text).
|
|
23
|
+
*
|
|
24
|
+
* @packageDocumentation
|
|
25
|
+
*/
|
|
26
|
+
/**
 * Extract plain text from a sequence of parsed content-stream operators.
 *
 * Decoded strings from the text-showing operators (`Tj`, `TJ`, `'`, `"`)
 * are concatenated in document order. Separators are inserted heuristically:
 * - a space when a new `BT` block starts while the current line already
 *   has text;
 * - a newline for `T*`, `'`, `"`, and for `Td`/`TD` moves whose vertical
 *   offset is larger than 0.5 units;
 * - a space for `Td`/`TD` moves that stay on the line but shift
 *   horizontally by more than 30% of the current font size.
 *
 * `Tm` repositions the text state but never inserts a separator.
 *
 * @param operators - Parsed content-stream operators.
 * @param resources - Optional page `/Resources` dictionary (used to look
 *   up font encodings and ToUnicode CMaps).
 * @param options - Extraction options. When `options.withPositions` is
 *   truthy, the texts of the positioned items are joined with single
 *   spaces instead.
 * @returns The extracted text as a single string.
 */
function extractText(operators, resources, options) {
  if (options?.withPositions) {
    return extractTextWithPositions(operators, resources)
      .map((item) => item.text)
      .join(" ");
  }

  const state = new TextState(resources);
  const parts = [];
  // True while text has been emitted since the last separator-worthy break.
  let lineHasContent = false;

  /** Emit a newline, but only if the current line actually holds text. */
  const breakLine = () => {
    if (!lineHasContent) return;
    parts.push("\n");
    lineHasContent = false;
  };

  /** Append decoded text, ignoring empty strings. */
  const appendText = (text) => {
    if (text.length === 0) return;
    parts.push(text);
    lineHasContent = true;
  };

  /**
   * Separator heuristic shared by `Td` and `TD`: a vertical move ends the
   * line, a large horizontal move on the same line is treated as a word gap.
   */
  const separateAfterMove = (tx, ty) => {
    if (!lineHasContent) return;
    if (Math.abs(ty) > 0.5) breakLine();
    else if (Math.abs(tx) > state.fontSize * 0.3) parts.push(" ");
  };

  /** Read the six numeric operands of a matrix operator (`Tm`, `cm`). */
  const matrixOperands = (operands) =>
    [0, 1, 2, 3, 4, 5].map((index) => operandAsNumber(operands[index]));

  for (const op of operators) {
    switch (op.operator) {
      case "BT":
        state.resetTextMatrix();
        if (parts.length > 0 && lineHasContent) parts.push(" ");
        lineHasContent = false;
        break;
      case "Tf":
        state.setFont(operandAsString(op.operands[0]), operandAsNumber(op.operands[1]));
        break;
      case "Tc":
        state.charSpacing = operandAsNumber(op.operands[0]);
        break;
      case "Tw":
        state.wordSpacing = operandAsNumber(op.operands[0]);
        break;
      case "TL":
        state.leading = operandAsNumber(op.operands[0]);
        break;
      case "Tz":
        state.horizontalScaling = operandAsNumber(op.operands[0]);
        break;
      case "Ts":
        state.rise = operandAsNumber(op.operands[0]);
        break;
      case "Td": {
        const tx = operandAsNumber(op.operands[0]);
        const ty = operandAsNumber(op.operands[1]);
        state.moveText(tx, ty);
        separateAfterMove(tx, ty);
        break;
      }
      case "TD": {
        // TD is Td plus "set leading to -ty", so it gets the same separator
        // heuristic as Td (previously the horizontal-gap space was skipped).
        const tx = operandAsNumber(op.operands[0]);
        const ty = operandAsNumber(op.operands[1]);
        state.leading = -ty;
        state.moveText(tx, ty);
        separateAfterMove(tx, ty);
        break;
      }
      case "Tm":
        state.setTextMatrix(...matrixOperands(op.operands));
        break;
      case "T*":
        state.nextLine();
        breakLine();
        break;
      case "Tj":
        appendText(state.decodeString(op.operands[0]));
        break;
      case "TJ":
        appendText(state.decodeTJArray(op.operands[0]));
        break;
      case "'":
        state.nextLine();
        breakLine();
        appendText(state.decodeString(op.operands[0]));
        break;
      case "\"":
        // Operand order is: word spacing, character spacing, string.
        state.wordSpacing = operandAsNumber(op.operands[0]);
        state.charSpacing = operandAsNumber(op.operands[1]);
        state.nextLine();
        breakLine();
        appendText(state.decodeString(op.operands[2]));
        break;
      case "q":
        state.save();
        break;
      case "Q":
        state.restore();
        break;
      case "cm":
        state.concatCTM(...matrixOperands(op.operands));
        break;
      default:
        // ET and every non-text operator need no handling here.
        break;
    }
  }
  return parts.join("");
}
|
|
157
|
+
/**
 * Extract text with position information from a parsed content stream.
 *
 * Every non-empty string shown by `Tj`, `TJ`, `'` or `"` yields one
 * {@link TextItem} carrying the decoded text, the position reported by the
 * text state at that moment, an estimated width, and the current font size
 * (also used as the height) and font name. After each item the text state
 * is advanced by the estimated width of the text.
 *
 * @param operators - Parsed content-stream operators.
 * @param resources - Optional page `/Resources` dictionary.
 * @returns An array of positioned text items, in document order.
 */
function extractTextWithPositions(operators, resources) {
  const state = new TextState(resources);
  const items = [];

  /** Decode one string operand and record it as a positioned item. */
  const showText = (operand) => {
    const text = state.decodeString(operand);
    if (text.length === 0) return;
    const { x, y } = state.getTextPosition();
    items.push({
      text,
      x,
      y,
      width: state.estimateWidth(text),
      height: state.fontSize,
      fontSize: state.fontSize,
      fontName: state.fontName,
    });
    state.advanceByText(text);
  };

  /** Read the six numeric operands of a matrix operator (`Tm`, `cm`). */
  const matrixOperands = (operands) =>
    [0, 1, 2, 3, 4, 5].map((index) => operandAsNumber(operands[index]));

  for (const op of operators) {
    switch (op.operator) {
      case "BT":
        state.resetTextMatrix();
        break;
      case "Tf":
        state.setFont(operandAsString(op.operands[0]), operandAsNumber(op.operands[1]));
        break;
      case "Tc":
        state.charSpacing = operandAsNumber(op.operands[0]);
        break;
      case "Tw":
        state.wordSpacing = operandAsNumber(op.operands[0]);
        break;
      case "TL":
        state.leading = operandAsNumber(op.operands[0]);
        break;
      case "Tz":
        state.horizontalScaling = operandAsNumber(op.operands[0]);
        break;
      case "Ts":
        state.rise = operandAsNumber(op.operands[0]);
        break;
      case "Td":
        state.moveText(operandAsNumber(op.operands[0]), operandAsNumber(op.operands[1]));
        break;
      case "TD": {
        const tx = operandAsNumber(op.operands[0]);
        const ty = operandAsNumber(op.operands[1]);
        // TD also sets the leading to the negated vertical offset.
        state.leading = -ty;
        state.moveText(tx, ty);
        break;
      }
      case "Tm":
        state.setTextMatrix(...matrixOperands(op.operands));
        break;
      case "T*":
        state.nextLine();
        break;
      case "Tj":
        showText(op.operands[0]);
        break;
      case "TJ": {
        const entries = op.operands[0];
        if (!Array.isArray(entries)) break;
        for (const entry of entries) {
          // Numbers are position adjustments; everything else is shown as text.
          if (typeof entry === "number") state.advanceByDisplacement(-entry);
          else showText(entry);
        }
        break;
      }
      case "'":
        state.nextLine();
        showText(op.operands[0]);
        break;
      case "\"":
        // Operand order is: word spacing, character spacing, string.
        state.wordSpacing = operandAsNumber(op.operands[0]);
        state.charSpacing = operandAsNumber(op.operands[1]);
        state.nextLine();
        showText(op.operands[2]);
        break;
      case "q":
        state.save();
        break;
      case "Q":
        state.restore();
        break;
      case "cm":
        state.concatCTM(...matrixOperands(op.operands));
        break;
      default:
        // ET and every non-text operator need no handling here.
        break;
    }
  }
  return items;
}
|
|
300
|
+
/**
 * Parse a ToUnicode CMap stream into a lookup map.
 *
 * The stream is decoded to text and scanned for the two standard mapping
 * constructs, in this order:
 * - `beginbfchar` / `endbfchar` — single code-to-Unicode mappings
 * - `beginbfrange` / `endbfrange` — range-based mappings
 *
 * Because ranges are processed second, a range entry overwrites a
 * `bfchar` entry for the same source code.
 *
 * @param data - The raw CMap stream bytes (already decompressed).
 * @returns A parsed CMap: an object whose `map` property maps numeric
 *   source codes to Unicode strings.
 */
function parseToUnicodeCMap(data) {
  const source = decodeText(data);
  const lookup = new Map();
  parseBfCharSections(source, lookup);
  parseBfRangeSections(source, lookup);
  return { map: lookup };
}
|
|
317
|
+
/**
 * Parse all `beginbfchar`/`endbfchar` sections in a CMap.
 *
 * Each `<src> <dst>` hex pair inside a section adds one entry to `map`:
 * the key is the source code parsed as a hexadecimal integer, the value
 * is the destination hex string converted by `hexToUnicode`.
 *
 * @param text - The CMap source text.
 * @param map - Lookup map to populate (mutated in place).
 */
function parseBfCharSections(text, map) {
  const sections = text.matchAll(/beginbfchar\s*([\s\S]*?)\s*endbfchar/g);
  for (const [, sectionBody] of sections) {
    const pairs = sectionBody.matchAll(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/g);
    for (const [, srcHex, dstHex] of pairs) {
      const srcCode = Number.parseInt(srcHex, 16);
      map.set(srcCode, hexToUnicode(dstHex));
    }
  }
}
|
|
334
|
+
/**
 * Parse all `beginbfrange`/`endbfrange` sections in a CMap.
 *
 * Two entry forms are recognised:
 * - `<lo> <hi> <dst>` — code `lo + k` maps to `dst` with its final code
 *   point incremented by `k`;
 * - `<lo> <hi> [<d0> <d1> …]` — code `lo + k` maps to the k-th array
 *   element (extra elements beyond `hi` are ignored).
 *
 * A `<dst>` whose numeric value fits in a single Unicode code point is
 * handled numerically. Larger values are treated as a multi-unit UTF-16BE
 * string (a surrogate pair or a multi-character string such as a ligature
 * expansion); previously these made `String.fromCodePoint` throw, which
 * aborted parsing of the whole CMap.
 *
 * @param text - The CMap source text.
 * @param map - Lookup map to populate (mutated in place).
 */
function parseBfRangeSections(text, map) {
  const MAX_CODE_POINT = 0x10ffff;
  const REPLACEMENT = "\uFFFD";
  // CMap data comes from untrusted files: never expand more than a full
  // two-byte code space from a single range entry.
  const MAX_RANGE_ENTRIES = 0x10000;

  /** Destination string for the `offset`-th code of a `<lo> <hi> <dst>` entry. */
  const rangeTarget = (dstHex, dstBase, offset) => {
    if (dstBase <= MAX_CODE_POINT) {
      const codePoint = dstBase + offset;
      return codePoint <= MAX_CODE_POINT ? codePointToString(codePoint) : REPLACEMENT;
    }
    // Multi-unit destination: keep the prefix, bump the last code point.
    const codePoints = [...hexToUnicode(dstHex)];
    const last = codePoints.pop();
    if (last === undefined) return REPLACEMENT;
    const bumped = last.codePointAt(0) + offset;
    const tail = bumped <= MAX_CODE_POINT ? String.fromCodePoint(bumped) : REPLACEMENT;
    return codePoints.join("") + tail;
  };

  const sections = text.matchAll(/beginbfrange\s*([\s\S]*?)\s*endbfrange/g);
  for (const [, sectionBody] of sections) {
    const entries = sectionBody.matchAll(
      /<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>\s*(?:<([0-9a-fA-F]+)>|\[([\s\S]*?)\])/g,
    );
    for (const [, lowHex, highHex, dstHex, dstArray] of entries) {
      const srcLow = Number.parseInt(lowHex, 16);
      const srcHigh = Math.min(Number.parseInt(highHex, 16), srcLow + MAX_RANGE_ENTRIES - 1);
      if (dstHex) {
        const dstBase = Number.parseInt(dstHex, 16);
        for (let code = srcLow; code <= srcHigh; code++) {
          map.set(code, rangeTarget(dstHex, dstBase, code - srcLow));
        }
      } else if (dstArray) {
        let code = srcLow;
        for (const [, elementHex] of dstArray.matchAll(/<([0-9a-fA-F]+)>/g)) {
          if (code > srcHigh) break;
          map.set(code, hexToUnicode(elementHex));
          code++;
        }
      }
    }
  }
}
|
|
365
|
+
/**
 * Convert a hex string to a Unicode string.
 *
 * Strings of up to four hex digits are read as a single code. Longer
 * strings are consumed in groups of four digits, each group becoming one
 * code; a trailing group may be shorter. Groups that do not parse as
 * hexadecimal are skipped.
 *
 * @param hex - Hex digits without the surrounding `<` `>`.
 * @returns The decoded string (empty for empty input).
 */
function hexToUnicode(hex) {
  const width = Math.min(hex.length, 4);
  let decoded = "";
  for (let offset = 0; offset < hex.length; offset += width) {
    const unit = Number.parseInt(hex.slice(offset, offset + width), 16);
    if (Number.isNaN(unit)) continue;
    decoded += String.fromCodePoint(unit);
  }
  return decoded;
}
|
|
379
|
+
/**
 * Convert a numeric code point to a string.
 *
 * Thin wrapper around `String.fromCodePoint`, so values that are not
 * valid code points raise a `RangeError`.
 *
 * @param code - Unicode code point.
 * @returns The string for that code point.
 */
function codePointToString(code) {
  const character = String.fromCodePoint(code);
  return character;
}
|
|
385
|
+
/**
 * WinAnsiEncoding overrides for bytes in the 0x80-0x9F block.
 *
 * Keys are byte values, values are the Unicode code points they decode
 * to. Bytes that are absent from this table are decoded by
 * `winAnsiDecode` as the character with the same numeric code.
 */
const WIN_ANSI_SPECIAL = {
  0x80: 0x20ac,
  0x82: 0x201a,
  0x83: 0x0192,
  0x84: 0x201e,
  0x85: 0x2026,
  0x86: 0x2020,
  0x87: 0x2021,
  0x88: 0x02c6,
  0x89: 0x2030,
  0x8a: 0x0160,
  0x8b: 0x2039,
  0x8c: 0x0152,
  0x8e: 0x017d,
  0x91: 0x2018,
  0x92: 0x2019,
  0x93: 0x201c,
  0x94: 0x201d,
  0x95: 0x2022,
  0x96: 0x2013,
  0x97: 0x2014,
  0x98: 0x02dc,
  0x99: 0x2122,
  0x9a: 0x0161,
  0x9b: 0x203a,
  0x9c: 0x0153,
  0x9e: 0x017e,
  0x9f: 0x0178
};
|
|
418
|
+
/**
 * Decode a single byte using WinAnsiEncoding.
 *
 * Bytes listed in `WIN_ANSI_SPECIAL` decode to the code point stored
 * there; every other value decodes to the character with the same code.
 *
 * @param byte - The character code to decode.
 * @returns The decoded character.
 */
function winAnsiDecode(byte) {
  const isRemapped = Object.hasOwn(WIN_ANSI_SPECIAL, byte);
  return isRemapped
    ? String.fromCodePoint(WIN_ANSI_SPECIAL[byte])
    : String.fromCharCode(byte);
}
|
|
425
|
+
/**
 * Create a new identity matrix in six-element `[a, b, c, d, e, f]` form.
 *
 * @returns A fresh array on every call, so callers may mutate it freely.
 */
function identityMatrix() {
  return Array.of(1, 0, 0, 1, 0, 0);
}
|
|
436
|
+
/**
 * Multiply two 3x3 matrices stored as six-element arrays
 * `[a, b, c, d, e, f]`.
 *
 * @param a - Left operand.
 * @param b - Right operand.
 * @returns A new six-element array holding `a * b`.
 */
function multiplyMatrices(a, b) {
  const a0 = a[0];
  const a1 = a[1];
  const a2 = a[2];
  const a3 = a[3];
  const a4 = a[4];
  const a5 = a[5];
  const b0 = b[0];
  const b1 = b[1];
  const b2 = b[2];
  const b3 = b[3];

  const product = [];
  // Linear part (upper-left 2x2).
  product.push(a0 * b0 + a1 * b2);
  product.push(a0 * b1 + a1 * b3);
  product.push(a2 * b0 + a3 * b2);
  product.push(a2 * b1 + a3 * b3);
  // Translation row: a's translation transformed by b, plus b's own.
  product.push(a4 * b0 + a5 * b2 + b[4]);
  product.push(a4 * b1 + a5 * b3 + b[5]);
  return product;
}
|
|
450
|
+
/**
 * Tracks the graphics/text state needed for text extraction.
 *
 * Holds the CTM, the text and text-line matrices, the current font and
 * text parameters, a `q`/`Q` save stack, and per-font decoding
 * information gathered from the page resources.
 */
var TextState = class {
  /** Current transformation matrix (CTM), changed by `cm`. */
  ctm = identityMatrix();
  /** Text matrix — replaced by `Tm`, moved by `Td`/`TD`/`T*` and text advances. */
  textMatrix = identityMatrix();
  /** Text line matrix — the matrix at the start of the current line. */
  textLineMatrix = identityMatrix();
  /** Current font resource name (always stored with a leading `/`). */
  fontName = "";
  /** Current font size. */
  fontSize = 12;
  /** Character spacing (Tc). */
  charSpacing = 0;
  /** Word spacing (Tw). */
  wordSpacing = 0;
  /** Horizontal scaling (Tz) as a percentage (100 = normal). */
  horizontalScaling = 100;
  /** Text leading (TL). */
  leading = 0;
  /** Text rise (Ts). */
  rise = 0;
  /** Snapshots pushed by `save()` and popped by `restore()`. */
  stateStack = [];
  /** Page resources dictionary, as passed to the constructor. */
  resources;
  /** Parsed ToUnicode CMap per font name (`null` when parsing failed). */
  cmapCache = new Map();
  /** Font `/Encoding` name (without the leading slash) per font name. */
  fontEncodingCache = new Map();
  /** Whether each font name refers to a CID (2-byte) font. */
  cidFontCache = new Map();

  /**
   * @param resources - Optional page `/Resources` dictionary; when given,
   *   its fonts are analysed immediately.
   */
  constructor(resources) {
    this.resources = resources;
    if (resources) this.analyzeFonts(resources);
  }

  /**
   * Inspect the `/Font` entries of the resources dictionary and fill the
   * CID, encoding and ToUnicode caches. Entries that are not `PdfDict`
   * instances are skipped.
   */
  analyzeFonts(resources) {
    const fontDict = resources.get("/Font");
    if (!(fontDict instanceof PdfDict)) return;

    const cidSubtypes = ["/Type0", "/CIDFontType0", "/CIDFontType2"];
    for (const [resourceName, font] of fontDict) {
      if (!(font instanceof PdfDict)) continue;

      const subtype = font.get("/Subtype");
      const isCid = subtype instanceof PdfName && cidSubtypes.includes(subtype.value);
      this.cidFontCache.set(resourceName, isCid);

      const encoding = font.get("/Encoding");
      if (encoding instanceof PdfName) {
        this.fontEncodingCache.set(resourceName, encoding.value.replace(/^\//, ""));
      }

      const toUnicode = font.get("/ToUnicode");
      if (!(toUnicode instanceof PdfStream)) continue;
      // Best effort: a CMap that fails to parse is recorded as null so the
      // font falls back to the non-CMap decoders.
      let parsed;
      try {
        parsed = parseToUnicodeCMap(toUnicode.data);
      } catch {
        parsed = null;
      }
      this.cmapCache.set(resourceName, parsed);
    }
  }

  /**
   * Save the current graphics state (q). The CTM is copied; the text and
   * text-line matrices are not part of the snapshot.
   */
  save() {
    const { fontName, fontSize, charSpacing, wordSpacing, horizontalScaling, leading, rise } = this;
    this.stateStack.push({
      ctm: [...this.ctm],
      fontName,
      fontSize,
      charSpacing,
      wordSpacing,
      horizontalScaling,
      leading,
      rise,
    });
  }

  /**
   * Restore the previously saved graphics state (Q). Does nothing when
   * the stack is empty.
   */
  restore() {
    const snapshot = this.stateStack.pop();
    if (!snapshot) return;
    ({
      ctm: this.ctm,
      fontName: this.fontName,
      fontSize: this.fontSize,
      charSpacing: this.charSpacing,
      wordSpacing: this.wordSpacing,
      horizontalScaling: this.horizontalScaling,
      leading: this.leading,
      rise: this.rise,
    } = snapshot);
  }

  /**
   * Concatenate a matrix with the CTM (cm): the new CTM is the given
   * matrix multiplied by the current one.
   */
  concatCTM(a, b, c, d, e, f) {
    this.ctm = multiplyMatrices([a, b, c, d, e, f], this.ctm);
  }

  /**
   * Reset the text and text-line matrices to identity (called at BT).
   */
  resetTextMatrix() {
    this.textMatrix = identityMatrix();
    this.textLineMatrix = identityMatrix();
  }

  /**
   * Set the font and size (Tf). A leading `/` is added to the name when
   * missing so it matches the keys used by the font caches.
   */
  setFont(name, size) {
    this.fontName = name.startsWith("/") ? name : `/${name}`;
    this.fontSize = size;
  }

  /**
   * Move text position (Td): translate the text-line matrix by
   * `(tx, ty)` and restart the text matrix from it.
   */
  moveText(tx, ty) {
    const moved = multiplyMatrices([1, 0, 0, 1, tx, ty], this.textLineMatrix);
    this.textLineMatrix = moved;
    this.textMatrix = [...moved];
  }

  /**
   * Set the text matrix directly (Tm). The text-line matrix receives its
   * own copy of the same values.
   */
  setTextMatrix(a, b, c, d, e, f) {
    this.textMatrix = [a, b, c, d, e, f];
    this.textLineMatrix = [...this.textMatrix];
  }

  /**
   * Move to the start of the next line (T*).
   * Equivalent to: 0 -TL Td
   */
  nextLine() {
    this.moveText(0, -this.leading);
  }

  /**
   * Get the current text position: the translation part of the text
   * matrix multiplied by the CTM.
   */
  getTextPosition() {
    const placed = multiplyMatrices(this.textMatrix, this.ctm);
    return {
      x: placed[4],
      y: placed[5],
    };
  }

  /**
   * Estimate the width of a text string.
   *
   * Rough heuristic only: half the font size per UTF-16 code unit, scaled
   * by the horizontal scaling. No font metrics are consulted.
   */
  estimateWidth(text) {
    return text.length * this.fontSize * 0.5 * (this.horizontalScaling / 100);
  }

  /**
   * Advance the text matrix horizontally by the estimated width of the
   * given text.
   */
  advanceByText(text) {
    const shift = this.estimateWidth(text);
    this.textMatrix = multiplyMatrices([1, 0, 0, 1, shift, 0], this.textMatrix);
  }

  /**
   * Advance the text matrix by a TJ displacement value.
   *
   * The displacement is in thousandths of a unit of text space; it is
   * scaled by the font size and the horizontal scaling.
   */
  advanceByDisplacement(displacement) {
    const shift = displacement / 1e3 * this.fontSize * (this.horizontalScaling / 100);
    this.textMatrix = multiplyMatrices([1, 0, 0, 1, shift, 0], this.textMatrix);
  }

  /**
   * Decode an operand into a readable text string.
   *
   * `null`, `undefined` and numbers decode to an empty string; other
   * non-string operands are converted with `String()`. The current font's
   * ToUnicode CMap is used when one was parsed; otherwise CID fonts use
   * direct 2-byte code mapping and all other fonts use WinAnsiEncoding.
   */
  decodeString(operand) {
    if (operand == null || typeof operand === "number") return "";
    const raw = typeof operand === "string" ? operand : String(operand);
    const font = this.fontName;
    const cmap = this.cmapCache.get(font);
    const isCid = this.cidFontCache.get(font) ?? false;
    if (cmap) return this.decodeWithCMap(raw, cmap, isCid);
    return isCid ? this.decodeCIDString(raw) : this.decodeWinAnsi(raw);
  }

  /**
   * Decode a TJ array operand (array of strings + numbers).
   *
   * Numbers of -100 or less are rendered as a single space; other numbers
   * are ignored. A non-array operand is decoded as a plain string.
   */
  decodeTJArray(operand) {
    if (!Array.isArray(operand)) return this.decodeString(operand);
    let text = "";
    for (const entry of operand) {
      if (typeof entry === "number") {
        if (entry <= -100) text += " ";
        continue;
      }
      text += this.decodeString(entry);
    }
    return text;
  }

  /**
   * Decode a string using a ToUnicode CMap.
   *
   * CID fonts are read two characters at a time as big-endian codes (a
   * trailing odd character is dropped); unmapped codes from 32 to 65535
   * are emitted as-is and lower ones are skipped. Other fonts are read one
   * character at a time, falling back to WinAnsiEncoding for unmapped codes.
   */
  decodeWithCMap(raw, cmap, isCid) {
    let decoded = "";
    if (!isCid) {
      for (let index = 0; index < raw.length; index++) {
        const byte = raw.charCodeAt(index);
        const mapped = cmap.map.get(byte);
        decoded += mapped !== undefined ? mapped : winAnsiDecode(byte);
      }
      return decoded;
    }
    for (let index = 0; index + 1 < raw.length; index += 2) {
      const code = raw.charCodeAt(index) << 8 | raw.charCodeAt(index + 1);
      const mapped = cmap.map.get(code);
      if (mapped !== undefined) {
        decoded += mapped;
      } else if (code >= 32 && code <= 65535) {
        decoded += String.fromCharCode(code);
      }
    }
    return decoded;
  }

  /**
   * Decode a CID (Identity-H) encoded string without a ToUnicode CMap:
   * each 2-byte big-endian code from 32 to 65535 is used directly as a
   * character code.
   */
  decodeCIDString(raw) {
    let decoded = "";
    for (let index = 0; index + 1 < raw.length; index += 2) {
      const code = raw.charCodeAt(index) << 8 | raw.charCodeAt(index + 1);
      if (code < 32 || code > 65535) continue;
      decoded += String.fromCharCode(code);
    }
    return decoded;
  }

  /**
   * Decode a string using WinAnsiEncoding, one character code at a time.
   */
  decodeWinAnsi(raw) {
    return raw
      .split("")
      .map((unit) => winAnsiDecode(unit.charCodeAt(0)))
      .join("");
  }
};
|
|
732
|
+
/**
 * Extract a numeric value from an operand, defaulting to 0.
 *
 * Numbers are returned unchanged, `PdfNumber` instances yield their
 * `value`, and strings are parsed with `parseFloat` (0 when that gives
 * `NaN`). Anything else yields 0.
 *
 * @param operand - The operand to convert.
 * @returns The numeric value, or 0 when none can be derived.
 */
function operandAsNumber(operand) {
  switch (typeof operand) {
    case "number":
      return operand;
    case "string": {
      const parsed = Number.parseFloat(operand);
      return Number.isNaN(parsed) ? 0 : parsed;
    }
    default:
      return operand instanceof PdfNumber ? operand.value : 0;
  }
}
|
|
744
|
+
/**
 * Coerce a content-stream operand to a string.
 *
 * @param operand - A string, a `PdfName`, a number, or anything else.
 * @returns The string itself, the name's `value`, the number formatted
 *   with `String()`, or `""` for every other kind of operand.
 */
function operandAsString(operand) {
  if (operand instanceof PdfName) return operand.value;
  switch (typeof operand) {
    case "string":
      return operand;
    case "number":
      return String(operand);
    default:
      return "";
  }
}
|
|
753
|
+
/**
 * Decode raw bytes to a string, one character per byte.
 *
 * Used for CMap parsing, which is ASCII-based. Note that the WHATWG
 * Encoding Standard resolves the `"latin1"` label to windows-1252, so
 * bytes 0x80–0x9F may not map to the identical code point; ASCII and
 * 0xA0–0xFF are unaffected.
 *
 * @param data - Bytes to decode.
 * @returns The decoded string.
 */
function decodeText(data) {
  const decoder = new TextDecoder("latin1");
  return decoder.decode(data);
}
|
|
760
|
+
|
|
761
|
+
//#endregion
|
|
762
|
+
//#region src/parser/contentStreamParser.ts
|
|
763
|
+
/**
|
|
764
|
+
* @module parser/contentStreamParser
|
|
765
|
+
*
|
|
766
|
+
* Parse PDF content streams (the operator/operand sequences that describe
|
|
767
|
+
* page appearance) into a structured AST.
|
|
768
|
+
*
|
|
769
|
+
* A PDF content stream consists of a flat sequence of *operands* followed
|
|
770
|
+
* by an *operator*. Operands are PDF objects (numbers, strings, names,
|
|
771
|
+
* booleans, arrays, `null`); operators are unquoted letter sequences.
|
|
772
|
+
*
|
|
773
|
+
* Special handling is required for inline images (`BI … ID data EI`).
|
|
774
|
+
*
|
|
775
|
+
* Reference: PDF 1.7 spec, §7.8.2 (Content Streams).
|
|
776
|
+
*
|
|
777
|
+
* @packageDocumentation
|
|
778
|
+
*/
|
|
779
|
+
/**
 * Parse a PDF content stream into an ordered list of operators.
 *
 * Thin entry point: builds a `ContentStreamLexer` over the bytes and
 * returns whatever its `parse()` produces.
 *
 * @param data - The raw content-stream bytes (already decompressed).
 * @returns An array of operators in document order.
 */
function parseContentStream(data) {
  const lexer = new ContentStreamLexer(data);
  return lexer.parse();
}
|
|
788
|
+
/**
 * Token kinds produced by the content-stream lexer.
 *
 * Two-way enum: `TokenType.Name === 3` and `TokenType[3] === "Name"`.
 * The ordinal of each kind is its position in the list below.
 */
var TokenType = /* @__PURE__ */ function(TokenType) {
  const labels = [
    "Number",
    "String",
    "HexString",
    "Name",
    "Bool",
    "Null",
    "ArrayStart",
    "ArrayEnd",
    "Operator",
    "InlineImage",
    "EOF"
  ];
  labels.forEach((label, ordinal) => {
    TokenType[label] = ordinal;
    TokenType[ordinal] = label;
  });
  return TokenType;
}(TokenType || {});
|
|
802
|
+
/**
 * Lookup table indexed by byte value: `hexVal[b]` is the numeric value
 * (0-15) of the hex digit `b`, or -1 when the byte is not a hex digit.
 * Both upper- and lower-case letters are accepted.
 */
const hexVal = /* @__PURE__ */ (() => {
  const table = new Int8Array(256).fill(-1);
  [..."0123456789ABCDEF"].forEach((digit, value) => {
    table[digit.charCodeAt(0)] = value;
    // For "0"-"9" the lower-case form is the same character.
    table[digit.toLowerCase().charCodeAt(0)] = value;
  });
  return table;
})();
|
|
815
|
+
/**
 * Combined lexer + parser for PDF content streams.
 *
 * Content streams are simpler than full PDF object syntax: there are no
 * indirect references, and dictionaries only occur inside inline images
 * (stray `<<` / `>>` markers are skipped). `%` comments are skipped
 * wherever a token may start.
 *
 * Malformed input is tolerated rather than rejected: unexpected bytes are
 * skipped and unterminated constructs simply end at the end of the data.
 */
var ContentStreamLexer = class {
  /** Content-stream bytes (a typed byte array: `subarray`, `slice` and `indexOf` are used). */
  data;
  /** Index of the next unread byte in `data`. */
  pos = 0;
  constructor(data) {
    this.data = data;
  }
  /**
   * Parse the entire stream and return all operators.
   *
   * Operands accumulate until an operator keyword is read; the operator
   * then takes every pending operand. `BI` is special-cased: the inline
   * image that follows becomes its single operand.
   *
   * @returns `{ operator, operands }` records in stream order.
   */
  parse() {
    const result = [];
    const operandStack = [];
    while (true) {
      const token = this.nextToken();
      if (token.type === TokenType.EOF) break;
      switch (token.type) {
        case TokenType.Number:
        case TokenType.String:
        case TokenType.HexString:
        case TokenType.Bool:
        case TokenType.Null:
        case TokenType.Name:
          operandStack.push(token.value);
          break;
        case TokenType.ArrayStart:
          operandStack.push(this.parseArray());
          break;
        case TokenType.ArrayEnd:
          // Stray "]" outside an array: ignore.
          break;
        case TokenType.Operator: {
          const op = token.value;
          if (op === "BI") {
            // BI takes no operands; drop strays so they cannot leak into
            // the operand list of the operator that follows the image.
            operandStack.length = 0;
            result.push({
              operator: "BI",
              operands: [this.parseInlineImage()]
            });
          } else {
            result.push({
              operator: op,
              operands: operandStack.splice(0, operandStack.length)
            });
          }
          break;
        }
        case TokenType.InlineImage:
          break;
      }
    }
    return result;
  }
  /**
   * Parse a PDF array `[…]`. Called after the `[` token has been consumed.
   *
   * @returns The array items; nested arrays become nested JS arrays.
   *   Operator keywords inside the array are ignored, and an unterminated
   *   array ends at the end of the data.
   */
  parseArray() {
    const items = [];
    while (true) {
      const token = this.nextToken();
      if (token.type === TokenType.EOF) break;
      if (token.type === TokenType.ArrayEnd) break;
      switch (token.type) {
        case TokenType.Number:
        case TokenType.String:
        case TokenType.HexString:
        case TokenType.Bool:
        case TokenType.Null:
        case TokenType.Name:
          items.push(token.value);
          break;
        case TokenType.ArrayStart:
          items.push(this.parseArray());
          break;
        default:
          break;
      }
    }
    return items;
  }
  /**
   * Parse an inline image.
   *
   * After `BI` has been read, we expect key-value pairs (name + value)
   * until `ID`, then raw binary data until we find `EI` preceded by
   * whitespace and followed by whitespace or the end of the data.
   *
   * @returns `{ dict, data }` where `dict` maps each key (as written,
   *   names keep the text stored in their `value`) to its parsed value and
   *   `data` is a copy of the image bytes. If no terminating `EI` is
   *   found, `data` is empty and the lexer is positioned at the end.
   */
  parseInlineImage() {
    const dict = {};
    while (true) {
      this.skipWhitespace();
      if (this.pos >= this.data.length) break;
      if (this.peekKeyword("ID")) {
        this.pos += 2;
        break;
      }
      const keyToken = this.nextToken();
      if (keyToken.type === TokenType.EOF) break;
      if (keyToken.type === TokenType.Operator) {
        const kw = keyToken.value;
        // Reached when a comment sat between the last value and ID.
        if (kw === "ID") break;
        dict[kw] = this.readInlineImageValue();
      } else if (keyToken.type === TokenType.Name) {
        dict[keyToken.value.value] = this.readInlineImageValue();
      }
      // Any other token in key position is skipped.
    }
    // Exactly one whitespace separator (CRLF counts as one) follows ID.
    if (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (ch === 32 || ch === 10 || ch === 13 || ch === 9) {
        this.pos++;
        if (ch === 13 && this.pos < this.data.length && this.data[this.pos] === 10) this.pos++;
      }
    }
    const dataStart = this.pos;
    let dataEnd = this.pos;
    let searchFrom = this.pos;
    while (searchFrom < this.data.length) {
      const eIdx = this.data.indexOf(69, searchFrom);
      if (eIdx === -1 || eIdx + 1 >= this.data.length) {
        this.pos = this.data.length;
        break;
      }
      if (eIdx > dataStart && this.isWhitespace(this.data[eIdx - 1]) && this.data[eIdx + 1] === 73) {
        const afterEI = eIdx + 2;
        if (afterEI >= this.data.length || this.isWhitespace(this.data[afterEI])) {
          // The whitespace byte before EI is a separator, not image data.
          dataEnd = eIdx - 1;
          this.pos = afterEI;
          break;
        }
      }
      searchFrom = eIdx + 1;
    }
    return {
      dict,
      data: this.data.slice(dataStart, dataEnd)
    };
  }
  /**
   * Read the value of one inline-image dictionary entry.
   *
   * Array values such as `/D [1 0]` are parsed as a whole so that their
   * elements are not mistaken for the following keys.
   */
  readInlineImageValue() {
    const token = this.nextToken();
    return token.type === TokenType.ArrayStart ? this.parseArray() : token.value;
  }
  /**
   * Peek ahead to see if the next characters form a given keyword
   * followed by whitespace, a delimiter, or the end of the data.
   * Does not advance the position.
   */
  peekKeyword(keyword) {
    for (let i = 0; i < keyword.length; i++) {
      if (this.pos + i >= this.data.length) return false;
      if (this.data[this.pos + i] !== keyword.charCodeAt(i)) return false;
    }
    const afterPos = this.pos + keyword.length;
    if (afterPos >= this.data.length) return true;
    const after = this.data[afterPos];
    return this.isWhitespace(after) || this.isDelimiter(after);
  }
  /**
   * Read and return the next token from the stream.
   *
   * Implemented as a loop rather than by recursion so that a long run of
   * skipped bytes (stray delimiters, `<<` / `>>`) cannot exhaust the
   * call stack.
   *
   * @returns `{ type, value }`; `type` is `TokenType.EOF` at end of data.
   */
  nextToken() {
    while (true) {
      this.skipWhitespaceAndComments();
      if (this.pos >= this.data.length) return {
        type: TokenType.EOF,
        value: null
      };
      const ch = this.data[this.pos];
      if (ch === 40) return this.readLiteralString();
      if (ch === 60) {
        if (this.pos + 1 < this.data.length && this.data[this.pos + 1] === 60) {
          // "<<": dictionary start, not meaningful here.
          this.pos += 2;
          continue;
        }
        return this.readHexString();
      }
      if (ch === 62 && this.pos + 1 < this.data.length && this.data[this.pos + 1] === 62) {
        // ">>": dictionary end, skipped as well.
        this.pos += 2;
        continue;
      }
      if (ch === 91) {
        this.pos++;
        return {
          type: TokenType.ArrayStart,
          value: null
        };
      }
      if (ch === 93) {
        this.pos++;
        return {
          type: TokenType.ArrayEnd,
          value: null
        };
      }
      if (ch === 47) return this.readName();
      if (this.isNumberStart(ch)) return this.readNumber();
      if (this.isRegularChar(ch)) return this.readKeyword();
      // Unexpected delimiter such as ")", ">", "{" or "}": skip it.
      this.pos++;
    }
  }
  /**
   * Read a literal string `(…)`, handling nested parentheses and escapes.
   *
   * @returns A String token with one character per decoded byte.
   */
  readLiteralString() {
    this.pos++;
    const parts = [];
    let depth = 1;
    while (this.pos < this.data.length && depth > 0) {
      const ch = this.data[this.pos];
      if (ch === 92) {
        this.pos++;
        if (this.pos >= this.data.length) break;
        const esc = this.data[this.pos];
        switch (esc) {
          case 110:
            parts.push("\n");
            this.pos++;
            break;
          case 114:
            parts.push("\r");
            this.pos++;
            break;
          case 116:
            // "\t" is a horizontal tab (0x09), not a space.
            parts.push("\t");
            this.pos++;
            break;
          case 98:
            parts.push("\b");
            this.pos++;
            break;
          case 102:
            parts.push("\f");
            this.pos++;
            break;
          case 40:
            parts.push("(");
            this.pos++;
            break;
          case 41:
            parts.push(")");
            this.pos++;
            break;
          case 92:
            parts.push("\\");
            this.pos++;
            break;
          case 10:
            // Backslash + end-of-line: line continuation, emits nothing.
            this.pos++;
            break;
          case 13:
            this.pos++;
            if (this.pos < this.data.length && this.data[this.pos] === 10) this.pos++;
            break;
          default:
            if (esc >= 48 && esc <= 55) {
              // Octal escape: one to three octal digits, kept to 8 bits.
              let octal = esc - 48;
              this.pos++;
              for (let extra = 0; extra < 2 && this.pos < this.data.length; extra++) {
                const digit = this.data[this.pos];
                if (digit < 48 || digit > 55) break;
                octal = octal * 8 + (digit - 48);
                this.pos++;
              }
              parts.push(String.fromCharCode(octal & 255));
            } else {
              // Unknown escape: the backslash is dropped.
              parts.push(String.fromCharCode(esc));
              this.pos++;
            }
            break;
        }
      } else if (ch === 40) {
        depth++;
        parts.push("(");
        this.pos++;
      } else if (ch === 41) {
        depth--;
        if (depth > 0) parts.push(")");
        this.pos++;
      } else {
        parts.push(String.fromCharCode(ch));
        this.pos++;
      }
    }
    return {
      type: TokenType.String,
      value: parts.join("")
    };
  }
  /**
   * Read a hex string `<…>`.
   *
   * Whitespace and non-hex bytes are ignored; an odd final digit is
   * treated as if followed by `0`.
   *
   * @returns A HexString token with one character per decoded byte.
   */
  readHexString() {
    this.pos++;
    // Characters are collected one by one instead of spreading a byte
    // array into String.fromCharCode, which fails on very long strings.
    const parts = [];
    let hi = -1;
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (ch === 62) {
        this.pos++;
        break;
      }
      if (this.isWhitespace(ch)) {
        this.pos++;
        continue;
      }
      const v = hexVal[ch];
      if (v === -1) {
        this.pos++;
        continue;
      }
      if (hi === -1) hi = v;
      else {
        parts.push(String.fromCharCode(hi << 4 | v));
        hi = -1;
      }
      this.pos++;
    }
    if (hi !== -1) parts.push(String.fromCharCode(hi << 4));
    return {
      type: TokenType.HexString,
      value: parts.join("")
    };
  }
  /**
   * Read a PDF name `/…`, decoding `#xx` hex escapes.
   *
   * @returns A Name token wrapping `PdfName.of("/" + decodedName)`.
   */
  readName() {
    this.pos++;
    const parts = ["/"];
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (this.isWhitespace(ch) || this.isDelimiter(ch)) break;
      if (ch === 35 && this.pos + 2 < this.data.length) {
        const hi = hexVal[this.data[this.pos + 1]];
        const lo = hexVal[this.data[this.pos + 2]];
        if (hi !== -1 && lo !== -1) {
          parts.push(String.fromCharCode(hi << 4 | lo));
          this.pos += 3;
          continue;
        }
      }
      parts.push(String.fromCharCode(ch));
      this.pos++;
    }
    return {
      type: TokenType.Name,
      value: PdfName.of(parts.join(""))
    };
  }
  /**
   * Read a numeric value (integer or real).
   *
   * Accepts an optional sign, digits and at most one decimal point.
   *
   * @returns A Number token; text that does not parse (a lone sign or
   *   dot) yields `0`.
   */
  readNumber() {
    const start = this.pos;
    let hasDecimal = false;
    if (this.data[this.pos] === 43 || this.data[this.pos] === 45) this.pos++;
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (ch === 46) {
        if (hasDecimal) break;
        hasDecimal = true;
        this.pos++;
      } else if (ch >= 48 && ch <= 57) this.pos++;
      else break;
    }
    const value = Number.parseFloat(this.decodeAscii(start, this.pos));
    return {
      type: TokenType.Number,
      value: Number.isNaN(value) ? 0 : value
    };
  }
  /**
   * Read a keyword — an operator name or one of the special keywords
   * `true`, `false`, `null`.
   */
  readKeyword() {
    const start = this.pos;
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (this.isWhitespace(ch) || this.isDelimiter(ch)) break;
      this.pos++;
    }
    const word = this.decodeAscii(start, this.pos);
    if (word === "true") return {
      type: TokenType.Bool,
      value: true
    };
    if (word === "false") return {
      type: TokenType.Bool,
      value: false
    };
    if (word === "null") return {
      type: TokenType.Null,
      value: null
    };
    return {
      type: TokenType.Operator,
      value: word
    };
  }
  /** PDF whitespace characters: NUL, TAB, LF, FF, CR, SPACE. */
  isWhitespace(ch) {
    return ch === 0 || ch === 9 || ch === 10 || ch === 12 || ch === 13 || ch === 32;
  }
  /** PDF delimiter characters: ( ) < > [ ] { } / %. */
  isDelimiter(ch) {
    return ch === 40 || ch === 41 || ch === 60 || ch === 62 || ch === 91 || ch === 93 || ch === 123 || ch === 125 || ch === 47 || ch === 37;
  }
  /** Whether a character can begin a number: a digit, `+`, `-` or `.`. */
  isNumberStart(ch) {
    return ch >= 48 && ch <= 57 || ch === 43 || ch === 45 || ch === 46;
  }
  /** Whether a character is a regular (non-whitespace, non-delimiter) character. */
  isRegularChar(ch) {
    return !this.isWhitespace(ch) && !this.isDelimiter(ch);
  }
  /** Skip whitespace. */
  skipWhitespace() {
    while (this.pos < this.data.length && this.isWhitespace(this.data[this.pos])) this.pos++;
  }
  /** Skip whitespace and `%` comments (a comment runs to the next CR or LF). */
  skipWhitespaceAndComments() {
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (this.isWhitespace(ch)) this.pos++;
      else if (ch === 37) {
        this.pos++;
        while (this.pos < this.data.length) {
          const c = this.data[this.pos];
          if (c === 10 || c === 13) break;
          this.pos++;
        }
      } else break;
    }
  }
  /**
   * Decode a slice of the data as ASCII text (one character per byte).
   *
   * Long slices are converted in chunks because passing a very large
   * array to `String.fromCharCode.apply` throws a RangeError.
   */
  decodeAscii(start, end) {
    const CHUNK = 8192;
    if (end - start <= CHUNK) return String.fromCharCode.apply(null, this.data.subarray(start, end));
    let text = "";
    for (let from = start; from < end; from += CHUNK) {
      const to = Math.min(from + CHUNK, end);
      text += String.fromCharCode.apply(null, this.data.subarray(from, to));
    }
    return text;
  }
};
|
|
1263
|
+
|
|
1264
|
+
//#endregion
|
|
1265
|
+
//#region src/assets/image/dpiAnalyze.ts
|
|
1266
|
+
/**
 * Compute the effective DPI of an image from its pixel dimensions and
 * its display dimensions in points.
 *
 * PDF uses 72 points per inch, so per axis:
 * ```
 * DPI = imagePixels / displayPoints * 72
 * ```
 * An axis whose display size is not greater than zero reports `Infinity`.
 *
 * @param imageWidth - Image width in pixels.
 * @param imageHeight - Image height in pixels.
 * @param displayWidth - Display width in PDF points (1/72 inch).
 * @param displayHeight - Display height in PDF points (1/72 inch).
 * @returns `{ xDpi, yDpi, effectiveDpi }`, where `effectiveDpi` is the
 *   smaller of the two axis values.
 *
 * @example
 * ```ts
 * import { computeImageDpi } from 'modern-pdf-lib';
 *
 * // A 3000×2000 image displayed at 4.17×2.78 inches (300×200 points)
 * const dpi = computeImageDpi(3000, 2000, 300, 200);
 * console.log(dpi.effectiveDpi); // 720
 * ```
 */
function computeImageDpi(imageWidth, imageHeight, displayWidth, displayHeight) {
  const POINTS_PER_INCH = 72;
  const axisDpi = (pixels, points) => {
    if (points > 0) return pixels / points * POINTS_PER_INCH;
    return Infinity;
  };
  const xDpi = axisDpi(imageWidth, displayWidth);
  const yDpi = axisDpi(imageHeight, displayHeight);
  const effectiveDpi = Math.min(xDpi, yDpi);
  return { xDpi, yDpi, effectiveDpi };
}
|
|
1299
|
+
/**
 * Compute the pixel dimensions an image should be scaled down to so that
 * it does not exceed a maximum DPI at a given display size.
 *
 * Both axes are scaled by the same factor (`maxDpi / effectiveDpi`),
 * rounded, and never reduced below one pixel.
 *
 * @param imageWidth - Current image width in pixels.
 * @param imageHeight - Current image height in pixels.
 * @param displayWidth - Display width in PDF points.
 * @param displayHeight - Display height in PDF points.
 * @param maxDpi - Maximum allowed DPI.
 * @returns `{ width, height, downscaled }`. The original dimensions are
 *   returned with `downscaled: false` when the image is already within
 *   the limit or its effective DPI is not finite.
 */
function computeTargetDimensions(imageWidth, imageHeight, displayWidth, displayHeight, maxDpi) {
  const { effectiveDpi } = computeImageDpi(imageWidth, imageHeight, displayWidth, displayHeight);
  const withinLimit = effectiveDpi <= maxDpi;
  if (withinLimit || !Number.isFinite(effectiveDpi)) {
    return {
      width: imageWidth,
      height: imageHeight,
      downscaled: false
    };
  }
  const ratio = maxDpi / effectiveDpi;
  const shrink = (pixels) => Math.max(1, Math.round(pixels * ratio));
  return {
    width: shrink(imageWidth),
    height: shrink(imageHeight),
    downscaled: true
  };
}
|
|
1325
|
+
|
|
1326
|
+
//#endregion
|
|
1327
|
+
//#region src/assets/image/compressionAnalysis.ts
|
|
1328
|
+
/**
 * Determine the human-readable format name from filter names.
 *
 * The first filter (in list order) that is a known PDF stream filter
 * decides the result. An empty list means uncompressed data.
 *
 * @param filters - Filter names applied to the image stream.
 * @returns A display name such as `"JPEG"` or `"FlateDecode"`; `"Raw"`
 *   for an empty list; otherwise the first entry, or `"Unknown"` when
 *   that entry is null or undefined.
 * @internal
 */
function formatFromFilters(filters) {
  if (filters.length === 0) return "Raw";
  for (const filter of filters) {
    switch (filter) {
      case "DCTDecode":
        return "JPEG";
      case "JPXDecode":
        return "JPEG2000";
      case "CCITTFaxDecode":
        return "CCITT";
      case "JBIG2Decode":
        return "JBIG2";
      case "FlateDecode":
        return "FlateDecode";
      case "LZWDecode":
        return "LZW";
      case "RunLengthDecode":
        return "RunLength";
      case "ASCIIHexDecode":
        return "ASCIIHex";
      case "ASCII85Decode":
        return "ASCII85";
      default:
        break;
    }
  }
  return filters[0] ?? "Unknown";
}
|
|
1347
|
+
/**
 * Estimate JPEG size using a heuristic when WASM is not available.
 *
 * At quality 80, JPEG is typically 10–15% of raw pixel data size for
 * photographic content. The estimate takes 12.5% of the raw size
 * (`width * height * channels` bytes) and scales it linearly with
 * `quality / 80`. The result is never smaller than 200 bytes.
 *
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @param channels - Number of colour channels (one byte each).
 * @param quality - JPEG quality setting.
 * @returns Estimated encoded size in bytes.
 * @internal
 */
function estimateJpegSizeHeuristic(width, height, channels, quality) {
  const MIN_BYTES = 200;
  const rawBytes = width * height * channels;
  const scaledBytes = rawBytes * 0.125 * (quality / 80);
  return Math.max(MIN_BYTES, Math.round(scaledBytes));
}
|
|
1363
|
+
/**
 * Determine the recommendation for an image.
 *
 * Checks are applied in priority order:
 * 1. `"grayscale"` — the pixels are grayscale but the colour space is
 *    neither `DeviceGray` nor `CalGray`;
 * 2. `"downscale"` — a finite effective DPI is known and exceeds `maxDpi`;
 * 3. `"recompress"` — the estimated savings exceed 10%;
 * 4. `"keep"` — otherwise.
 *
 * @internal
 */
function determineRecommendation(savingsPercent, isGrayscale, colorSpace, effectiveDpi, maxDpi) {
  const storedAsGray = colorSpace === "DeviceGray" || colorSpace === "CalGray";
  if (isGrayscale && !storedAsGray) return "grayscale";
  // The global isFinite is used on purpose: it coerces its argument the
  // same way the `>` comparison does.
  const exceedsMaxDpi = effectiveDpi !== undefined && isFinite(effectiveDpi) && effectiveDpi > maxDpi;
  if (exceedsMaxDpi) return "downscale";
  return savingsPercent > 10 ? "recompress" : "keep";
}
|
|
1373
|
+
/**
 * Analyze all images in a PDF and report potential savings without
 * modifying the document.
 *
 * For each image XObject with `bitsPerComponent === 8` and 1–4 channels,
 * the function estimates the JPEG-encoded size — using the WASM encoder
 * when available, or a heuristic fallback otherwise. Images outside that
 * range, and images whose stream cannot be decoded, are reported
 * unchanged with a `"keep"` recommendation and zero savings.
 *
 * @param doc - A parsed `PdfDocument`.
 * @param options - Optional `quality` (default 80) and `maxDpi`
 *   (default 150) settings.
 * @returns An `AnalysisReport` with per-image and aggregate statistics.
 *
 * @example
 * ```ts
 * import { loadPdf, analyzeImages } from 'modern-pdf-lib';
 *
 * const doc = await loadPdf(pdfBytes);
 * const report = analyzeImages(doc, { quality: 75, maxDpi: 150 });
 *
 * console.log(`Total savings: ${report.totalSavingsPercent.toFixed(1)}%`);
 * for (const img of report.images) {
 *   console.log(`  ${img.name}: ${img.recommendation} (${img.savingsPercent.toFixed(1)}%)`);
 * }
 * ```
 */
function analyzeImages(doc, options) {
  const quality = options?.quality ?? 80;
  const maxDpi = options?.maxDpi ?? 150;
  const images = extractImages(doc);
  const analyses = [];
  const wasmReady = isJpegWasmReady();

  // Record for an image that is reported as-is (unsupported layout or
  // undecodable stream): estimated size equals the current size.
  const untouched = (info, currentSize, currentFormat) => ({
    name: info.name,
    pageIndex: info.pageIndex,
    width: info.width,
    height: info.height,
    currentSize,
    currentFormat,
    colorSpace: info.colorSpace,
    estimatedJpegSize: currentSize,
    estimatedSavings: 0,
    savingsPercent: 0,
    isGrayscale: false,
    effectiveDpi: undefined,
    recommendation: "keep"
  });

  for (const info of images) {
    const currentSize = info.compressedSize;
    const currentFormat = formatFromFilters(info.filters);

    if (info.bitsPerComponent !== 8 || info.channels < 1 || info.channels > 4) {
      analyses.push(untouched(info, currentSize, currentFormat));
      continue;
    }

    let pixels;
    try {
      pixels = decodeImageStream(info);
    } catch {
      // Best effort: an image that cannot be decoded is simply kept.
      analyses.push(untouched(info, currentSize, currentFormat));
      continue;
    }

    // Evaluated lazily so `pixels` is only touched where it is needed.
    const hasAllPixels = () => pixels.length >= info.width * info.height * info.channels;

    let grayscale = false;
    const multiChannel = info.channels === 3 || info.channels === 4;
    if (multiChannel && hasAllPixels()) {
      grayscale = isGrayscaleImage(pixels, info.width, info.height, info.channels);
    } else if (info.channels === 1) {
      grayscale = true;
    }

    // Prefer a real encode for the size; fall back to the heuristic when
    // WASM is unavailable, the buffer is short, or encoding yields nothing.
    const encoded = wasmReady && hasAllPixels()
      ? encodeJpegWasm(pixels, info.width, info.height, info.channels <= 4 ? info.channels : 3, quality)
      : null;
    const estimatedJpegSize = encoded
      ? encoded.length
      : estimateJpegSizeHeuristic(info.width, info.height, info.channels, quality);

    const estimatedSavings = Math.max(0, currentSize - estimatedJpegSize);
    const savingsPercent = currentSize > 0 ? estimatedSavings / currentSize * 100 : 0;
    // NOTE(review): the pixel dimensions are passed as the display size, so
    // this evaluates to 72 whenever width and height are positive — confirm
    // whether the on-page placement size was intended here.
    const { effectiveDpi } = computeImageDpi(info.width, info.height, info.width, info.height);
    const recommendation = determineRecommendation(savingsPercent, grayscale, info.colorSpace, effectiveDpi, maxDpi);

    analyses.push({
      name: info.name,
      pageIndex: info.pageIndex,
      width: info.width,
      height: info.height,
      currentSize,
      currentFormat,
      colorSpace: info.colorSpace,
      estimatedJpegSize,
      estimatedSavings,
      savingsPercent,
      isGrayscale: grayscale,
      effectiveDpi,
      recommendation
    });
  }

  let totalCurrentSize = 0;
  let totalEstimatedSize = 0;
  for (const entry of analyses) {
    totalCurrentSize += entry.currentSize;
    totalEstimatedSize += entry.estimatedJpegSize;
  }
  const totalSavings = Math.max(0, totalCurrentSize - totalEstimatedSize);
  const totalSavingsPercent = totalCurrentSize > 0 ? totalSavings / totalCurrentSize * 100 : 0;

  return {
    images: analyses,
    totalCurrentSize,
    totalEstimatedSize,
    totalSavings,
    totalSavingsPercent
  };
}
|
|
1487
|
+
|
|
1488
|
+
//#endregion
|
|
1489
|
+
export { extractText as a, parseContentStream as i, computeImageDpi as n, extractTextWithPositions as o, computeTargetDimensions as r, analyzeImages as t };
|
|
1490
|
+
//# sourceMappingURL=compressionAnalysis-Bw2alOxt.mjs.map
|