modern-pdf-lib 0.15.0 → 0.19.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +106 -7
  2. package/dist/batchOptimize-7U_kD3_j.mjs +392 -0
  3. package/dist/batchOptimize-xo6BXbGZ.cjs +427 -0
  4. package/dist/{bridge-C7U4E7St.mjs → bridge-DTH5LMAK.mjs} +3 -3
  5. package/dist/{bridge-DUcJFVsk.cjs → bridge-DYCQzxF7.cjs} +2 -2
  6. package/dist/browser.cjs +621 -0
  7. package/dist/browser.d.cts +190 -0
  8. package/dist/browser.d.cts.map +1 -0
  9. package/dist/browser.d.mts +190 -0
  10. package/dist/browser.d.mts.map +1 -0
  11. package/dist/browser.mjs +212 -0
  12. package/dist/cli/index.cjs +247 -0
  13. package/dist/cli/index.d.cts +1 -0
  14. package/dist/cli/index.d.mts +1 -0
  15. package/dist/cli/index.mjs +248 -0
  16. package/dist/compressionAnalysis-BBv4BkQP.d.mts +261 -0
  17. package/dist/compressionAnalysis-BBv4BkQP.d.mts.map +1 -0
  18. package/dist/compressionAnalysis-Bw2alOxt.mjs +1490 -0
  19. package/dist/compressionAnalysis-CtJ2X9l2.d.cts +261 -0
  20. package/dist/compressionAnalysis-CtJ2X9l2.d.cts.map +1 -0
  21. package/dist/compressionAnalysis-eXYyDsrh.cjs +1525 -0
  22. package/dist/create.cjs +35 -0
  23. package/dist/create.d.cts +3 -0
  24. package/dist/create.d.mts +3 -0
  25. package/dist/create.mjs +5 -0
  26. package/dist/deduplicateImages-B5lmzL9j.cjs +113 -0
  27. package/dist/deduplicateImages-BX3Zg8Qp.mjs +102 -0
  28. package/dist/{fflateAdapter-DX0VqT5k.mjs → fflateAdapter-CBQpGTlx.mjs} +2 -2
  29. package/dist/{fflateAdapter-AHC_S3cb.cjs → fflateAdapter-LTAeAhaD.cjs} +1 -1
  30. package/dist/fieldAppearance-C8PoLFSc.d.mts +136 -0
  31. package/dist/fieldAppearance-C8PoLFSc.d.mts.map +1 -0
  32. package/dist/fieldAppearance-CdiGFG5e.d.cts +136 -0
  33. package/dist/fieldAppearance-CdiGFG5e.d.cts.map +1 -0
  34. package/dist/fontEmbed-Dsu9fo4U.d.mts +636 -0
  35. package/dist/fontEmbed-Dsu9fo4U.d.mts.map +1 -0
  36. package/dist/fontEmbed-LID6yG6g.d.cts +636 -0
  37. package/dist/fontEmbed-LID6yG6g.d.cts.map +1 -0
  38. package/dist/{fontSubset-pFc8Dueu.cjs → fontSubset-5SLWMmEw.cjs} +1 -1
  39. package/dist/{fontSubset-ZpLoOZ2e.mjs → fontSubset-DWpduoY2.mjs} +2 -2
  40. package/dist/forms.cjs +13 -0
  41. package/dist/forms.d.cts +3 -0
  42. package/dist/forms.d.mts +3 -0
  43. package/dist/forms.mjs +3 -0
  44. package/dist/grayscaleDetect-C2m-eEXR.cjs +96 -0
  45. package/dist/grayscaleDetect-C6kFF3dk.mjs +84 -0
  46. package/dist/imageExtract-B6OvUEp-.mjs +155 -0
  47. package/dist/imageExtract-PxdBvpHj.cjs +166 -0
  48. package/dist/index-BtYOx5wh.d.mts +4904 -0
  49. package/dist/index-BtYOx5wh.d.mts.map +1 -0
  50. package/dist/index-bpktKzCA.d.cts +4904 -0
  51. package/dist/index-bpktKzCA.d.cts.map +1 -0
  52. package/dist/index.cjs +288 -25851
  53. package/dist/index.d.cts +7 -9151
  54. package/dist/index.d.mts +7 -9151
  55. package/dist/index.mjs +17 -25665
  56. package/dist/layout-BZ8tTeAk.mjs +438 -0
  57. package/dist/layout-Inbqegsk.cjs +563 -0
  58. package/dist/{libdeflateWasm-Enus0G1k.cjs → libdeflateWasm-BdiDEJOj.cjs} +2 -2
  59. package/dist/{libdeflateWasm-82loOtIV.mjs → libdeflateWasm-rLppXytE.mjs} +3 -3
  60. package/dist/loader-3u6Tw5T-.mjs +328 -0
  61. package/dist/loader-I4zdkoWc.cjs +393 -0
  62. package/dist/parse.cjs +24 -0
  63. package/dist/parse.d.cts +4 -0
  64. package/dist/parse.d.mts +4 -0
  65. package/dist/parse.mjs +7 -0
  66. package/dist/pdfCatalog-CYy4NXEY.cjs +173 -0
  67. package/dist/pdfCatalog-IImGcMbR.mjs +138 -0
  68. package/dist/pdfDocument-BSiQdNZq.d.cts +4640 -0
  69. package/dist/pdfDocument-BSiQdNZq.d.cts.map +1 -0
  70. package/dist/pdfDocument-DOg240g9.mjs +13685 -0
  71. package/dist/pdfDocument-Duf9LelM.cjs +14110 -0
  72. package/dist/pdfDocument-i6U5fQ91.d.mts +4640 -0
  73. package/dist/pdfDocument-i6U5fQ91.d.mts.map +1 -0
  74. package/dist/pdfForm-9gd40uz9.cjs +1796 -0
  75. package/dist/pdfForm-BiyNtYem.d.mts +905 -0
  76. package/dist/pdfForm-BiyNtYem.d.mts.map +1 -0
  77. package/dist/pdfForm-Cn-cVicP.mjs +1695 -0
  78. package/dist/pdfForm-SOXJ72LW.d.cts +905 -0
  79. package/dist/pdfForm-SOXJ72LW.d.cts.map +1 -0
  80. package/dist/{pdfCatalog-COKoYQ8C.cjs → pdfObjects-1veop1_d.cjs} +2 -172
  81. package/dist/{pdfCatalog-BB2Wnmud.mjs → pdfObjects-uEsWlfzU.mjs} +3 -138
  82. package/dist/{pdfPage-N1K2U3jI.mjs → pdfPage-BacMkrLe.mjs} +3024 -4
  83. package/dist/{pdfPage-DBfdinTR.cjs → pdfPage-CirlQRzJ.cjs} +3148 -104
  84. package/dist/{pngEmbed-gaJ9S2Dk.mjs → pngEmbed-BLj2zi-5.mjs} +3 -3
  85. package/dist/{pngEmbed-10m4CfBU.cjs → pngEmbed-D4X4ZN-3.cjs} +2 -2
  86. package/dist/src-BLWEEbd7.cjs +11852 -0
  87. package/dist/src-x0g7wiRq.mjs +11103 -0
  88. package/dist/streamDecode-Bs0_MT_Q.cjs +4607 -0
  89. package/dist/streamDecode-CWN-nfPJ.mjs +4596 -0
  90. package/package.json +33 -1
  91. package/dist/index.d.cts.map +0 -1
  92. package/dist/index.d.mts.map +0 -1
  93. package/dist/loader-1VJXLlMZ.mjs +0 -164
  94. package/dist/loader-CKlBOHma.cjs +0 -166
  95. package/dist/rolldown-runtime-95iHPtFO.mjs +0 -18
@@ -0,0 +1,1490 @@
1
+ import { i as PdfName, l as PdfStream, o as PdfNumber, r as PdfDict } from "./pdfObjects-uEsWlfzU.mjs";
2
+ import { a as isJpegWasmReady, r as encodeJpegWasm } from "./bridge-DTH5LMAK.mjs";
3
+ import { n as extractImages, t as decodeImageStream } from "./imageExtract-B6OvUEp-.mjs";
4
+ import { n as isGrayscaleImage } from "./grayscaleDetect-C6kFF3dk.mjs";
5
+
6
+ //#region src/parser/textExtractor.ts
7
+ /**
8
+ * @module parser/textExtractor
9
+ *
10
+ * Extract text content from parsed PDF content streams. Supports both
11
+ * simple text extraction (concatenated strings) and position-aware
12
+ * extraction that tracks the text matrix to compute x/y coordinates.
13
+ *
14
+ * Handles:
15
+ * - All PDF text-showing operators: `Tj`, `TJ`, `'`, `"`
16
+ * - Text-positioning operators: `Td`, `TD`, `Tm`, `T*`
17
+ * - Font selection: `Tf`
18
+ * - Graphics state: `q`/`Q`, `cm`
19
+ * - WinAnsiEncoding (standard single-byte)
20
+ * - Identity-H CID fonts with ToUnicode CMap
21
+ *
22
+ * Reference: PDF 1.7 spec, §9 (Text).
23
+ *
24
+ * @packageDocumentation
25
+ */
26
/**
 * Collapse a sequence of parsed content-stream operators into plain text.
 *
 * Decoded strings from the text-showing operators (`Tj`, `TJ`, `'`, `"`)
 * are appended in stream order. A newline is emitted when a line-advancing
 * operator (`T*`, `'`, `"`, or a `Td`/`TD` whose vertical offset exceeds
 * 0.5) follows text on the current line. A single space is emitted at a
 * `BT` that follows text, and at a `Td` whose horizontal offset exceeds
 * 30% of the current font size.
 *
 * @param operators - Parsed content-stream operators.
 * @param resources - Optional page `/Resources` dictionary; passed to
 *   `TextState`.
 * @param options - Extraction options. When `options.withPositions` is
 *   truthy, extraction is delegated to `extractTextWithPositions` and the
 *   item texts are joined with single spaces.
 * @returns The extracted text as a single string.
 */
function extractText(operators, resources, options) {
	if (options?.withPositions) {
		const positioned = extractTextWithPositions(operators, resources);
		return positioned.map((item) => item.text).join(" ");
	}

	const state = new TextState(resources);
	const chunks = [];
	let lineHasContent = false;

	// Emit a newline only if something was written since the last break.
	const breakLine = () => {
		if (!lineHasContent) return;
		chunks.push("\n");
		lineHasContent = false;
	};
	// Append decoded text, ignoring empty results.
	const emit = (text) => {
		if (text.length === 0) return;
		chunks.push(text);
		lineHasContent = true;
	};
	// Read the six matrix operands (a b c d e f) in order.
	const matrixOperands = (args) => [0, 1, 2, 3, 4, 5].map((index) => operandAsNumber(args[index]));

	const handlers = new Map([
		["BT", () => {
			state.resetTextMatrix();
			if (chunks.length > 0 && lineHasContent) chunks.push(" ");
			lineHasContent = false;
		}],
		["Tf", (args) => {
			state.setFont(operandAsString(args[0]), operandAsNumber(args[1]));
		}],
		["Tc", (args) => {
			state.charSpacing = operandAsNumber(args[0]);
		}],
		["Tw", (args) => {
			state.wordSpacing = operandAsNumber(args[0]);
		}],
		["TL", (args) => {
			state.leading = operandAsNumber(args[0]);
		}],
		["Tz", (args) => {
			state.horizontalScaling = operandAsNumber(args[0]);
		}],
		["Ts", (args) => {
			state.rise = operandAsNumber(args[0]);
		}],
		["Td", (args) => {
			const dx = operandAsNumber(args[0]);
			const dy = operandAsNumber(args[1]);
			state.moveText(dx, dy);
			if (!lineHasContent) return;
			if (Math.abs(dy) > .5) breakLine();
			else if (Math.abs(dx) > state.fontSize * .3) chunks.push(" ");
		}],
		["TD", (args) => {
			const dx = operandAsNumber(args[0]);
			const dy = operandAsNumber(args[1]);
			// TD also sets the leading to the negated vertical offset.
			state.leading = -dy;
			state.moveText(dx, dy);
			if (Math.abs(dy) > .5) breakLine();
		}],
		["Tm", (args) => {
			state.setTextMatrix(...matrixOperands(args));
		}],
		["T*", () => {
			state.nextLine();
			breakLine();
		}],
		["Tj", (args) => {
			emit(state.decodeString(args[0]));
		}],
		["TJ", (args) => {
			emit(state.decodeTJArray(args[0]));
		}],
		["'", (args) => {
			state.nextLine();
			breakLine();
			emit(state.decodeString(args[0]));
		}],
		["\"", (args) => {
			state.wordSpacing = operandAsNumber(args[0]);
			state.charSpacing = operandAsNumber(args[1]);
			state.nextLine();
			breakLine();
			emit(state.decodeString(args[2]));
		}],
		["q", () => {
			state.save();
		}],
		["Q", () => {
			state.restore();
		}],
		["cm", (args) => {
			state.concatCTM(...matrixOperands(args));
		}]
	]);

	// Operators without a handler (including `ET`) are ignored.
	for (const op of operators) handlers.get(op.operator)?.(op.operands);
	return chunks.join("");
}
157
/**
 * Walk a parsed content stream and collect every shown string together
 * with where it was drawn.
 *
 * Each returned item carries `text`, `x`, `y`, `width`, `height`,
 * `fontSize` and `fontName`. Position and width come from `TextState`
 * (`getTextPosition` / `estimateWidth`); `height` is the current font size.
 * After an item is recorded the text matrix is advanced by that item's
 * estimated width.
 *
 * @param operators - Parsed content-stream operators.
 * @param resources - Optional page `/Resources` dictionary.
 * @returns An array of positioned text items, in stream order.
 */
function extractTextWithPositions(operators, resources) {
	const state = new TextState(resources);
	const items = [];

	const num = (value) => operandAsNumber(value);
	// Read the six matrix operands (a b c d e f) in order.
	const matrixOperands = (args) => [0, 1, 2, 3, 4, 5].map((index) => num(args[index]));
	// Decode one string operand and, if non-empty, record it and advance.
	const showString = (rawOperand) => {
		const text = state.decodeString(rawOperand);
		if (text.length === 0) return;
		const { x, y } = state.getTextPosition();
		items.push({
			text,
			x,
			y,
			width: state.estimateWidth(text),
			height: state.fontSize,
			fontSize: state.fontSize,
			fontName: state.fontName
		});
		state.advanceByText(text);
	};

	for (const { operator, operands } of operators) {
		if (operator === "BT") state.resetTextMatrix();
		else if (operator === "Tf") state.setFont(operandAsString(operands[0]), num(operands[1]));
		else if (operator === "Tc") state.charSpacing = num(operands[0]);
		else if (operator === "Tw") state.wordSpacing = num(operands[0]);
		else if (operator === "TL") state.leading = num(operands[0]);
		else if (operator === "Tz") state.horizontalScaling = num(operands[0]);
		else if (operator === "Ts") state.rise = num(operands[0]);
		else if (operator === "Td") state.moveText(num(operands[0]), num(operands[1]));
		else if (operator === "TD") {
			const dx = num(operands[0]);
			const dy = num(operands[1]);
			// TD also sets the leading to the negated vertical offset.
			state.leading = -dy;
			state.moveText(dx, dy);
		} else if (operator === "Tm") state.setTextMatrix(...matrixOperands(operands));
		else if (operator === "T*") state.nextLine();
		else if (operator === "Tj") showString(operands[0]);
		else if (operator === "TJ") {
			const elements = operands[0];
			// A non-array TJ operand is ignored here.
			if (Array.isArray(elements)) {
				for (const element of elements) {
					// Numeric elements are kerning adjustments, applied negated.
					if (typeof element === "number") state.advanceByDisplacement(-element);
					else showString(element);
				}
			}
		} else if (operator === "'") {
			state.nextLine();
			showString(operands[0]);
		} else if (operator === "\"") {
			state.wordSpacing = num(operands[0]);
			state.charSpacing = num(operands[1]);
			state.nextLine();
			showString(operands[2]);
		} else if (operator === "q") state.save();
		else if (operator === "Q") state.restore();
		else if (operator === "cm") state.concatCTM(...matrixOperands(operands));
		// Every other operator (including `ET`) is ignored.
	}
	return items;
}
300
/**
 * Build a code-to-Unicode lookup from a ToUnicode CMap stream.
 *
 * The stream is decoded to text and scanned twice: first for
 * `beginbfchar` / `endbfchar` sections (single mappings), then for
 * `beginbfrange` / `endbfrange` sections (range mappings). Range entries
 * therefore overwrite single entries for the same code.
 *
 * @param data - The raw CMap stream bytes (already decompressed).
 * @returns An object whose `map` property is the populated `Map`.
 */
function parseToUnicodeCMap(data) {
	const source = decodeText(data);
	const codeToUnicode = /* @__PURE__ */ new Map();
	for (const parseSections of [parseBfCharSections, parseBfRangeSections]) {
		parseSections(source, codeToUnicode);
	}
	return { map: codeToUnicode };
}
317
/**
 * Collect every `<src> <dst>` pair found inside the
 * `beginbfchar` / `endbfchar` sections of a CMap.
 *
 * @param text - The CMap source text.
 * @param map - Destination map; receives numeric source code to decoded
 *   destination string. Later entries overwrite earlier ones.
 */
function parseBfCharSections(text, map) {
	const sections = text.matchAll(/beginbfchar\s*([\s\S]*?)\s*endbfchar/g);
	for (const [, sectionBody] of sections) {
		const pairs = sectionBody.matchAll(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/g);
		for (const [, srcHex, dstHex] of pairs) {
			map.set(parseInt(srcHex, 16), hexToUnicode(dstHex));
		}
	}
}
334
/**
 * Parse all `beginbfrange`/`endbfrange` sections in a CMap.
 *
 * Two entry forms are recognised:
 * - `<lo> <hi> <dst>`: code `lo + k` maps to `dst` advanced by `k`.
 * - `<lo> <hi> [<d0> <d1> ...]`: code `lo + k` maps to the k-th array item.
 *
 * A `<dst>` whose numeric value fits in a Unicode code point is advanced as
 * a code point. A longer `<dst>` (a surrogate pair such as `<D83DDE00>` or
 * a multi-character string such as `<00660069>`) is decoded as UTF-16BE and
 * only its last code unit is advanced. Previously such destinations were
 * passed to `String.fromCodePoint` as one number, which throws
 * `RangeError` and caused the caller to drop the whole CMap.
 *
 * Mapping for an entry stops (without throwing) once the destination would
 * pass U+10FFFF / 0xFFFF, and a single entry never maps more than 65536
 * codes, so a malformed `<hi>` cannot cause a huge loop.
 *
 * @param text - The CMap source text.
 * @param map - Destination map; receives numeric source code to decoded
 *   destination string. Later entries overwrite earlier ones.
 */
function parseBfRangeSections(text, map) {
	/** Largest valid Unicode code point. */
	const MAX_CODE_POINT = 1114111;
	/** Largest UTF-16 code unit. */
	const MAX_CODE_UNIT = 65535;
	/** Upper bound on the number of codes a single range entry may map. */
	const MAX_RANGE_SPAN = 65536;

	const sections = text.matchAll(/beginbfrange\s*([\s\S]*?)\s*endbfrange/g);
	for (const [, sectionBody] of sections) {
		const entries = sectionBody.matchAll(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>\s*(?:<([0-9a-fA-F]+)>|\[([\s\S]*?)\])/g);
		for (const [, lowHex, highHex, dstHex, dstArray] of entries) {
			const srcLow = parseInt(lowHex, 16);
			const srcHigh = Math.min(parseInt(highHex, 16), srcLow + MAX_RANGE_SPAN - 1);
			if (dstHex) {
				const dstCode = parseInt(dstHex, 16);
				if (dstCode <= MAX_CODE_POINT) {
					// Single code point: advance the code point itself.
					for (let code = srcLow, codePoint = dstCode; code <= srcHigh && codePoint <= MAX_CODE_POINT; code++, codePoint++) {
						map.set(code, String.fromCodePoint(codePoint));
					}
				} else {
					// Multi-unit UTF-16BE destination: keep the leading units and
					// advance only the final one.
					const base = hexToUnicode(dstHex);
					const prefix = base.slice(0, -1);
					const lastUnit = base.charCodeAt(base.length - 1);
					for (let code = srcLow, unit = lastUnit; code <= srcHigh && unit <= MAX_CODE_UNIT; code++, unit++) {
						map.set(code, prefix + String.fromCharCode(unit));
					}
				}
			} else if (dstArray) {
				let code = srcLow;
				for (const [, itemHex] of dstArray.matchAll(/<([0-9a-fA-F]+)>/g)) {
					// Extra array items beyond the declared range are ignored.
					if (code > srcHigh) break;
					map.set(code, hexToUnicode(itemHex));
					code++;
				}
			}
		}
	}
}
365
/**
 * Decode a run of hex digits into a string.
 *
 * Inputs of up to four digits are read as one value. Longer inputs are
 * read four digits at a time (the final group may be shorter). Each value
 * is appended via `String.fromCodePoint`; groups that do not parse as hex
 * are skipped.
 *
 * @param hex - Hex digits without the surrounding `<` `>`.
 * @returns The decoded string.
 */
function hexToUnicode(hex) {
	const groupSize = Math.min(hex.length, 4);
	let decoded = "";
	let offset = 0;
	while (offset < hex.length) {
		const value = Number.parseInt(hex.substring(offset, offset + groupSize), 16);
		if (!Number.isNaN(value)) decoded += String.fromCodePoint(value);
		offset += groupSize;
	}
	return decoded;
}
379
/**
 * Turn one numeric Unicode code point into the string that represents it.
 *
 * @param code - The code point; passed unchanged to
 *   `String.fromCodePoint`, which throws `RangeError` for invalid values.
 * @returns The resulting string.
 */
function codePointToString(code) {
	const character = String.fromCodePoint(code);
	return character;
}
385
/**
 * WinAnsiEncoding overrides: byte value to Unicode code point for the
 * bytes in 0x80-0x9F whose meaning differs from Latin-1. Bytes not listed
 * here are decoded as their own value by `winAnsiDecode`.
 */
const WIN_ANSI_SPECIAL = {
	0x80: 0x20ac,
	0x82: 0x201a,
	0x83: 0x0192,
	0x84: 0x201e,
	0x85: 0x2026,
	0x86: 0x2020,
	0x87: 0x2021,
	0x88: 0x02c6,
	0x89: 0x2030,
	0x8a: 0x0160,
	0x8b: 0x2039,
	0x8c: 0x0152,
	0x8e: 0x017d,
	0x91: 0x2018,
	0x92: 0x2019,
	0x93: 0x201c,
	0x94: 0x201d,
	0x95: 0x2022,
	0x96: 0x2013,
	0x97: 0x2014,
	0x98: 0x02dc,
	0x99: 0x2122,
	0x9a: 0x0161,
	0x9b: 0x203a,
	0x9c: 0x0153,
	0x9e: 0x017e,
	0x9f: 0x0178
};
/**
 * Decode one byte value using WinAnsiEncoding.
 *
 * @param byte - The character code to decode.
 * @returns The override character when `byte` is listed in
 *   `WIN_ANSI_SPECIAL`, otherwise `String.fromCharCode(byte)`.
 */
function winAnsiDecode(byte) {
	const hasOverride = Object.hasOwn(WIN_ANSI_SPECIAL, byte);
	return hasOverride ? String.fromCodePoint(WIN_ANSI_SPECIAL[byte]) : String.fromCharCode(byte);
}
425
/**
 * Create a fresh identity matrix in 6-element `[a, b, c, d, e, f]` form.
 *
 * @returns A new array on every call, so callers may mutate it freely.
 */
function identityMatrix() {
	return [1, 0, 0, 1, 0, 0];
}
436
/**
 * Compute `A * B` for two affine matrices given in 6-element
 * `[a, b, c, d, e, f]` form (the implicit third column is `0 0 1`).
 *
 * @param a - Left operand.
 * @param b - Right operand.
 * @returns A new 6-element array holding the product.
 */
function multiplyMatrices(a, b) {
	const [a0, a1, a2, a3, a4, a5] = a;
	const [b0, b1, b2, b3, b4, b5] = b;
	const scaleX = a0 * b0 + a1 * b2;
	const shearY = a0 * b1 + a1 * b3;
	const shearX = a2 * b0 + a3 * b2;
	const scaleY = a2 * b1 + a3 * b3;
	const translateX = a4 * b0 + a5 * b2 + b4;
	const translateY = a4 * b1 + a5 * b3 + b5;
	return [scaleX, shearY, shearX, scaleY, translateX, translateY];
}
450
/**
 * Mutable graphics/text state used while walking a content stream for
 * text extraction: matrices, font selection, spacing parameters, the
 * `q`/`Q` stack, and per-font decoding caches.
 */
var TextState = class {
	/** Current transformation matrix (CTM). */
	ctm = identityMatrix();
	/** Text matrix; set by Tm and moved by Td/TD/T* and text advances. */
	textMatrix = identityMatrix();
	/** Text line matrix; the text matrix as of the start of the line. */
	textLineMatrix = identityMatrix();
	/** Current font resource name (stored with a leading `/`). */
	fontName = "";
	/** Current font size. */
	fontSize = 12;
	/** Character spacing (Tc). */
	charSpacing = 0;
	/** Word spacing (Tw). */
	wordSpacing = 0;
	/** Horizontal scaling (Tz) as a percentage (100 = normal). */
	horizontalScaling = 100;
	/** Text leading (TL). */
	leading = 0;
	/** Text rise (Ts). */
	rise = 0;
	/** Saved states pushed by `save()` and popped by `restore()`. */
	stateStack = [];
	/** Page resources dictionary, as passed to the constructor. */
	resources;
	/** Parsed ToUnicode CMap (or `null` on parse failure) per font name. */
	cmapCache = /* @__PURE__ */ new Map();
	/** `/Encoding` name (leading `/` removed) per font name. */
	fontEncodingCache = /* @__PURE__ */ new Map();
	/** Whether each font name refers to a CID (2-byte) font. */
	cidFontCache = /* @__PURE__ */ new Map();
	/**
	 * @param resources - Optional page `/Resources` dictionary; when given,
	 *   its fonts are analysed immediately.
	 */
	constructor(resources) {
		this.resources = resources;
		if (resources) this.analyzeFonts(resources);
	}
	/**
	 * Inspect the `/Font` entry of the resources and fill the three caches.
	 * Entries that are not `PdfDict` instances are skipped.
	 */
	analyzeFonts(resources) {
		const fontDict = resources.get("/Font");
		if (!(fontDict instanceof PdfDict)) return;
		for (const [resourceName, font] of fontDict) {
			if (!(font instanceof PdfDict)) continue;

			const subtype = font.get("/Subtype");
			const usesCids = subtype instanceof PdfName && [
				"/Type0",
				"/CIDFontType0",
				"/CIDFontType2"
			].includes(subtype.value);
			this.cidFontCache.set(resourceName, usesCids);

			const encoding = font.get("/Encoding");
			if (encoding instanceof PdfName) {
				const encodingName = encoding.value.replace(/^\//, "");
				this.fontEncodingCache.set(resourceName, encodingName);
			}

			const toUnicode = font.get("/ToUnicode");
			if (!(toUnicode instanceof PdfStream)) continue;
			// A CMap that fails to parse is remembered as `null`.
			let parsedCMap = null;
			try {
				parsedCMap = parseToUnicodeCMap(toUnicode.data);
			} catch {
				parsedCMap = null;
			}
			this.cmapCache.set(resourceName, parsedCMap);
		}
	}
	/**
	 * Push a snapshot of the CTM and text parameters (q). The text matrices
	 * are not part of the snapshot.
	 */
	save() {
		const { fontName, fontSize, charSpacing, wordSpacing, horizontalScaling, leading, rise } = this;
		this.stateStack.push({
			ctm: [...this.ctm],
			fontName,
			fontSize,
			charSpacing,
			wordSpacing,
			horizontalScaling,
			leading,
			rise
		});
	}
	/**
	 * Pop the most recent snapshot and apply it (Q). Does nothing when the
	 * stack is empty.
	 */
	restore() {
		const snapshot = this.stateStack.pop();
		if (!snapshot) return;
		({
			ctm: this.ctm,
			fontName: this.fontName,
			fontSize: this.fontSize,
			charSpacing: this.charSpacing,
			wordSpacing: this.wordSpacing,
			horizontalScaling: this.horizontalScaling,
			leading: this.leading,
			rise: this.rise
		} = snapshot);
	}
	/**
	 * Pre-multiply the CTM by the given matrix (cm).
	 */
	concatCTM(a, b, c, d, e, f) {
		const incoming = [a, b, c, d, e, f];
		this.ctm = multiplyMatrices(incoming, this.ctm);
	}
	/**
	 * Reset both text matrices to identity (called at BT).
	 */
	resetTextMatrix() {
		this.textMatrix = identityMatrix();
		this.textLineMatrix = identityMatrix();
	}
	/**
	 * Select the font and size (Tf). The name is stored with a leading `/`.
	 */
	setFont(name, size) {
		const alreadyPrefixed = name.startsWith("/");
		this.fontName = alreadyPrefixed ? name : `/${name}`;
		this.fontSize = size;
	}
	/**
	 * Translate the text line matrix by (tx, ty) and restart the text matrix
	 * from it (Td).
	 */
	moveText(tx, ty) {
		const translation = [1, 0, 0, 1, tx, ty];
		this.textLineMatrix = multiplyMatrices(translation, this.textLineMatrix);
		this.textMatrix = [...this.textLineMatrix];
	}
	/**
	 * Replace both text matrices with the given values (Tm).
	 */
	setTextMatrix(a, b, c, d, e, f) {
		const matrix = [a, b, c, d, e, f];
		this.textMatrix = matrix;
		// The line matrix gets its own copy so the two never alias.
		this.textLineMatrix = [...matrix];
	}
	/**
	 * Move to the start of the next line (T*); same as `0 -leading Td`.
	 */
	nextLine() {
		this.moveText(0, -this.leading);
	}
	/**
	 * Current text origin: the translation part of `textMatrix * ctm`.
	 */
	getTextPosition() {
		const [, , , , x, y] = multiplyMatrices(this.textMatrix, this.ctm);
		return { x, y };
	}
	/**
	 * Rough width estimate: half the font size per UTF-16 unit of `text`,
	 * scaled by the horizontal scaling. No font metrics are consulted.
	 */
	estimateWidth(text) {
		const horizontalFactor = this.horizontalScaling / 100;
		return text.length * this.fontSize * .5 * horizontalFactor;
	}
	/**
	 * Shift the text matrix right by the estimated width of `text`.
	 */
	advanceByText(text) {
		const shift = [1, 0, 0, 1, this.estimateWidth(text), 0];
		this.textMatrix = multiplyMatrices(shift, this.textMatrix);
	}
	/**
	 * Shift the text matrix by a TJ displacement, given in thousandths and
	 * scaled by the font size and horizontal scaling.
	 */
	advanceByDisplacement(displacement) {
		const dx = displacement / 1e3 * this.fontSize * (this.horizontalScaling / 100);
		this.textMatrix = multiplyMatrices([1, 0, 0, 1, dx, 0], this.textMatrix);
	}
	/**
	 * Decode a string operand with the current font.
	 *
	 * `null`, `undefined` and numbers yield `""`. Otherwise the cached
	 * ToUnicode CMap is used when present; failing that, CID fonts are read
	 * as 2-byte codes and everything else as WinAnsiEncoding.
	 */
	decodeString(operand) {
		if (operand == null || typeof operand === "number") return "";
		const raw = String(operand);
		const cmap = this.cmapCache.get(this.fontName);
		const usesCids = this.cidFontCache.get(this.fontName) ?? false;
		if (cmap) return this.decodeWithCMap(raw, cmap, usesCids);
		return usesCids ? this.decodeCIDString(raw) : this.decodeWinAnsi(raw);
	}
	/**
	 * Decode a TJ operand. Array elements are decoded in order; a numeric
	 * element of -100 or less contributes a space. A non-array operand is
	 * decoded as a plain string.
	 */
	decodeTJArray(operand) {
		if (!Array.isArray(operand)) return this.decodeString(operand);
		let joined = "";
		for (const element of operand) {
			if (typeof element === "number") {
				if (element <= -100) joined += " ";
				continue;
			}
			joined += this.decodeString(element);
		}
		return joined;
	}
	/**
	 * Decode `raw` through a ToUnicode CMap. For CID fonts, codes are read
	 * as big-endian pairs and unmapped codes >= 32 fall back to the code
	 * itself; otherwise unmapped bytes fall back to WinAnsiEncoding.
	 */
	decodeWithCMap(raw, cmap, isCid) {
		let decoded = "";
		if (isCid) {
			let index = 0;
			while (index + 1 < raw.length) {
				const code = raw.charCodeAt(index) << 8 | raw.charCodeAt(index + 1);
				const mapped = cmap.map.get(code);
				if (mapped !== undefined) decoded += mapped;
				else if (code >= 32 && code <= 65535) decoded += String.fromCharCode(code);
				index += 2;
			}
			return decoded;
		}
		for (let index = 0; index < raw.length; index++) {
			const code = raw.charCodeAt(index);
			const mapped = cmap.map.get(code);
			decoded += mapped !== undefined ? mapped : winAnsiDecode(code);
		}
		return decoded;
	}
	/**
	 * Decode a 2-byte (big-endian) CID string without a ToUnicode CMap by
	 * treating each code >= 32 as a UTF-16 code unit. A trailing odd byte is
	 * ignored.
	 */
	decodeCIDString(raw) {
		let decoded = "";
		let index = 0;
		while (index + 1 < raw.length) {
			const code = raw.charCodeAt(index) << 8 | raw.charCodeAt(index + 1);
			if (code >= 32 && code <= 65535) decoded += String.fromCharCode(code);
			index += 2;
		}
		return decoded;
	}
	/**
	 * Decode every UTF-16 unit of `raw` with `winAnsiDecode`.
	 */
	decodeWinAnsi(raw) {
		return raw.split("").map((unit) => winAnsiDecode(unit.charCodeAt(0))).join("");
	}
};
732
/**
 * Coerce a content-stream operand to a number.
 *
 * Numbers are returned as-is, `PdfNumber` instances yield their `value`,
 * and strings are parsed with `parseFloat`. Anything else, including a
 * string that does not parse, yields 0.
 */
function operandAsNumber(operand) {
	switch (typeof operand) {
		case "number": return operand;
		case "string": {
			const parsed = Number.parseFloat(operand);
			return Number.isNaN(parsed) ? 0 : parsed;
		}
		default: return operand instanceof PdfNumber ? operand.value : 0;
	}
}
744
/**
 * Coerce a content-stream operand to a string.
 *
 * Strings are returned as-is, `PdfName` instances yield their `value`, and
 * numbers are converted with `String`. Anything else yields `""`.
 */
function operandAsString(operand) {
	switch (typeof operand) {
		case "string": return operand;
		case "number": return String(operand);
		default: return operand instanceof PdfName ? operand.value : "";
	}
}
753
/**
 * Turn raw bytes into a string using a `TextDecoder` created with the
 * `"latin1"` label. CMap sources are ASCII, so every byte of interest maps
 * to the character with the same code.
 *
 * NOTE(review): under the WHATWG Encoding Standard the `latin1` label
 * selects windows-1252, so bytes 0x80-0x9F may not decode to the same code
 * point. This is harmless for ASCII CMap text.
 *
 * @param data - The bytes to decode.
 * @returns The decoded string.
 */
function decodeText(data) {
	const singleByteDecoder = new TextDecoder("latin1");
	return singleByteDecoder.decode(data);
}
760
+
761
+ //#endregion
762
+ //#region src/parser/contentStreamParser.ts
763
+ /**
764
+ * @module parser/contentStreamParser
765
+ *
766
+ * Parse PDF content streams (the operator/operand sequences that describe
767
+ * page appearance) into a structured AST.
768
+ *
769
+ * A PDF content stream consists of a flat sequence of *operands* followed
770
+ * by an *operator*. Operands are PDF objects (numbers, strings, names,
771
+ * booleans, arrays, `null`); operators are unquoted letter sequences.
772
+ *
773
+ * Special handling is required for inline images (`BI … ID data EI`).
774
+ *
775
+ * Reference: PDF 1.7 spec, §7.8.2 (Content Streams).
776
+ *
777
+ * @packageDocumentation
778
+ */
779
/**
 * Turn the bytes of a content stream into its list of operators by
 * running a `ContentStreamLexer` over them.
 *
 * @param data - The raw content-stream bytes (already decompressed).
 * @returns The operators produced by the lexer's `parse()`.
 */
function parseContentStream(data) {
	const lexer = new ContentStreamLexer(data);
	return lexer.parse();
}
788
/**
 * Token kinds produced by the content-stream lexer. The object maps each
 * name to its ordinal and each ordinal back to its name.
 */
var TokenType = /* @__PURE__ */ ((names) => {
	const members = {};
	names.forEach((name, ordinal) => {
		members[name] = ordinal;
		members[ordinal] = name;
	});
	return members;
})([
	"Number",
	"String",
	"HexString",
	"Name",
	"Bool",
	"Null",
	"ArrayStart",
	"ArrayEnd",
	"Operator",
	"InlineImage",
	"EOF"
]);
802
/**
 * Lookup table indexed by byte value: `hexVal[b]` is 0-15 when `b` is the
 * character code of a hex digit (`0-9`, `A-F`, `a-f`), and -1 otherwise.
 */
const hexVal = /* @__PURE__ */ (() => {
	const table = new Int8Array(256).fill(-1);
	[..."0123456789ABCDEF"].forEach((digit, value) => {
		table[digit.charCodeAt(0)] = value;
		table[digit.toLowerCase().charCodeAt(0)] = value;
	});
	return table;
})();
815
/**
 * Combined lexer + parser for PDF content streams.
 *
 * Content streams are simpler than full PDF object syntax: there are no
 * indirect references, and dictionary markers (`<<` / `>>`) are simply
 * skipped, so dictionary contents surface as plain operands. `%` comments
 * are skipped between tokens.
 */
var ContentStreamLexer = class {
  /** Raw content-stream bytes being tokenized. */
  data;
  /** Index of the next unread byte in `data`. */
  pos = 0;
  constructor(data) {
    this.data = data;
  }
  /**
   * Parse the entire stream and return all operators.
   *
   * Operands accumulate until an operator keyword is read, at which point
   * they are attached to that operator. `BI` is special-cased: the inline
   * image that follows becomes its single operand.
   *
   * @returns Array of `{ operator, operands }` records in stream order.
   */
  parse() {
    const result = [];
    const operandStack = [];
    while (true) {
      const token = this.nextToken();
      if (token.type === TokenType.EOF) break;
      switch (token.type) {
        case TokenType.Number:
        case TokenType.String:
        case TokenType.HexString:
        case TokenType.Bool:
        case TokenType.Null:
        case TokenType.Name:
          operandStack.push(token.value);
          break;
        case TokenType.ArrayStart:
          operandStack.push(this.parseArray());
          break;
        case TokenType.Operator: {
          const op = token.value;
          if (op === "BI") {
            result.push({
              operator: "BI",
              operands: [this.parseInlineImage()]
            });
          } else {
            result.push({
              operator: op,
              // splice() empties the stack and hands its contents over.
              operands: operandStack.splice(0, operandStack.length)
            });
          }
          break;
        }
        // A stray `]` outside an array is ignored.
        case TokenType.ArrayEnd: break;
        case TokenType.InlineImage: break;
      }
    }
    return result;
  }
  /**
   * Parse a PDF array `[…]`. Called after the `[` token has been consumed.
   * Nested arrays are parsed recursively; operator keywords inside the
   * array are ignored. An unterminated array ends at EOF.
   */
  parseArray() {
    const items = [];
    while (true) {
      const token = this.nextToken();
      if (token.type === TokenType.EOF) break;
      if (token.type === TokenType.ArrayEnd) break;
      switch (token.type) {
        case TokenType.Number:
        case TokenType.String:
        case TokenType.HexString:
        case TokenType.Bool:
        case TokenType.Null:
        case TokenType.Name:
          items.push(token.value);
          break;
        case TokenType.ArrayStart:
          items.push(this.parseArray());
          break;
        default: break;
      }
    }
    return items;
  }
  /**
   * Parse an inline image.
   *
   * After `BI` has been read, we expect key-value pairs (name + value)
   * until `ID`, then raw binary data until we find `EI` preceded by
   * whitespace and followed by whitespace or end of data.
   *
   * @returns `{ dict, data }` where `dict` maps keys to parsed values and
   *   `data` is a copy of the raw image bytes.
   */
  parseInlineImage() {
    const dict = {};
    // Array values such as `/D [1 0]` must be consumed as a unit; taking
    // only the `[` token would store null and misread the elements as keys.
    const readValue = () => {
      const valueToken = this.nextToken();
      return valueToken.type === TokenType.ArrayStart ? this.parseArray() : valueToken.value;
    };
    while (true) {
      this.skipWhitespace();
      if (this.pos >= this.data.length) break;
      if (this.peekKeyword("ID")) {
        this.pos += 2;
        break;
      }
      const keyToken = this.nextToken();
      if (keyToken.type === TokenType.Operator) {
        const kw = keyToken.value;
        if (kw === "ID") break;
        dict[kw] = readValue();
        continue;
      }
      if (keyToken.type === TokenType.Name) {
        dict[keyToken.value.value] = readValue();
      } else if (keyToken.type === TokenType.EOF) break;
    }
    // Exactly one whitespace character (or a CR LF pair) separates `ID`
    // from the image data.
    if (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (ch === 32 || ch === 10 || ch === 13 || ch === 9) {
        this.pos++;
        if (ch === 13 && this.pos < this.data.length && this.data[this.pos] === 10) this.pos++;
      }
    }
    const dataStart = this.pos;
    let dataEnd = this.pos;
    let searchFrom = this.pos;
    while (searchFrom < this.data.length) {
      const eIdx = this.data.indexOf(69, searchFrom);
      if (eIdx === -1 || eIdx + 1 >= this.data.length) {
        // No terminator found: consume the rest of the stream.
        this.pos = this.data.length;
        break;
      }
      if (eIdx > dataStart && this.isWhitespace(this.data[eIdx - 1]) && this.data[eIdx + 1] === 73) {
        const afterEI = eIdx + 2;
        if (afterEI >= this.data.length || this.isWhitespace(this.data[afterEI])) {
          // The whitespace byte before `EI` is not part of the image data.
          dataEnd = eIdx - 1;
          this.pos = afterEI;
          break;
        }
      }
      searchFrom = eIdx + 1;
    }
    return {
      dict,
      data: this.data.slice(dataStart, dataEnd)
    };
  }
  /**
   * Peek ahead to see if the next characters form a given keyword
   * followed by whitespace, a delimiter, or end of data. Does not
   * advance the read position.
   */
  peekKeyword(keyword) {
    for (let i = 0; i < keyword.length; i++) {
      if (this.pos + i >= this.data.length) return false;
      if (this.data[this.pos + i] !== keyword.charCodeAt(i)) return false;
    }
    const afterPos = this.pos + keyword.length;
    if (afterPos >= this.data.length) return true;
    const after = this.data[afterPos];
    return this.isWhitespace(after) || this.isDelimiter(after);
  }
  /**
   * Read and return the next token from the stream.
   *
   * Dictionary markers (`<<`, `>>`) and stray delimiter bytes are skipped.
   * Skipping is done in a loop rather than by recursion so that a long run
   * of junk bytes cannot overflow the call stack.
   */
  nextToken() {
    while (true) {
      this.skipWhitespaceAndComments();
      if (this.pos >= this.data.length) return {
        type: TokenType.EOF,
        value: null
      };
      const ch = this.data[this.pos];
      const next = this.pos + 1 < this.data.length ? this.data[this.pos + 1] : -1;
      if (ch === 40) return this.readLiteralString();
      if (ch === 60) {
        if (next === 60) {
          this.pos += 2;
          continue;
        }
        return this.readHexString();
      }
      if (ch === 62 && next === 62) {
        this.pos += 2;
        continue;
      }
      if (ch === 91) {
        this.pos++;
        return {
          type: TokenType.ArrayStart,
          value: null
        };
      }
      if (ch === 93) {
        this.pos++;
        return {
          type: TokenType.ArrayEnd,
          value: null
        };
      }
      if (ch === 47) return this.readName();
      if (this.isNumberStart(ch)) return this.readNumber();
      if (this.isRegularChar(ch)) return this.readKeyword();
      // Unhandled delimiter such as `)`, a lone `>`, `{` or `}`: drop it.
      this.pos++;
    }
  }
  /**
   * Read a literal string `(…)`, handling nested parentheses and escapes.
   * Each byte becomes one character of the returned string.
   */
  readLiteralString() {
    // Escape byte -> replacement character for the single-character escapes.
    const simpleEscapes = {
      110: "\n",
      114: "\r",
      116: "\t",
      98: "\b",
      102: "\f",
      40: "(",
      41: ")",
      92: "\\"
    };
    this.pos++;
    const parts = [];
    let depth = 1;
    while (this.pos < this.data.length && depth > 0) {
      const ch = this.data[this.pos];
      if (ch === 92) {
        this.pos++;
        if (this.pos >= this.data.length) break;
        const esc = this.data[this.pos];
        const simple = simpleEscapes[esc];
        if (simple !== undefined) {
          parts.push(simple);
          this.pos++;
        } else if (esc === 10) {
          // Backslash + LF is a line continuation and yields nothing.
          this.pos++;
        } else if (esc === 13) {
          // Backslash + CR (optionally CR LF) is also a line continuation.
          this.pos++;
          if (this.pos < this.data.length && this.data[this.pos] === 10) this.pos++;
        } else if (esc >= 48 && esc <= 55) {
          // Octal escape: one to three octal digits, truncated to a byte.
          let octal = 0;
          let digits = 0;
          while (digits < 3 && this.pos < this.data.length) {
            const d = this.data[this.pos];
            if (d < 48 || d > 55) break;
            octal = octal * 8 + (d - 48);
            this.pos++;
            digits++;
          }
          parts.push(String.fromCharCode(octal & 255));
        } else {
          // Unknown escape: the backslash is dropped, the character kept.
          parts.push(String.fromCharCode(esc));
          this.pos++;
        }
      } else if (ch === 40) {
        depth++;
        parts.push("(");
        this.pos++;
      } else if (ch === 41) {
        depth--;
        // The closing parenthesis of the string itself is not content.
        if (depth > 0) parts.push(")");
        this.pos++;
      } else {
        parts.push(String.fromCharCode(ch));
        this.pos++;
      }
    }
    return {
      type: TokenType.String,
      value: parts.join("")
    };
  }
  /**
   * Read a hex string `<…>`. Whitespace and any other non-hex bytes are
   * ignored; an odd trailing digit is treated as the high nibble of a
   * final byte. The string is assembled from parts so its length is not
   * limited by the engine's maximum argument count.
   */
  readHexString() {
    this.pos++;
    const parts = [];
    let hi = -1;
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos++];
      if (ch === 62) break;
      const v = hexVal[ch];
      // Whitespace bytes are not hex digits either, so one check covers both.
      if (v === -1) continue;
      if (hi === -1) hi = v;
      else {
        parts.push(String.fromCharCode(hi << 4 | v));
        hi = -1;
      }
    }
    if (hi !== -1) parts.push(String.fromCharCode(hi << 4));
    return {
      type: TokenType.HexString,
      value: parts.join("")
    };
  }
  /**
   * Read a PDF name `/…`, decoding `#xx` hex escapes. The value passed to
   * `PdfName.of` keeps the leading slash.
   */
  readName() {
    this.pos++;
    const parts = ["/"];
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (this.isWhitespace(ch) || this.isDelimiter(ch)) break;
      if (ch === 35 && this.pos + 2 < this.data.length) {
        const hi = hexVal[this.data[this.pos + 1]];
        const lo = hexVal[this.data[this.pos + 2]];
        if (hi !== -1 && lo !== -1) {
          parts.push(String.fromCharCode(hi << 4 | lo));
          this.pos += 3;
          continue;
        }
      }
      parts.push(String.fromCharCode(ch));
      this.pos++;
    }
    return {
      type: TokenType.Name,
      value: PdfName.of(parts.join(""))
    };
  }
  /**
   * Read a numeric value (integer or real). Text that does not parse as a
   * number (for example a lone sign or dot) yields 0.
   */
  readNumber() {
    const start = this.pos;
    let hasDecimal = false;
    if (this.data[this.pos] === 43 || this.data[this.pos] === 45) this.pos++;
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (ch === 46) {
        // A second decimal point ends the number.
        if (hasDecimal) break;
        hasDecimal = true;
        this.pos++;
      } else if (ch >= 48 && ch <= 57) this.pos++;
      else break;
    }
    const value = parseFloat(this.decodeAscii(start, this.pos));
    return {
      type: TokenType.Number,
      value: Number.isNaN(value) ? 0 : value
    };
  }
  /**
   * Read a keyword — an operator name or one of the special keywords
   * `true`, `false`, `null`.
   */
  readKeyword() {
    const start = this.pos;
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (this.isWhitespace(ch) || this.isDelimiter(ch)) break;
      this.pos++;
    }
    const word = this.decodeAscii(start, this.pos);
    if (word === "true") return {
      type: TokenType.Bool,
      value: true
    };
    if (word === "false") return {
      type: TokenType.Bool,
      value: false
    };
    if (word === "null") return {
      type: TokenType.Null,
      value: null
    };
    return {
      type: TokenType.Operator,
      value: word
    };
  }
  /** PDF whitespace characters. */
  isWhitespace(ch) {
    return ch === 0 || ch === 9 || ch === 10 || ch === 12 || ch === 13 || ch === 32;
  }
  /** PDF delimiter characters. */
  isDelimiter(ch) {
    return ch === 40 || ch === 41 || ch === 60 || ch === 62 || ch === 91 || ch === 93 || ch === 123 || ch === 125 || ch === 47 || ch === 37;
  }
  /** Whether a character can begin a number. */
  isNumberStart(ch) {
    return ch >= 48 && ch <= 57 || ch === 43 || ch === 45 || ch === 46;
  }
  /** Whether a character is a regular (non-whitespace, non-delimiter) character. */
  isRegularChar(ch) {
    return !this.isWhitespace(ch) && !this.isDelimiter(ch);
  }
  /** Skip whitespace. */
  skipWhitespace() {
    while (this.pos < this.data.length && this.isWhitespace(this.data[this.pos])) this.pos++;
  }
  /** Skip whitespace and `%` comments (a comment runs to the next CR or LF). */
  skipWhitespaceAndComments() {
    while (this.pos < this.data.length) {
      const ch = this.data[this.pos];
      if (this.isWhitespace(ch)) this.pos++;
      else if (ch === 37) {
        this.pos++;
        while (this.pos < this.data.length) {
          const c = this.data[this.pos];
          if (c === 10 || c === 13) break;
          this.pos++;
        }
      } else break;
    }
  }
  /**
   * Decode a slice of the data as ASCII text, one character per byte.
   * Long slices are converted in chunks because spreading a very large
   * array through `Function.prototype.apply` throws a RangeError.
   */
  decodeAscii(start, end) {
    const CHUNK = 8192;
    if (end - start <= CHUNK) return String.fromCharCode.apply(null, this.data.subarray(start, end));
    let text = "";
    for (let from = start; from < end; from += CHUNK) {
      text += String.fromCharCode.apply(null, this.data.subarray(from, Math.min(from + CHUNK, end)));
    }
    return text;
  }
};
1263
+
1264
+ //#endregion
1265
+ //#region src/assets/image/dpiAnalyze.ts
1266
/**
 * Compute the effective resolution of an image from its pixel size and
 * the size at which it is displayed.
 *
 * PDF user space has 72 points per inch, so for each axis:
 * ```
 * DPI = imagePixels / (displayPoints / 72)
 * ```
 * An axis whose display size is not greater than zero is reported as
 * `Infinity`.
 *
 * @param imageWidth - Image width in pixels.
 * @param imageHeight - Image height in pixels.
 * @param displayWidth - Display width in PDF points (1/72 inch).
 * @param displayHeight - Display height in PDF points (1/72 inch).
 * @returns `{ xDpi, yDpi, effectiveDpi }`, where `effectiveDpi` is the
 *   smaller of the two axis values.
 *
 * @example
 * ```ts
 * import { computeImageDpi } from 'modern-pdf-lib';
 *
 * // A 3000×2000 image displayed at 4.17×2.78 inches (300×200 points)
 * const dpi = computeImageDpi(3000, 2000, 300, 200);
 * console.log(dpi.effectiveDpi); // 720
 * ```
 */
function computeImageDpi(imageWidth, imageHeight, displayWidth, displayHeight) {
  const POINTS_PER_INCH = 72;
  const axisDpi = (pixels, points) => {
    if (points > 0) return pixels / points * POINTS_PER_INCH;
    return Infinity;
  };
  const horizontal = axisDpi(imageWidth, displayWidth);
  const vertical = axisDpi(imageHeight, displayHeight);
  return {
    xDpi: horizontal,
    yDpi: vertical,
    effectiveDpi: Math.min(horizontal, vertical)
  };
}
1299
/**
 * Work out the pixel dimensions an image should be resampled to so that
 * its effective DPI at the given display size does not exceed `maxDpi`.
 *
 * Both axes are scaled by the same factor (`maxDpi / effectiveDpi`) and
 * each result is rounded and clamped to at least one pixel.
 *
 * @param imageWidth - Current image width in pixels.
 * @param imageHeight - Current image height in pixels.
 * @param displayWidth - Display width in PDF points.
 * @param displayHeight - Display height in PDF points.
 * @param maxDpi - Maximum allowed DPI.
 * @returns `{ width, height, downscaled }`. The original dimensions are
 *   returned with `downscaled: false` when the effective DPI is already
 *   within the limit or is not finite.
 */
function computeTargetDimensions(imageWidth, imageHeight, displayWidth, displayHeight, maxDpi) {
  const { effectiveDpi } = computeImageDpi(imageWidth, imageHeight, displayWidth, displayHeight);
  const withinLimit = effectiveDpi <= maxDpi;
  if (withinLimit || !isFinite(effectiveDpi)) {
    return {
      width: imageWidth,
      height: imageHeight,
      downscaled: false
    };
  }
  const factor = maxDpi / effectiveDpi;
  const shrink = (pixels) => Math.max(1, Math.round(pixels * factor));
  return {
    width: shrink(imageWidth),
    height: shrink(imageHeight),
    downscaled: true
  };
}
1325
+
1326
+ //#endregion
1327
+ //#region src/assets/image/compressionAnalysis.ts
1328
/**
 * Determine the human-readable format name from filter names.
 *
 * The whole filter chain is scanned in priority order: image codecs first,
 * then general-purpose compression, then ASCII transport encodings. A
 * chain such as `[ASCII85Decode, DCTDecode]` is therefore reported as
 * "JPEG" rather than by whichever filter happens to come first.
 *
 * @param filters - Filter names of the image stream.
 * @returns "Raw" for an empty chain, the label of the highest-priority
 *   recognized filter, otherwise the first filter name (or "Unknown").
 * @internal
 */
function formatFromFilters(filters) {
  if (filters.length === 0) return "Raw";
  const labelsByPriority = [
    ["DCTDecode", "JPEG"],
    ["JPXDecode", "JPEG2000"],
    ["CCITTFaxDecode", "CCITT"],
    ["JBIG2Decode", "JBIG2"],
    ["FlateDecode", "FlateDecode"],
    ["LZWDecode", "LZW"],
    ["RunLengthDecode", "RunLength"],
    ["ASCIIHexDecode", "ASCIIHex"],
    ["ASCII85Decode", "ASCII85"]
  ];
  for (const [filterName, label] of labelsByPriority) {
    if (filters.includes(filterName)) return label;
  }
  return filters[0] ?? "Unknown";
}
1347
/**
 * Heuristic JPEG size estimate, used when the WASM encoder is unavailable.
 *
 * The raw pixel data size (`width * height * channels` bytes) is scaled by
 * 12.5% — the assumed compression ratio at quality 80 for photographic
 * content — and then linearly by `quality / 80`. The result is never
 * smaller than 200 bytes.
 *
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @param channels - Number of color channels.
 * @param quality - JPEG quality setting.
 * @returns Estimated encoded size in bytes.
 * @internal
 */
function estimateJpegSizeHeuristic(width, height, channels, quality) {
  const MIN_ESTIMATE = 200;
  const rawBytes = width * height * channels;
  const scaledBytes = rawBytes * 0.125 * (quality / 80);
  return Math.max(MIN_ESTIMATE, Math.round(scaledBytes));
}
1363
/**
 * Pick the recommended action for a single image.
 *
 * Checks are applied in order and the first match wins:
 * 1. "grayscale" — the pixels are grayscale but the color space is
 *    neither `DeviceGray` nor `CalGray`.
 * 2. "downscale" — a finite effective DPI is known and exceeds `maxDpi`.
 * 3. "recompress" — estimated savings are above 10%.
 * 4. "keep" — otherwise.
 *
 * @internal
 */
function determineRecommendation(savingsPercent, isGrayscale, colorSpace, effectiveDpi, maxDpi) {
  const storedAsGray = colorSpace === "DeviceGray" || colorSpace === "CalGray";
  if (isGrayscale && !storedAsGray) {
    return "grayscale";
  }
  const dpiIsKnown = effectiveDpi !== undefined && isFinite(effectiveDpi);
  if (dpiIsKnown && effectiveDpi > maxDpi) {
    return "downscale";
  }
  return savingsPercent > 10 ? "recompress" : "keep";
}
1373
/**
 * Analyze all images in a PDF and report potential savings without
 * modifying the document.
 *
 * For each image XObject with `bitsPerComponent === 8` and 1–4 channels,
 * the function estimates the JPEG-encoded size — using the WASM encoder
 * when available, or a heuristic fallback otherwise. Images that fall
 * outside those limits, or whose stream cannot be decoded, are reported
 * unchanged with the recommendation "keep".
 *
 * @param doc - A parsed `PdfDocument`.
 * @param options - Optional quality and maxDpi settings (defaults: 80 and 150).
 * @returns An `AnalysisReport` with per-image and aggregate statistics.
 *
 * @example
 * ```ts
 * import { loadPdf, analyzeImages } from 'modern-pdf-lib';
 *
 * const doc = await loadPdf(pdfBytes);
 * const report = analyzeImages(doc, { quality: 75, maxDpi: 150 });
 *
 * console.log(`Total savings: ${report.totalSavingsPercent.toFixed(1)}%`);
 * for (const img of report.images) {
 *   console.log(`  ${img.name}: ${img.recommendation} (${img.savingsPercent.toFixed(1)}%)`);
 * }
 * ```
 */
function analyzeImages(doc, options) {
  const quality = options?.quality ?? 80;
  const maxDpi = options?.maxDpi ?? 150;
  const allImages = extractImages(doc);
  const analyses = [];
  const wasmReady = isJpegWasmReady();
  // Fields copied verbatim from the extracted image info into every entry.
  const describe = (info) => ({
    name: info.name,
    pageIndex: info.pageIndex,
    width: info.width,
    height: info.height,
    currentSize: info.compressedSize,
    currentFormat: formatFromFilters(info.filters),
    colorSpace: info.colorSpace
  });
  // Entry for an image that cannot be analyzed: no change, no savings.
  const unanalyzed = (info) => ({
    ...describe(info),
    estimatedJpegSize: info.compressedSize,
    estimatedSavings: 0,
    savingsPercent: 0,
    isGrayscale: false,
    effectiveDpi: void 0,
    recommendation: "keep"
  });
  for (const info of allImages) {
    if (info.bitsPerComponent !== 8 || info.channels < 1 || info.channels > 4) {
      analyses.push(unanalyzed(info));
      continue;
    }
    let pixels;
    try {
      pixels = decodeImageStream(info);
    } catch {
      // Best effort: an undecodable image is reported as-is, not as a failure.
      analyses.push(unanalyzed(info));
      continue;
    }
    // Pixel-level checks are only meaningful when the decoded buffer
    // covers the full image.
    const hasAllPixels = pixels.length >= info.width * info.height * info.channels;
    let grayscale = false;
    if ((info.channels === 3 || info.channels === 4) && hasAllPixels) grayscale = isGrayscaleImage(pixels, info.width, info.height, info.channels);
    else if (info.channels === 1) grayscale = true;
    const encoded = wasmReady && hasAllPixels ? encodeJpegWasm(pixels, info.width, info.height, info.channels, quality) : void 0;
    const estimatedJpegSize = encoded ? encoded.length : estimateJpegSizeHeuristic(info.width, info.height, info.channels, quality);
    const currentSize = info.compressedSize;
    const estimatedSavings = Math.max(0, currentSize - estimatedJpegSize);
    const savingsPercent = currentSize > 0 ? estimatedSavings / currentSize * 100 : 0;
    // NOTE(review): the pixel dimensions are passed as the display size, so
    // this evaluates to 72 for any image with positive dimensions. Confirm
    // whether the on-page display size is available and should be used.
    const effectiveDpi = computeImageDpi(info.width, info.height, info.width, info.height).effectiveDpi;
    const recommendation = determineRecommendation(savingsPercent, grayscale, info.colorSpace, effectiveDpi, maxDpi);
    analyses.push({
      ...describe(info),
      estimatedJpegSize,
      estimatedSavings,
      savingsPercent,
      isGrayscale: grayscale,
      effectiveDpi,
      recommendation
    });
  }
  const totalCurrentSize = analyses.reduce((sum, a) => sum + a.currentSize, 0);
  const totalEstimatedSize = analyses.reduce((sum, a) => sum + a.estimatedJpegSize, 0);
  const totalSavings = Math.max(0, totalCurrentSize - totalEstimatedSize);
  return {
    images: analyses,
    totalCurrentSize,
    totalEstimatedSize,
    totalSavings,
    totalSavingsPercent: totalCurrentSize > 0 ? totalSavings / totalCurrentSize * 100 : 0
  };
}
1487
+
1488
+ //#endregion
1489
+ export { extractText as a, parseContentStream as i, computeImageDpi as n, extractTextWithPositions as o, computeTargetDimensions as r, analyzeImages as t };
1490
+ //# sourceMappingURL=compressionAnalysis-Bw2alOxt.mjs.map