@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,715 @@
1
+ /**
2
+ * PDF content stream interpreter for text extraction.
3
+ *
4
+ * Implements a full PDF graphics state machine that processes content stream
5
+ * operators to extract positioned text fragments. These fragments are then
6
+ * assembled into readable text by the text reconstruction module.
7
+ *
8
+ * Supported operator categories:
9
+ * - Text state: Tf, Tc, Tw, Tz, TL, Ts, Tr
10
+ * - Text positioning: Td, TD, Tm, T*
11
+ * - Text showing: Tj, TJ, ', "
12
+ * - Text objects: BT, ET
13
+ * - Graphics state: q, Q, cm, gs, i, M, ri, W, W*
14
+ * - Color: CS, cs, SC, sc, SCN, scn
15
+ * - Marked content: BDC, BMC, EMC, MP, DP
16
+ * - Type3 glyph: d0, d1
17
+ * - Shading: sh
18
+ * - Inline images: BI/ID/EI
19
+ * - XObject invocation: Do (for form XObjects containing text)
20
+ *
21
+ * @see PDF Reference 1.7, Chapter 5 - Text
22
+ * @see PDF Reference 1.7, Chapter 4 - Graphics
23
+ */
24
+ import { PdfTokenizer, TokenType } from "./pdf-tokenizer.js";
25
+ import { resolveFont, decodeText, getCharWidth } from "./font-decoder.js";
26
+ import { isPdfRef, isPdfArray, dictGetName, dictGetArray } from "./pdf-parser.js";
27
+ // =============================================================================
28
+ // Constants
29
+ // =============================================================================
30
+ /** Maximum Form XObject nesting depth: a `Do` invoked at or beyond this depth is ignored, which stops self-referencing forms from recursing forever. */
31
+ const MAX_FORM_DEPTH = 10;
32
+ /** Shared TextEncoder (UTF-8), used when a text-showing operand arrives as a JS string rather than raw bytes. */
33
+ const _textEncoder = new TextEncoder();
34
+ // =============================================================================
35
+ // RTL Detection
36
+ // =============================================================================
37
/**
 * Report whether a Unicode code point belongs to one of the right-to-left
 * script blocks this module recognises.
 *
 * @param {number} codePoint - Unicode code point to classify.
 * @returns {boolean} `true` when the code point lies inside any listed block.
 */
function isRtlChar(codePoint) {
    // Inclusive [first, last] bounds of each recognised block.
    const rtlBlocks = [
        [0x0600, 0x06ff], // Arabic
        [0x0750, 0x077f], // Arabic Supplement
        [0x08a0, 0x08ff], // Arabic Extended-A
        [0xfb50, 0xfdff], // Arabic Presentation Forms-A
        [0xfe70, 0xfeff], // Arabic Presentation Forms-B
        [0x0590, 0x05ff], // Hebrew
        [0xfb1d, 0xfb4f], // Hebrew Presentation Forms
        [0x0700, 0x074f], // Syriac
        [0x0780, 0x07bf], // Thaana
        [0x07c0, 0x07ff] // NKo
    ];
    return rtlBlocks.some(([first, last]) => codePoint >= first && codePoint <= last);
}
63
/**
 * Decide whether a string starts with a right-to-left character.
 * Only the first code point is inspected; the rest of the string is ignored.
 *
 * @param {string} text - Decoded text of a fragment.
 * @returns {boolean} `false` for an empty string, otherwise the RTL verdict
 *   for the leading code point.
 */
function detectRtl(text) {
    if (text.length === 0) {
        return false;
    }
    const leading = text.codePointAt(0);
    if (leading === undefined) {
        return false;
    }
    return isRtlChar(leading);
}
73
+ // =============================================================================
74
+ // Content Stream Interpreter
75
+ // =============================================================================
76
/**
 * Run every content stream of a page through a single interpreter and return
 * the text fragments it produced.
 *
 * @param pageDict - Page dictionary; its "Contents" entry is read via
 *   getContentStreams.
 * @param doc - Document object used to resolve resources and streams.
 * @returns {Array} Fragments in the order the streams emitted them; an empty
 *   array when the page has no readable content stream.
 */
export function extractTextFromPage(pageDict, doc) {
    // Resources and fonts are resolved first, exactly once per page.
    const pageResources = doc.resolvePageResources(pageDict);
    const pageFonts = resolveFontResources(pageResources, doc);
    const streams = getContentStreams(pageDict, doc);
    if (streams.length === 0) {
        return [];
    }
    const collected = [];
    // One interpreter instance is shared so graphics/text state carries over
    // from one stream of the page to the next.
    const machine = new ContentInterpreter(pageFonts, doc, pageResources);
    for (const bytes of streams) {
        machine.process(bytes, collected);
    }
    return collected;
}
95
/**
 * Build a name → resolved-font map from the "Font" entry of a resource
 * dictionary.
 *
 * @param resources - Resource dictionary exposing `get(key)`.
 * @param doc - Document object providing `derefDict`.
 * @returns {Map} Map keyed by font resource name. Entries that cannot be
 *   dereferenced, or whose resolution throws, are left out.
 */
function resolveFontResources(resources, doc) {
    const resolved = new Map();
    const rawFontEntry = resources.get("Font");
    const fontTable = rawFontEntry ? doc.derefDict(rawFontEntry) : null;
    if (!fontTable) {
        return resolved;
    }
    for (const [resourceName, fontRef] of fontTable) {
        const fontObject = doc.derefDict(fontRef);
        if (!fontObject) {
            continue;
        }
        try {
            resolved.set(resourceName, resolveFont(fontObject, doc));
        }
        catch {
            // Best effort: a font that fails to resolve is skipped so the
            // remaining fonts stay usable.
        }
    }
    return resolved;
}
121
/**
 * Collect the decoded content stream data of a page.
 *
 * "Contents" may be a reference to one stream, a reference to an array of
 * streams, or a direct array of streams. Anything else yields no data.
 *
 * @param pageDict - Page dictionary exposing `get(key)`.
 * @param doc - Document object used to dereference and decode streams.
 * @returns {Array} Decoded stream data, one entry per readable stream.
 */
function getContentStreams(pageDict, doc) {
    const contents = pageDict.get("Contents");
    if (!contents) {
        return [];
    }
    if (!isPdfRef(contents)) {
        // A direct stream in the page dict is deliberately unsupported: there
        // would be no object/generation number to hand to getStreamData (the
        // original author notes these are needed for decryption).
        return isPdfArray(contents) ? resolveStreamArray(contents, doc) : [];
    }
    // Indirect reference: try it as a single stream first.
    const located = doc.derefStreamWithObjNum(contents);
    if (located) {
        return [doc.getStreamData(located.stream, located.objNum, located.gen)];
    }
    // Otherwise the reference may point at an array of streams.
    const target = doc.deref(contents);
    return isPdfArray(target) ? resolveStreamArray(target, doc) : [];
}
149
/**
 * Decode every entry of a stream array that dereferences to a stream.
 *
 * @param arr - Iterable of stream references.
 * @param doc - Document object providing `derefStreamWithObjNum` and
 *   `getStreamData`.
 * @returns {Array} Decoded data in array order; entries that do not resolve
 *   to a stream are skipped.
 */
function resolveStreamArray(arr, doc) {
    const decoded = [];
    for (const entry of arr) {
        const located = doc.derefStreamWithObjNum(entry);
        if (!located) {
            continue;
        }
        const { stream, objNum, gen } = located;
        decoded.push(doc.getStreamData(stream, objNum, gen));
    }
    return decoded;
}
159
+ // =============================================================================
160
+ // Content Interpreter
161
+ // =============================================================================
162
+ class ContentInterpreter {
163
+ constructor(fonts, doc, resources) {
164
+ // Graphics state
165
+ this.stateStack = [];
166
+ this.ctm = [1, 0, 0, 1, 0, 0];
167
+ // Text state
168
+ this.textState = {
169
+ charSpacing: 0,
170
+ wordSpacing: 0,
171
+ horizontalScaling: 100,
172
+ leading: 0,
173
+ font: null,
174
+ fontSize: 0,
175
+ renderMode: 0,
176
+ rise: 0
177
+ };
178
+ // Text object state
179
+ this.textMatrix = [1, 0, 0, 1, 0, 0];
180
+ this.lineMatrix = [1, 0, 0, 1, 0, 0];
181
+ this.inTextObject = false;
182
+ // Form XObject recursion depth
183
+ this.formDepth = 0;
184
+ this.fonts = fonts;
185
+ this.doc = doc;
186
+ this.resources = resources;
187
+ }
188
+ process(streamData, fragments) {
189
+ const tokenizer = new PdfTokenizer(streamData);
190
+ const operands = [];
191
+ while (true) {
192
+ const token = tokenizer.next();
193
+ if (token.type === TokenType.EOF) {
194
+ break;
195
+ }
196
+ if (token.type === TokenType.Keyword) {
197
+ const keyword = token.strValue;
198
+ // Handle inline image: BI ... ID <data> EI
199
+ if (keyword === "BI") {
200
+ this.skipInlineImage(tokenizer);
201
+ operands.length = 0;
202
+ }
203
+ else {
204
+ this.executeOperator(keyword, operands, fragments);
205
+ operands.length = 0;
206
+ }
207
+ }
208
+ else if (token.type === TokenType.ArrayBegin) {
209
+ // Parse array inline (for TJ operator)
210
+ operands.push(this.parseInlineArray(tokenizer));
211
+ }
212
+ else {
213
+ operands.push(tokenToOperand(token));
214
+ }
215
+ }
216
+ }
217
+ /**
218
+ * Skip an inline image in the content stream.
219
+ *
220
+ * Inline images have the form: BI <key-value pairs> ID <image data> EI
221
+ * We need to parse past the key-value pairs (which the tokenizer handles),
222
+ * skip the single whitespace byte after ID, then scan for the EI marker.
223
+ */
224
+ skipInlineImage(tokenizer) {
225
+ // Phase 1: Read key-value pairs until we encounter the ID keyword
226
+ while (true) {
227
+ const tok = tokenizer.next();
228
+ if (tok.type === TokenType.EOF) {
229
+ return;
230
+ }
231
+ if (tok.type === TokenType.Keyword && tok.strValue === "ID") {
232
+ break;
233
+ }
234
+ // Just consume the token (key-value pairs) — we don't need them
235
+ }
236
+ // Phase 2: Skip one whitespace byte after ID (per PDF spec)
237
+ const data = tokenizer.bytes;
238
+ let pos = tokenizer.position;
239
+ if (pos < data.length) {
240
+ // The byte immediately after ID should be a single whitespace byte
241
+ pos++;
242
+ }
243
+ // Phase 3: Scan forward for EI preceded by whitespace
244
+ // EI is 0x45 0x49, and must be preceded by whitespace and followed by
245
+ // whitespace or EOF to distinguish from image data containing "EI"
246
+ while (pos + 1 < data.length) {
247
+ if (data[pos] === 0x45 &&
248
+ data[pos + 1] === 0x49 &&
249
+ pos > 0 &&
250
+ isWhitespaceByte(data[pos - 1]) &&
251
+ (pos + 2 >= data.length ||
252
+ isWhitespaceByte(data[pos + 2]) ||
253
+ isDelimiterByte(data[pos + 2]))) {
254
+ // Found EI — advance past it
255
+ tokenizer.position = pos + 2;
256
+ return;
257
+ }
258
+ pos++;
259
+ }
260
+ // If we didn't find EI, just set position to end
261
+ tokenizer.position = data.length;
262
+ }
263
+ parseInlineArray(tokenizer) {
264
+ const arr = [];
265
+ while (true) {
266
+ const tok = tokenizer.next();
267
+ if (tok.type === TokenType.ArrayEnd || tok.type === TokenType.EOF) {
268
+ break;
269
+ }
270
+ arr.push(tokenToOperand(tok));
271
+ }
272
+ return arr;
273
+ }
274
+ executeOperator(op, operands, fragments) {
275
+ switch (op) {
276
+ // ---- Graphics State ----
277
+ case "q":
278
+ this.saveState();
279
+ break;
280
+ case "Q":
281
+ this.restoreState();
282
+ break;
283
+ case "cm":
284
+ if (operands.length >= 6) {
285
+ this.concatMatrix(nums(operands, 6));
286
+ }
287
+ break;
288
+ // ---- Graphics State (no-op for text extraction) ----
289
+ case "gs": // ExtGState
290
+ case "i": // Flatness
291
+ case "M": // Miter limit
292
+ case "ri": // Rendering intent
293
+ case "sh": // Shading
294
+ // Consume operands, no action needed for text extraction
295
+ break;
296
+ // ---- Clipping (no-op) ----
297
+ case "W": // Clipping (non-zero winding)
298
+ case "W*": // Clipping (even-odd)
299
+ break;
300
+ // ---- Color Operators (no-op for text extraction) ----
301
+ case "CS": // Set color space (stroking)
302
+ case "cs": // Set color space (non-stroking)
303
+ case "SC": // Set color (stroking)
304
+ case "sc": // Set color (non-stroking)
305
+ case "SCN": // Set color (stroking, extended)
306
+ case "scn": // Set color (non-stroking, extended)
307
+ case "G": // Set gray (stroking)
308
+ case "g": // Set gray (non-stroking)
309
+ case "RG": // Set RGB (stroking)
310
+ case "rg": // Set RGB (non-stroking)
311
+ case "K": // Set CMYK (stroking)
312
+ case "k": // Set CMYK (non-stroking)
313
+ // Consume operands, no action needed
314
+ break;
315
+ // ---- Marked Content (no-op for text extraction) ----
316
+ case "BDC": // Begin marked content with properties
317
+ case "BMC": // Begin marked content
318
+ case "EMC": // End marked content
319
+ case "MP": // Marked content point
320
+ case "DP": // Marked content point with properties
321
+ break;
322
+ // ---- Type3 Font Glyph Operators (no-op) ----
323
+ case "d0": // Set glyph width
324
+ case "d1": // Set glyph width and bounding box
325
+ break;
326
+ // ---- Path Construction/Painting (no-op for text extraction) ----
327
+ case "m": // moveto
328
+ case "l": // lineto
329
+ case "c": // curveto (cubic Bézier)
330
+ case "v": // curveto (initial point replicated)
331
+ case "y": // curveto (final point replicated)
332
+ case "h": // closepath
333
+ case "re": // rectangle
334
+ case "S": // stroke
335
+ case "s": // close and stroke
336
+ case "f": // fill (non-zero winding)
337
+ case "F": // fill (non-zero winding, obsolete)
338
+ case "f*": // fill (even-odd)
339
+ case "B": // fill and stroke (non-zero)
340
+ case "B*": // fill and stroke (even-odd)
341
+ case "b": // close, fill and stroke (non-zero)
342
+ case "b*": // close, fill and stroke (even-odd)
343
+ case "n": // end path without fill/stroke
344
+ case "j": // line join style
345
+ case "J": // line cap style
346
+ case "d": // dash pattern
347
+ case "w": // line width
348
+ break;
349
+ // ---- Text State ----
350
+ case "Tc":
351
+ this.textState.charSpacing = num(operands, 0);
352
+ break;
353
+ case "Tw":
354
+ this.textState.wordSpacing = num(operands, 0);
355
+ break;
356
+ case "Tz":
357
+ this.textState.horizontalScaling = num(operands, 0);
358
+ break;
359
+ case "TL":
360
+ this.textState.leading = num(operands, 0);
361
+ break;
362
+ case "Tf":
363
+ this.setFont(operands);
364
+ break;
365
+ case "Tr":
366
+ this.textState.renderMode = num(operands, 0);
367
+ break;
368
+ case "Ts":
369
+ this.textState.rise = num(operands, 0);
370
+ break;
371
+ // ---- Text Objects ----
372
+ case "BT":
373
+ this.beginText();
374
+ break;
375
+ case "ET":
376
+ this.inTextObject = false;
377
+ break;
378
+ // ---- Text Positioning ----
379
+ case "Td":
380
+ this.moveText(num(operands, 0), num(operands, 1));
381
+ break;
382
+ case "TD":
383
+ this.textState.leading = -num(operands, 1);
384
+ this.moveText(num(operands, 0), num(operands, 1));
385
+ break;
386
+ case "Tm":
387
+ if (operands.length >= 6) {
388
+ this.setTextMatrix(nums(operands, 6));
389
+ }
390
+ break;
391
+ case "T*":
392
+ this.moveText(0, -this.textState.leading);
393
+ break;
394
+ // ---- Text Showing ----
395
+ case "Tj":
396
+ this.showText(operands[0], fragments);
397
+ break;
398
+ case "TJ":
399
+ this.showTextArray(operands[0], fragments);
400
+ break;
401
+ case "'":
402
+ this.moveText(0, -this.textState.leading);
403
+ this.showText(operands[0], fragments);
404
+ break;
405
+ case '"':
406
+ this.textState.wordSpacing = num(operands, 0);
407
+ this.textState.charSpacing = num(operands, 1);
408
+ this.moveText(0, -this.textState.leading);
409
+ this.showText(operands[2], fragments);
410
+ break;
411
+ // ---- XObject ----
412
+ case "Do":
413
+ this.doXObject(operands, fragments);
414
+ break;
415
+ }
416
+ }
417
+ // ===========================================================================
418
+ // Graphics State
419
+ // ===========================================================================
420
+ saveState() {
421
+ this.stateStack.push({
422
+ ctm: [...this.ctm],
423
+ textState: { ...this.textState }
424
+ });
425
+ }
426
+ restoreState() {
427
+ const state = this.stateStack.pop();
428
+ if (state) {
429
+ this.ctm = state.ctm;
430
+ this.textState = state.textState;
431
+ }
432
+ }
433
+ concatMatrix(m) {
434
+ this.ctm = multiplyMatrices(m, this.ctm);
435
+ }
436
+ // ===========================================================================
437
+ // Text State
438
+ // ===========================================================================
439
+ setFont(operands) {
440
+ if (operands.length < 2) {
441
+ return;
442
+ }
443
+ const fontName = typeof operands[0] === "string" ? operands[0] : String(operands[0]);
444
+ const fontSize = typeof operands[1] === "number" ? operands[1] : 0;
445
+ this.textState.font = this.fonts.get(fontName) ?? null;
446
+ this.textState.fontSize = fontSize;
447
+ }
448
+ beginText() {
449
+ this.inTextObject = true;
450
+ this.textMatrix = [1, 0, 0, 1, 0, 0];
451
+ this.lineMatrix = [1, 0, 0, 1, 0, 0];
452
+ }
453
+ moveText(tx, ty) {
454
+ const m = [1, 0, 0, 1, tx, ty];
455
+ this.lineMatrix = multiplyMatrices(m, this.lineMatrix);
456
+ this.textMatrix = [...this.lineMatrix];
457
+ }
458
+ setTextMatrix(m) {
459
+ this.textMatrix = [...m];
460
+ this.lineMatrix = [...m];
461
+ }
462
+ // ===========================================================================
463
+ // Text Showing
464
+ // ===========================================================================
465
+ showText(operand, fragments) {
466
+ if (operand === undefined || !this.textState.font) {
467
+ return;
468
+ }
469
+ let bytes;
470
+ if (operand instanceof Uint8Array) {
471
+ bytes = operand;
472
+ }
473
+ else if (typeof operand === "string") {
474
+ bytes = _textEncoder.encode(operand);
475
+ }
476
+ else {
477
+ return;
478
+ }
479
+ const font = this.textState.font;
480
+ const text = decodeText(bytes, font);
481
+ if (text.length === 0) {
482
+ return;
483
+ }
484
+ // Calculate position using text matrix and CTM
485
+ const tm = multiplyMatrices(this.textMatrix, this.ctm);
486
+ const x = tm[4];
487
+ const y = tm[5];
488
+ const fontSize = this.textState.fontSize * Math.sqrt(tm[0] * tm[0] + tm[1] * tm[1]);
489
+ // Calculate text width
490
+ const width = this.calculateTextWidth(bytes, font);
491
+ // Determine vertical text: check if font has WMode=1
492
+ const isVertical = font.wmode === 1;
493
+ // Determine RTL: check the first character of the decoded text
494
+ const isRtl = detectRtl(text);
495
+ fragments.push({
496
+ text,
497
+ x,
498
+ y,
499
+ fontSize: Math.abs(fontSize),
500
+ fontName: font.baseFontName,
501
+ width,
502
+ charSpacing: this.textState.charSpacing,
503
+ wordSpacing: this.textState.wordSpacing,
504
+ horizontalScaling: this.textState.horizontalScaling,
505
+ isVertical,
506
+ isRtl
507
+ });
508
+ // Advance text matrix
509
+ this.advanceTextPosition(bytes, font);
510
+ }
511
+ showTextArray(operand, fragments) {
512
+ if (operand === undefined || !Array.isArray(operand)) {
513
+ return;
514
+ }
515
+ for (const item of operand) {
516
+ if (typeof item === "number") {
517
+ // Negative number = move right, positive = move left (in thousandths of text space unit)
518
+ const displacement = (-item / 1000) * this.textState.fontSize * (this.textState.horizontalScaling / 100);
519
+ this.textMatrix[4] += displacement * this.textMatrix[0];
520
+ this.textMatrix[5] += displacement * this.textMatrix[1];
521
+ }
522
+ else {
523
+ this.showText(item, fragments);
524
+ }
525
+ }
526
+ }
527
+ calculateTextWidth(bytes, font) {
528
+ let width = 0;
529
+ const scale = this.textState.fontSize * (this.textState.horizontalScaling / 100);
530
+ if (font.subtype === "Type0" || font.bytesPerCode === 2) {
531
+ // CID fonts: use CMap codespace ranges for variable-length code parsing,
532
+ // consistent with decodeCIDText in font-decoder.ts
533
+ let i = 0;
534
+ while (i < bytes.length) {
535
+ let codeLen = 0;
536
+ if (font.toUnicode?.hasCodeSpaceRanges) {
537
+ codeLen = font.toUnicode.getCodeLength(bytes[i]);
538
+ }
539
+ let code;
540
+ if (codeLen === 2 && i + 1 < bytes.length) {
541
+ code = (bytes[i] << 8) | bytes[i + 1];
542
+ i += 2;
543
+ }
544
+ else if (codeLen === 1) {
545
+ code = bytes[i];
546
+ i++;
547
+ }
548
+ else if (i + 1 < bytes.length) {
549
+ // Fallback: assume 2-byte
550
+ code = (bytes[i] << 8) | bytes[i + 1];
551
+ i += 2;
552
+ }
553
+ else {
554
+ code = bytes[i];
555
+ i++;
556
+ }
557
+ const w = getCharWidth(code, font) / 1000;
558
+ width += w * scale + this.textState.charSpacing;
559
+ if (code === 0x0020) {
560
+ width += this.textState.wordSpacing;
561
+ }
562
+ }
563
+ }
564
+ else {
565
+ for (let i = 0; i < bytes.length; i++) {
566
+ const w = getCharWidth(bytes[i], font) / 1000;
567
+ width += w * scale + this.textState.charSpacing;
568
+ if (bytes[i] === 0x20) {
569
+ width += this.textState.wordSpacing;
570
+ }
571
+ }
572
+ }
573
+ return width;
574
+ }
575
+ advanceTextPosition(bytes, font) {
576
+ const width = this.calculateTextWidth(bytes, font);
577
+ // Advance text matrix by the width of the rendered text
578
+ this.textMatrix[4] += width * this.textMatrix[0];
579
+ this.textMatrix[5] += width * this.textMatrix[1];
580
+ }
581
+ // ===========================================================================
582
+ // XObject Handling (Form XObjects may contain text)
583
+ // ===========================================================================
584
+ doXObject(operands, fragments) {
585
+ if (operands.length < 1) {
586
+ return;
587
+ }
588
+ // Guard against infinite recursion from self-referencing Form XObjects
589
+ if (this.formDepth >= MAX_FORM_DEPTH) {
590
+ return;
591
+ }
592
+ const name = typeof operands[0] === "string" ? operands[0] : String(operands[0]);
593
+ // Look up XObject in resources
594
+ const xobjects = this.resources.get("XObject");
595
+ if (!xobjects) {
596
+ return;
597
+ }
598
+ const xobjDict = this.doc.derefDict(xobjects);
599
+ if (!xobjDict) {
600
+ return;
601
+ }
602
+ const xobj = xobjDict.get(name);
603
+ if (!xobj) {
604
+ return;
605
+ }
606
+ const streamResult = this.doc.derefStreamWithObjNum(xobj);
607
+ if (!streamResult) {
608
+ return;
609
+ }
610
+ const stream = streamResult.stream;
611
+ const streamDict = stream.dict;
612
+ const subtype = dictGetName(streamDict, "Subtype");
613
+ if (subtype !== "Form") {
614
+ return;
615
+ }
616
+ // Process form XObject — it has its own resources and content stream
617
+ const formResources = streamDict.get("Resources");
618
+ const resolvedResources = formResources
619
+ ? (this.doc.derefDict(formResources) ?? this.resources)
620
+ : this.resources;
621
+ // Resolve fonts from form's resources
622
+ const formFonts = resolveFontResources(resolvedResources, this.doc);
623
+ // Merge with page fonts
624
+ const mergedFonts = new Map(this.fonts);
625
+ for (const [k, v] of formFonts) {
626
+ mergedFonts.set(k, v);
627
+ }
628
+ // Process form content with saved state
629
+ const savedFonts = this.fonts;
630
+ this.fonts = mergedFonts;
631
+ // Apply form matrix if present
632
+ const matrix = dictGetArray(streamDict, "Matrix");
633
+ if (matrix && matrix.length === 6) {
634
+ this.saveState();
635
+ this.concatMatrix(matrix);
636
+ }
637
+ const formData = this.doc.getStreamData(stream, streamResult.objNum, streamResult.gen);
638
+ this.formDepth++;
639
+ this.process(formData, fragments);
640
+ this.formDepth--;
641
+ if (matrix && matrix.length === 6) {
642
+ this.restoreState();
643
+ }
644
+ this.fonts = savedFonts;
645
+ }
646
+ }
647
+ // =============================================================================
648
+ // Inline Image Helpers
649
+ // =============================================================================
650
/**
 * Tell whether a byte is one of the six PDF whitespace characters:
 * NUL, TAB, LF, FF, CR or SPACE.
 *
 * @param {number} b - Byte value.
 * @returns {boolean} `true` for a whitespace byte.
 */
function isWhitespaceByte(b) {
    switch (b) {
        case 0x00: // NUL
        case 0x09: // TAB
        case 0x0a: // LF
        case 0x0c: // FF
        case 0x0d: // CR
        case 0x20: // SPACE
            return true;
        default:
            return false;
    }
}
654
/**
 * Tell whether a byte is one of the PDF delimiter characters
 * `( ) < > [ ] { } / %`.
 *
 * @param {number} b - Byte value.
 * @returns {boolean} `true` for a delimiter byte.
 */
function isDelimiterByte(b) {
    switch (b) {
        case 0x28: // (
        case 0x29: // )
        case 0x3c: // <
        case 0x3e: // >
        case 0x5b: // [
        case 0x5d: // ]
        case 0x7b: // {
        case 0x7d: // }
        case 0x2f: // /
        case 0x25: // %
            return true;
        default:
            return false;
    }
}
668
/**
 * Convert a tokenizer token into the plain value used as an operator operand.
 *
 * - Number → numeric value (0 when absent)
 * - Name → string (empty string when absent)
 * - Literal/hex string → raw bytes (empty Uint8Array when absent)
 * - Boolean → boolean (false when absent)
 * - Null → null
 * - Array begin → empty array (arrays are normally parsed before this call)
 * - Anything else → the token's string value, or null
 *
 * @param token - Token produced by the tokenizer.
 * @returns The operand value.
 */
function tokenToOperand(token) {
    const kind = token.type;
    if (kind === TokenType.Number) {
        return token.numValue ?? 0;
    }
    if (kind === TokenType.Name) {
        return token.strValue ?? "";
    }
    if (kind === TokenType.LiteralString || kind === TokenType.HexString) {
        return token.rawBytes ?? new Uint8Array(0);
    }
    if (kind === TokenType.Boolean) {
        return token.boolValue ?? false;
    }
    if (kind === TokenType.Null) {
        return null;
    }
    if (kind === TokenType.ArrayBegin) {
        // Not expected here; kept as a safe placeholder value.
        return [];
    }
    return token.strValue ?? null;
}
688
/**
 * Read a numeric operand, substituting 0 for a missing or non-numeric entry.
 *
 * @param {Array} operands - Operand list.
 * @param {number} index - Position to read.
 * @returns {number} The operand when it is a number, otherwise 0.
 */
function num(operands, index) {
    const candidate = operands[index];
    if (typeof candidate !== "number") {
        return 0;
    }
    return candidate;
}
692
/**
 * Read the first `count` operands as numbers, using 0 for any entry that is
 * missing or not a number.
 *
 * @param {Array} operands - Operand list.
 * @param {number} count - How many leading operands to read.
 * @returns {number[]} Array of `count` numbers.
 */
function nums(operands, count) {
    const values = [];
    let index = 0;
    while (index < count) {
        values.push(num(operands, index));
        index += 1;
    }
    return values;
}
699
+ // =============================================================================
700
+ // Matrix Operations
701
+ // =============================================================================
702
/**
 * Compute the product `m1 × m2` of two affine transformation matrices.
 *
 * Each matrix is stored as six numbers [a, b, c, d, e, f], standing for the
 * 3x3 matrix [a b 0; c d 0; e f 1].
 *
 * @param m1 - Left operand.
 * @param m2 - Right operand.
 * @returns {number[]} New six-element matrix; the inputs are not modified.
 */
function multiplyMatrices(m1, m2) {
    // Apply the linear part of m2 to a row vector (x, y).
    const throughFirstColumn = (x, y) => x * m2[0] + y * m2[2];
    const throughSecondColumn = (x, y) => x * m2[1] + y * m2[3];
    return [
        throughFirstColumn(m1[0], m1[1]),
        throughSecondColumn(m1[0], m1[1]),
        throughFirstColumn(m1[2], m1[3]),
        throughSecondColumn(m1[2], m1[3]),
        // The translation row also picks up m2's own translation.
        throughFirstColumn(m1[4], m1[5]) + m2[4],
        throughSecondColumn(m1[4], m1[5]) + m2[5]
    ];
}