@oh-my-pi/pi-coding-agent 16.0.7 → 16.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/CHANGELOG.md +31 -0
  2. package/dist/cli.js +4752 -12462
  3. package/dist/types/cli/update-cli.d.ts +11 -0
  4. package/dist/types/debug/remote-debugger.d.ts +45 -0
  5. package/dist/types/internal-urls/docs-index.d.ts +19 -0
  6. package/dist/types/markit/converters/docx.d.ts +6 -0
  7. package/dist/types/markit/converters/epub.d.ts +15 -0
  8. package/dist/types/markit/converters/pdf/columns.d.ts +35 -0
  9. package/dist/types/markit/converters/pdf/extract.d.ts +10 -0
  10. package/dist/types/markit/converters/pdf/grid.d.ts +25 -0
  11. package/dist/types/markit/converters/pdf/headers.d.ts +24 -0
  12. package/dist/types/markit/converters/pdf/index.d.ts +6 -0
  13. package/dist/types/markit/converters/pdf/render.d.ts +24 -0
  14. package/dist/types/markit/converters/pdf/types.d.ts +75 -0
  15. package/dist/types/markit/converters/pptx.d.ts +57 -0
  16. package/dist/types/markit/converters/xlsx.d.ts +25 -0
  17. package/dist/types/markit/index.d.ts +2 -0
  18. package/dist/types/markit/registry.d.ts +16 -0
  19. package/dist/types/markit/types.d.ts +30 -0
  20. package/dist/types/session/agent-session.d.ts +7 -8
  21. package/dist/types/session/auth-storage.d.ts +3 -2
  22. package/dist/types/session/yield-queue.d.ts +3 -1
  23. package/dist/types/tools/browser/attach.d.ts +1 -1
  24. package/dist/types/utils/markit.d.ts +0 -8
  25. package/dist/types/utils/mupdf-wasm-embed.d.ts +1 -0
  26. package/dist/types/utils/turndown.d.ts +15 -0
  27. package/dist/types/utils/zip.d.ts +119 -0
  28. package/package.json +20 -18
  29. package/scripts/build-binary.ts +7 -3
  30. package/scripts/bundle-dist.ts +28 -12
  31. package/scripts/embed-mupdf-wasm.ts +67 -0
  32. package/scripts/generate-docs-index.ts +48 -32
  33. package/scripts/omp +1 -1
  34. package/src/advisor/__tests__/advisor.test.ts +83 -0
  35. package/src/advisor/runtime.ts +16 -1
  36. package/src/cli/auth-broker-cli.ts +1 -3
  37. package/src/cli/auth-gateway-cli.ts +2 -5
  38. package/src/cli/update-cli.ts +63 -3
  39. package/src/config/model-discovery.ts +20 -8
  40. package/src/config/models-config-schema.ts +8 -1
  41. package/src/debug/index.ts +44 -0
  42. package/src/debug/remote-debugger.ts +151 -0
  43. package/src/debug/report-bundle.ts +2 -1
  44. package/src/internal-urls/docs-index.generated.txt +2 -0
  45. package/src/internal-urls/docs-index.ts +102 -0
  46. package/src/internal-urls/omp-protocol.ts +10 -9
  47. package/src/markit/NOTICE +32 -0
  48. package/src/markit/converters/docx.ts +56 -0
  49. package/src/markit/converters/epub.ts +136 -0
  50. package/src/markit/converters/mammoth.d.ts +24 -0
  51. package/src/markit/converters/pdf/columns.ts +103 -0
  52. package/src/markit/converters/pdf/extract.ts +574 -0
  53. package/src/markit/converters/pdf/grid.ts +780 -0
  54. package/src/markit/converters/pdf/headers.ts +106 -0
  55. package/src/markit/converters/pdf/index.ts +146 -0
  56. package/src/markit/converters/pdf/render.ts +501 -0
  57. package/src/markit/converters/pdf/types.ts +84 -0
  58. package/src/markit/converters/pptx.ts +325 -0
  59. package/src/markit/converters/xlsx.ts +173 -0
  60. package/src/markit/index.ts +2 -0
  61. package/src/markit/registry.ts +59 -0
  62. package/src/markit/types.ts +35 -0
  63. package/src/modes/components/snapcompact-shape-preview-doc.md +14 -7
  64. package/src/modes/components/snapcompact-shape-preview.ts +2 -2
  65. package/src/modes/controllers/input-controller.ts +29 -8
  66. package/src/modes/interactive-mode.ts +26 -9
  67. package/src/prompts/advisor/system.md +1 -0
  68. package/src/sdk.ts +5 -9
  69. package/src/session/agent-session.ts +62 -40
  70. package/src/session/auth-storage.ts +2 -11
  71. package/src/session/yield-queue.ts +7 -1
  72. package/src/tools/browser/attach.ts +2 -2
  73. package/src/tools/fetch.ts +25 -60
  74. package/src/tools/read.ts +1 -1
  75. package/src/tools/search.ts +1 -6
  76. package/src/tools/write.ts +25 -65
  77. package/src/utils/markit.ts +25 -9
  78. package/src/utils/mupdf-wasm-embed.ts +12 -0
  79. package/src/utils/tools-manager.ts +2 -11
  80. package/src/utils/turndown.ts +83 -0
  81. package/src/{tools/archive-reader.ts → utils/zip.ts} +453 -83
  82. package/src/web/scrapers/types.ts +3 -46
  83. package/dist/types/internal-urls/docs-index.generated.d.ts +0 -2
  84. package/dist/types/tools/archive-reader.d.ts +0 -49
  85. package/src/internal-urls/docs-index.generated.ts +0 -120
@@ -0,0 +1,574 @@
1
+ // Adapted from markit-ai (MIT). See ../../NOTICE.
2
+
3
+ /**
4
+ * PDF content extraction using mupdf.
5
+ *
6
+ * Extracts text boxes (with position, font size, bold) and vector line
7
+ * segments (table borders) from each page. Uses mupdf's native WASM
8
+ * engine for fast parsing, and reads raw content streams for vector graphics.
9
+ *
10
+ * Coordinate system: PDF native (origin = bottom-left, Y increases upward).
11
+ */
12
+ import type * as mupdf from "mupdf";
13
+ import type { ImageRegion, PageContent, Segment, TextBox } from "./types";
14
+
15
+ // mupdf instantiates its WASM module via a top-level await. A static
16
+ // `import * as mupdf` would pull that await into this module's init, which makes
17
+ // the whole bundled markit chunk's `__esm` init async — and bun's compiled
18
+ // bundler fails to await that init transitively through the `../markit` barrel,
19
+ // exposing the converter classes before their module-level consts initialize
20
+ // (e.g. `EXTENSIONS` reads as undefined). Importing mupdf lazily keeps the chunk
21
+ // init synchronous and also keeps the ~10MB wasm off non-PDF conversions.
22
/** Cached mupdf module namespace; stays undefined until the first successful load. */
let mupdfModule: typeof mupdf | undefined;

/**
 * Resolve the mupdf module, importing it dynamically on first use and
 * reusing the cached namespace on every later call.
 *
 * @returns The loaded mupdf module namespace.
 */
async function loadMupdf(): Promise<typeof mupdf> {
	if (mupdfModule) return mupdfModule;
	const loaded = await import("mupdf");
	mupdfModule = loaded;
	return loaded;
}
29
+
30
/**
 * Bounding box as emitted in mupdf structured-text JSON.
 * Uses a top-left origin, so `y` is the top edge and grows downward.
 */
interface StextBBox {
	/** Left edge. */
	x: number;
	/** Top edge (top-left origin). */
	y: number;
	/** Width. */
	w: number;
	/** Height. */
	h: number;
}

/** Font metadata attached to a structured-text line; every field may be absent. */
interface StextFont {
	/** Font name; searched for bold-like markers during text extraction. */
	name?: string;
	/** Font size. */
	size?: number;
	/** Font weight; the value "bold" marks the line as bold. */
	weight?: string;
}

/** One line inside a text block of the structured-text JSON. */
interface StextLine {
	/** Line bounding box (top-left origin). */
	bbox: StextBBox;
	/** Font metadata for the line, when present. */
	font?: StextFont;
	/** Text content of the line, when present. */
	text?: string;
}

/** A block of the structured-text JSON; `type` distinguishes "text" from "image". */
interface StextBlock {
	/** Block kind; this file handles "text" and "image". */
	type: string;
	/** Block bounding box (top-left origin). */
	bbox: StextBBox;
	/** Lines of the block; only read for blocks whose type is "text". */
	lines: StextLine[];
}

/** Parsed structured-text JSON for a single page. */
interface StructuredTextJSON {
	/** All blocks found on the page. */
	blocks: StextBlock[];
}
63
+
64
/**
 * A raw text fragment before fragments are merged into word/phrase boxes.
 * Positions are in PDF coordinates (bottom-left origin).
 */
interface RawTextItem {
	/** Fragment text. */
	text: string;
	/** Left edge. */
	x: number;
	/** Bottom edge (bottom-left origin). */
	y: number;
	/** Horizontal extent. */
	width: number;
	/** Vertical extent. */
	height: number;
	/** Font size reported for the fragment (0 when unknown). */
	fontSize: number;
	/** Whether the fragment was detected as bold. */
	isBold: boolean;
}
74
+
75
+ // ---------------------------------------------------------------------------
76
+ // Text extraction
77
+ // ---------------------------------------------------------------------------
78
/** Y tolerance (pts) for treating two text fragments as being on the same visual line. */
const SAME_LINE_Y_TOLERANCE = 2;
/** Max horizontal gap (pts) to merge adjacent fragments into one text box. */
const MAX_MERGE_GAP = 14;

/**
 * Merge horizontally adjacent raw text items on the same visual line into
 * word/phrase-level text boxes.
 *
 * Items are ordered top-to-bottom (Y descending in bottom-left coordinates),
 * then left-to-right, and folded into a running box while the next item is
 * within `SAME_LINE_Y_TOLERANCE` vertically and starts no further than
 * `MAX_MERGE_GAP` past the running box's right edge. A single space is
 * inserted when the gap between the two exceeds 1pt.
 *
 * @param raws Raw fragments; the array and its items are not mutated.
 * @returns The merged boxes (empty when `raws` is empty).
 */
function mergeIntoWords(raws: RawTextItem[]): RawTextItem[] {
	if (raws.length === 0) return [];
	// Sort by Y descending (top-first in bottom-left coords), then X ascending
	const sorted = [...raws].sort((a, b) => {
		const dy = b.y - a.y;
		return Math.abs(dy) > SAME_LINE_Y_TOLERANCE ? dy : a.x - b.x;
	});
	const merged: RawTextItem[] = [];
	let cur = { ...sorted[0] };
	for (let i = 1; i < sorted.length; i++) {
		const next = sorted[i];
		const curRight = cur.x + cur.width;
		const sameY = Math.abs(next.y - cur.y) <= SAME_LINE_Y_TOLERANCE;
		const close = next.x <= curRight + MAX_MERGE_GAP;
		if (sameY && close) {
			const gap = next.x - curRight;
			cur.text += (gap > 1 ? " " : "") + next.text;
			// Never let the box shrink: a fragment lying inside the current box
			// (overlapping text) must not pull the right edge back, otherwise the
			// following fragments on the line stop passing the proximity test.
			cur.width = Math.max(curRight, next.x + next.width) - cur.x;
			cur.height = Math.max(cur.height, next.height);
			cur.fontSize = Math.max(cur.fontSize, next.fontSize);
			cur.isBold = cur.isBold || next.isBold;
		} else {
			merged.push(cur);
			cur = { ...next };
		}
	}
	merged.push(cur);
	return merged;
}
116
+
117
/**
 * Build the text boxes of one page from mupdf structured-text output.
 *
 * Structured-text bounding boxes use a top-left origin; each line is
 * converted to bottom-left (PDF) coordinates with `pageHeight` before the
 * fragments are merged into word/phrase boxes.
 *
 * @param page Page to read structured text from when `stext` is not supplied.
 * @param pageNumber Page number used in box ids and stored on each box.
 * @param pageHeight Page height used for the Y-axis flip.
 * @param stext Already-parsed structured text; when omitted it is produced
 *   from `page` with the "preserve-whitespace" option.
 * @returns Text boxes with non-empty text. Ids are assigned before empty
 *   boxes are dropped, so id numbers may have gaps.
 */
function extractTextBoxes(
	page: mupdf.Page,
	pageNumber: number,
	pageHeight: number,
	stext?: StructuredTextJSON,
): TextBox[] {
	const parsed =
		stext ?? (JSON.parse(page.toStructuredText("preserve-whitespace").asJSON()) as StructuredTextJSON);

	const fragments: RawTextItem[] = [];
	for (const { type, lines } of parsed.blocks) {
		if (type !== "text") continue;
		for (const { text: rawText, font, bbox } of lines) {
			const text = rawText?.trim();
			if (!text) continue;
			const fontName = font?.name ?? "";
			// Bold if the weight says so, or the font name carries a bold-like marker.
			const isBold =
				(font?.weight ?? "normal") === "bold" || /bold/i.test(fontName) || /Black|Heavy/i.test(fontName);
			fragments.push({
				text,
				x: bbox.x,
				// Flip from top-left to bottom-left: bottom edge = pageHeight - (top + height)
				y: pageHeight - (bbox.y + bbox.h),
				width: bbox.w,
				height: bbox.h,
				fontSize: font?.size ?? 0,
				isBold,
			});
		}
	}

	const boxes: TextBox[] = [];
	mergeIntoWords(fragments).forEach((word, index) => {
		const text = word.text.trim();
		if (text.length === 0) return;
		boxes.push({
			id: `p${pageNumber}-t${index}`,
			text,
			pageNumber,
			fontSize: word.fontSize,
			isBold: word.isBold,
			bounds: {
				left: word.x,
				right: word.x + word.width,
				bottom: word.y,
				top: word.y + word.height,
			},
		});
	});
	return boxes;
}
175
+
176
+ // ---------------------------------------------------------------------------
177
+ // Vector segment extraction from raw content stream
178
+ // ---------------------------------------------------------------------------
179
/** Minimum aspect ratio for a filled rect to be considered a line. */
const LINE_ASPECT_THRESHOLD = 6;
/** Minimum length (pts) for a segment to count. */
const MIN_LENGTH = 2;
/** Maximum thickness (pts) for a border line (filters out filled areas). */
const MAX_THICKNESS = 3;

/**
 * Convert a thin filled rectangle to a horizontal or vertical segment running
 * along its centre line.
 *
 * The rectangle may be given with a negative width and/or height (the PDF
 * `re` operator allows both); it is normalized so that the segment always
 * covers the area the rectangle actually occupies.
 *
 * @param id Identifier stored on the resulting segment.
 * @param x X of the rectangle's origin corner.
 * @param y Y of the rectangle's origin corner.
 * @param w Signed width.
 * @param h Signed height.
 * @returns The segment, or null if the rect doesn't look like a border line.
 */
function thinRectToSegment(id: string, x: number, y: number, w: number, h: number): Segment | null {
	const aw = Math.abs(w);
	const ah = Math.abs(h);
	// Normalize to the lower-left corner so negative extents are placed correctly.
	const left = Math.min(x, x + w);
	const bottom = Math.min(y, y + h);
	if (aw > ah * LINE_ASPECT_THRESHOLD && aw >= MIN_LENGTH && ah <= MAX_THICKNESS) {
		// Horizontal line
		const cy = bottom + ah / 2;
		return { id, x1: left, y1: cy, x2: left + aw, y2: cy };
	}
	if (ah > aw * LINE_ASPECT_THRESHOLD && ah >= MIN_LENGTH && aw <= MAX_THICKNESS) {
		// Vertical line
		const cx = left + aw / 2;
		return { id, x1: cx, y1: bottom, x2: cx, y2: bottom + ah };
	}
	return null;
}

/**
 * Append the edge segments of a stroked rectangle to `segments`.
 *
 * The bottom/top edges are emitted only when the width is at least
 * `MIN_LENGTH`, the left/right edges only when the height is. Edge ids are
 * `id` suffixed with `-b`, `-t`, `-l` and `-r`. Negative widths/heights are
 * normalized the same way as in `thinRectToSegment`.
 *
 * @param segments Output array, mutated in place.
 * @param id Base identifier for the emitted edges.
 * @param x X of the rectangle's origin corner.
 * @param y Y of the rectangle's origin corner.
 * @param w Signed width.
 * @param h Signed height.
 */
function pushStrokedRectEdges(segments: Segment[], id: string, x: number, y: number, w: number, h: number): void {
	const aw = Math.abs(w);
	const ah = Math.abs(h);
	const left = Math.min(x, x + w);
	const bottom = Math.min(y, y + h);
	const right = left + aw;
	const top = bottom + ah;
	if (aw >= MIN_LENGTH) {
		segments.push({ id: `${id}-b`, x1: left, y1: bottom, x2: right, y2: bottom });
		segments.push({ id: `${id}-t`, x1: left, y1: top, x2: right, y2: top });
	}
	if (ah >= MIN_LENGTH) {
		segments.push({ id: `${id}-l`, x1: left, y1: bottom, x2: left, y2: top });
		segments.push({ id: `${id}-r`, x1: right, y1: bottom, x2: right, y2: top });
	}
}
234
+
235
/** Identity affine matrix in PDF order `[a, b, c, d, e, f]`. */
const CTM_IDENTITY = [1, 0, 0, 1, 0, 0];

/**
 * Compose two affine matrices given as `[a, b, c, d, e, f]`.
 * The result is parent × child: mapping a point through it equals mapping
 * the point through `c` first and through `p` second.
 *
 * @param p Parent (outer) matrix.
 * @param c Child (inner) matrix.
 * @returns A new six-element matrix; neither input is modified.
 */
function ctmConcat(p: number[], c: number[]): number[] {
	const [pa, pb, pc, pd, pe, pf] = p;
	const [ca, cb, cc, cd, ce, cf] = c;
	const a = pa * ca + pc * cb;
	const b = pb * ca + pd * cb;
	const cOut = pa * cc + pc * cd;
	const d = pb * cc + pd * cd;
	const e = pa * ce + pc * cf + pe;
	const f = pb * ce + pd * cf + pf;
	return [a, b, cOut, d, e, f];
}
248
+
249
/**
 * Map the point (x, y) through the affine matrix `m = [a, b, c, d, e, f]`.
 *
 * @returns The transformed point as `[x', y']`.
 */
function ctmApply(m: number[], x: number, y: number): [number, number] {
	const [a, b, c, d, e, f] = m;
	const mappedX = a * x + c * y + e;
	const mappedY = b * x + d * y + f;
	return [mappedX, mappedY];
}
252
+
253
+ // ---------------------------------------------------------------------------
254
+ // Content stream parsing
255
+ // ---------------------------------------------------------------------------
256
/**
 * Parse a PDF content stream and extract line segments from thin filled
 * rectangles (re+f), stroked rectangles (re+S), and explicit lines (m/l+S).
 * Tracks the CTM via q/Q/cm operators so coordinates are in page space.
 *
 * Operands are read positionally from the tokens immediately preceding each
 * operator. Only horizontal/vertical stroked lines are kept, and strokes
 * wider than `MAX_THICKNESS` are ignored.
 *
 * Combined paint operators (B, B*, b, b*) run both the fill and the stroke
 * pass over the same pending path before it is cleared, and b/b* close the
 * current subpath first, as `s` does.
 *
 * @param raw Decoded content stream text.
 * @param pageNumber Page number used in segment ids (`p<page>-s<n>`).
 * @returns Segments in page space, in the order they were painted.
 */
function extractSegmentsFromContentStream(raw: string, pageNumber: number): Segment[] {
	const segments: Segment[] = [];
	const tokens = tokenizeContentStream(raw);
	let strokeWidth = 1.0;
	// Graphics state stack (q/Q): saves CTM + strokeWidth
	let ctm = [...CTM_IDENTITY];
	const stateStack: Array<{ ctm: number[]; strokeWidth: number }> = [];
	// State for path building (in user coordinates, pre-CTM)
	let curX = 0;
	let curY = 0;
	let pathStartX = 0;
	let pathStartY = 0;
	const pendingRects: Array<{ x: number; y: number; w: number; h: number }> = [];
	const pendingLines: Array<{ x1: number; y1: number; x2: number; y2: number }> = [];

	/** Next segment id; derived from how many segments exist so far. */
	const nextId = (): string => `p${pageNumber}-s${segments.length}`;
	/** Numeric value of the token at absolute index `i` (NaN if not a number). */
	const num = (i: number): number => Number(tokens[i]);

	/** Map a pending rect's opposite corners through the CTM and normalize it. */
	function toPageRect(r: { x: number; y: number; w: number; h: number }): {
		x: number;
		y: number;
		w: number;
		h: number;
	} {
		const [x0, y0] = ctmApply(ctm, r.x, r.y);
		const [x1, y1] = ctmApply(ctm, r.x + r.w, r.y + r.h);
		return { x: Math.min(x0, x1), y: Math.min(y0, y1), w: Math.abs(x1 - x0), h: Math.abs(y1 - y0) };
	}

	/** Fill pass: thin filled rects become single segments. */
	function emitFilled(): void {
		for (const r of pendingRects) {
			const box = toPageRect(r);
			const seg = thinRectToSegment(nextId(), box.x, box.y, box.w, box.h);
			if (seg) segments.push(seg);
		}
	}

	/** Stroke pass: rect outlines plus horizontal/vertical lines, for thin strokes only. */
	function emitStroked(): void {
		if (strokeWidth > MAX_THICKNESS) return;
		for (const r of pendingRects) {
			const box = toPageRect(r);
			pushStrokedRectEdges(segments, nextId(), box.x, box.y, box.w, box.h);
		}
		for (const l of pendingLines) {
			const [lx1, ly1] = ctmApply(ctm, l.x1, l.y1);
			const [lx2, ly2] = ctmApply(ctm, l.x2, l.y2);
			const dx = Math.abs(lx2 - lx1);
			const dy = Math.abs(ly2 - ly1);
			// Only keep H/V lines
			if ((dx >= MIN_LENGTH && dy < 1) || (dy >= MIN_LENGTH && dx < 1)) {
				segments.push({ id: nextId(), x1: lx1, y1: ly1, x2: lx2, y2: ly2 });
			}
		}
	}

	/** Drop the path accumulated so far. */
	function clearPath(): void {
		pendingRects.length = 0;
		pendingLines.length = 0;
	}

	/** closePath: add a line back to the subpath start if not already there. */
	function closeSubpath(): void {
		if (curX !== pathStartX || curY !== pathStartY) {
			pendingLines.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
		}
		curX = pathStartX;
		curY = pathStartY;
	}

	for (let idx = 0; idx < tokens.length; idx++) {
		switch (tokens[idx]) {
			case "q":
				stateStack.push({ ctm: [...ctm], strokeWidth });
				break;
			case "Q": {
				const saved = stateStack.pop();
				if (saved) {
					ctm = saved.ctm;
					strokeWidth = saved.strokeWidth;
				}
				break;
			}
			case "cm":
				if (idx >= 6) {
					ctm = ctmConcat(ctm, [
						num(idx - 6),
						num(idx - 5),
						num(idx - 4),
						num(idx - 3),
						num(idx - 2),
						num(idx - 1),
					]);
				}
				break;
			case "w":
				if (idx >= 1) {
					const width = num(idx - 1);
					// 0 is a valid line width (thinnest renderable line) and must
					// replace the previous width; only unparsable operands are ignored.
					if (Number.isFinite(width) && width >= 0) strokeWidth = width;
				}
				break;
			case "re":
				if (idx >= 4) {
					const x = num(idx - 4);
					const y = num(idx - 3);
					const w = num(idx - 2);
					const h = num(idx - 1);
					if (Number.isFinite(x + y + w + h)) {
						pendingRects.push({ x, y, w, h });
					}
				}
				break;
			case "m":
				if (idx >= 2) {
					curX = num(idx - 2);
					curY = num(idx - 1);
					pathStartX = curX;
					pathStartY = curY;
				}
				break;
			case "l":
				if (idx >= 2) {
					const x2 = num(idx - 2);
					const y2 = num(idx - 1);
					pendingLines.push({ x1: curX, y1: curY, x2, y2 });
					curX = x2;
					curY = y2;
				}
				break;
			case "h":
				closeSubpath();
				break;
			case "f":
			case "F":
			case "f*":
				emitFilled();
				clearPath();
				break;
			case "S":
				emitStroked();
				clearPath();
				break;
			case "s":
				// closeStroke: implicit closePath
				closeSubpath();
				emitStroked();
				clearPath();
				break;
			case "B":
			case "B*":
				// fill + stroke over the SAME path; clear only after both passes
				emitFilled();
				emitStroked();
				clearPath();
				break;
			case "b":
			case "b*":
				// close + fill + stroke
				closeSubpath();
				emitFilled();
				emitStroked();
				clearPath();
				break;
			case "n":
				// end path without painting — discard
				clearPath();
				break;
			default:
				break;
		}
	}
	return segments;
}
398
+
399
/**
 * True when `c` ends a regular token: whitespace/control characters or one of
 * the PDF delimiter characters `( ) < > [ ] { } / %`.
 */
function isContentStreamTokenBoundary(c: number): boolean {
	return (
		c <= 32 ||
		c === 40 /* ( */ ||
		c === 41 /* ) */ ||
		c === 60 /* < */ ||
		c === 62 /* > */ ||
		c === 91 /* [ */ ||
		c === 93 /* ] */ ||
		c === 123 /* { */ ||
		c === 125 /* } */ ||
		c === 47 /* / */ ||
		c === 37 /* % */
	);
}

/**
 * Fast tokenizer for PDF content streams.
 *
 * Splits on whitespace and PDF delimiters. Comments, literal strings `(...)`,
 * hex strings `<...>` and the dictionary delimiters `<<` / `>>` are skipped.
 * Names keep their leading `/`, and `[ ] { }` are returned as one-character
 * tokens, so an operator written directly against a name or bracket
 * (e.g. `cm/Im0`) is still reported as its own token.
 *
 * The scan always advances: unbalanced `)` / `>` characters (for example from
 * inline image data) and a trailing `<` are skipped instead of stalling.
 *
 * @param raw Decoded content stream text.
 * @returns Tokens in stream order.
 */
function tokenizeContentStream(raw: string): string[] {
	const tokens: string[] = [];
	const len = raw.length;
	let i = 0;
	while (i < len) {
		const ch = raw.charCodeAt(i);
		// Skip whitespace
		if (ch <= 32) {
			i++;
			continue;
		}
		// Skip comments (terminated by LF or CR)
		if (ch === 37 /* % */) {
			while (i < len) {
				const c = raw.charCodeAt(i);
				if (c === 10 || c === 13) break;
				i++;
			}
			continue;
		}
		// Skip string literals (...), honouring nesting and backslash escapes
		if (ch === 40 /* ( */) {
			let depth = 1;
			i++;
			while (i < len && depth > 0) {
				const c = raw.charCodeAt(i);
				if (c === 92 /* \ */) {
					i++;
				} else if (c === 40) {
					depth++;
				} else if (c === 41) {
					depth--;
				}
				i++;
			}
			continue;
		}
		if (ch === 60 /* < */) {
			if (i + 1 < len && raw.charCodeAt(i + 1) === 60) {
				// Dict opener <<
				i += 2;
				continue;
			}
			// Hex string <...> (also covers a lone `<` at end of input)
			i++;
			while (i < len && raw.charCodeAt(i) !== 62) i++;
			i++; // skip >
			continue;
		}
		if (ch === 62 /* > */) {
			// Dict closer >> or a stray `>`; either way consume it
			i += i + 1 < len && raw.charCodeAt(i + 1) === 62 ? 2 : 1;
			continue;
		}
		if (ch === 41 /* ) */) {
			// Unbalanced `)` outside any string: skip it so the scan advances
			i++;
			continue;
		}
		if (ch === 91 || ch === 93 || ch === 123 || ch === 125) {
			// [ ] { } are self-delimiting single-character tokens
			tokens.push(raw.charAt(i));
			i++;
			continue;
		}
		// Regular token (or a name, whose leading `/` is part of the token):
		// read until whitespace or the next delimiter
		const start = i;
		i++;
		while (i < len && !isContentStreamTokenBoundary(raw.charCodeAt(i))) i++;
		tokens.push(raw.substring(start, i));
	}
	return tokens;
}
465
+
466
+ // ---------------------------------------------------------------------------
467
+ // Image region detection
468
+ // ---------------------------------------------------------------------------
469
/** Minimum area (pts²) for an image to be considered a diagram, not an icon. */
const MIN_IMAGE_AREA = 5000;

/**
 * Collect the image blocks of a page that are large enough to matter.
 *
 * Image blocks whose bounding-box area is below `MIN_IMAGE_AREA` are skipped.
 * Each kept region retains its bounding box as given in the structured text
 * (top-left origin) and additionally records `topY`, the top edge converted
 * to PDF (bottom-left) coordinates for ordering.
 *
 * @param stext Parsed structured text of the page.
 * @param pageNumber Page number used in region ids and stored on each region.
 * @param pageHeight Page height used for the Y conversion.
 * @returns Regions in block order, with ids numbered over the kept regions.
 */
function extractImageRegions(stext: StructuredTextJSON, pageNumber: number, pageHeight: number): ImageRegion[] {
	return stext.blocks
		.filter(block => block.type === "image" && !(block.bbox.w * block.bbox.h < MIN_IMAGE_AREA))
		.map((block, index) => {
			const { x, y, w, h } = block.bbox;
			return {
				id: `p${pageNumber}-img${index}`,
				pageNumber,
				bbox: { x, y, w, h },
				// mupdf (top-left) Y -> PDF (bottom-left) Y
				topY: pageHeight - y,
			};
		});
}
489
+
490
+ // ---------------------------------------------------------------------------
491
+ // Public API
492
+ // ---------------------------------------------------------------------------
493
/**
 * Render one image region of a PDF page to PNG bytes.
 *
 * The region's bounding box is grown by 10pt on every side and drawn at 2x
 * scale onto a white RGB pixmap through mupdf's DrawDevice.
 *
 * @param input PDF file bytes.
 * @param region Region to render; `pageNumber` is 1-based and `bbox` is in
 *   the coordinates reported by the structured-text output.
 * @returns The rendered crop encoded as PNG.
 */
export async function renderImageRegion(input: Uint8Array, region: ImageRegion): Promise<Uint8Array> {
	const padding = 10;
	const scale = 2;

	const m = await loadMupdf();
	const page = m.Document.openDocument(input, "application/pdf").loadPage(region.pageNumber - 1);

	// Padded crop rectangle in page units.
	const { x, y, w, h } = region.bbox;
	const cropX = x - padding;
	const cropY = y - padding;
	const cropWidth = w + 2 * padding;
	const cropHeight = h + 2 * padding;

	// Target bitmap, cleared to white.
	const pixelWidth = Math.round(cropWidth * scale);
	const pixelHeight = Math.round(cropHeight * scale);
	const pixmap = new m.Pixmap(m.ColorSpace.DeviceRGB, [0, 0, pixelWidth, pixelHeight], false);
	pixmap.clear(255);

	// Scale up, then shift so the crop's corner lands on the pixmap origin.
	const transform: mupdf.Matrix = [scale, 0, 0, scale, -cropX * scale, -cropY * scale];
	const displayList = page.toDisplayList();
	const device = new m.DrawDevice(transform, pixmap);
	displayList.run(device, m.Matrix.identity);
	device.close();
	return pixmap.asPNG();
}
518
+
519
/**
 * Read the raw content stream text of a PDF page.
 *
 * When the page's `Contents` entry is an array, the streams are joined with a
 * newline after each part: the split between streams falls on a token
 * boundary, so joining them with no separator could fuse the last token of
 * one stream with the first token of the next (e.g. `S` + `q` -> `Sq`).
 *
 * @param page PDF page to read from.
 * @returns The decoded stream text, or undefined when there is no `Contents` entry.
 */
function readPageContentStream(page: mupdf.PDFPage): string | undefined {
	const contents = page.getObject().get("Contents");
	if (!contents) return undefined;
	if (!contents.isArray()) {
		return new TextDecoder().decode(contents.readStream().asUint8Array());
	}
	// Multiple content streams — concatenate with a whitespace separator
	const parts: Uint8Array[] = [];
	const count = contents.length ?? 0;
	for (let j = 0; j < count; j++) {
		const stream = contents.get(j);
		if (stream?.readStream) {
			parts.push(stream.readStream().asUint8Array());
		}
	}
	const totalLen = parts.reduce((sum, part) => sum + part.length + 1, 0);
	const joined = new Uint8Array(totalLen);
	let offset = 0;
	for (const part of parts) {
		joined.set(part, offset);
		offset += part.length;
		joined[offset++] = 0x0a; // "\n" between streams
	}
	return new TextDecoder().decode(joined);
}

/**
 * Extract text boxes, vector segments and image regions from all pages of a
 * PDF buffer.
 *
 * Text and images come from a single structured-text pass per page; segments
 * come from the page's raw content stream. Segment extraction is best-effort:
 * if reading or parsing the content stream throws, the page is returned with
 * an empty segment list.
 *
 * @param input PDF file bytes.
 * @returns One entry per page, in page order, with 1-based page numbers.
 */
export async function extractPages(input: Uint8Array): Promise<PageContent[]> {
	const m = await loadMupdf();
	const doc = m.Document.openDocument(input, "application/pdf");
	const pageCount = doc.countPages();
	const pages: PageContent[] = [];
	for (let i = 0; i < pageCount; i++) {
		const pageNumber = i + 1;
		const page = doc.loadPage(i);
		const bounds = page.getBounds();
		const pageHeight = bounds[3] - bounds[1];
		// Single structured text pass with both flags
		const stext = JSON.parse(
			page.toStructuredText("preserve-whitespace,preserve-images").asJSON(),
		) as StructuredTextJSON;
		// Extract text boxes and image regions from the same parse
		const textBoxes = extractTextBoxes(page, pageNumber, pageHeight, stext);
		const images = extractImageRegions(stext, pageNumber, pageHeight);
		// Extract vector segments from raw content stream
		let segments: Segment[] = [];
		try {
			const raw = readPageContentStream(page as mupdf.PDFPage);
			if (raw !== undefined) {
				segments = extractSegmentsFromContentStream(raw, pageNumber);
			}
		} catch {
			// Content stream extraction failed — proceed with text only
		}
		pages.push({ pageNumber, textBoxes, segments, images });
	}
	return pages;
}