@oh-my-pi/pi-coding-agent 16.0.7 → 16.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/CHANGELOG.md +41 -0
  2. package/dist/cli.js +4817 -12449
  3. package/dist/types/cli/args.d.ts +1 -0
  4. package/dist/types/cli/update-cli.d.ts +11 -0
  5. package/dist/types/commands/launch.d.ts +3 -0
  6. package/dist/types/debug/remote-debugger.d.ts +45 -0
  7. package/dist/types/goals/runtime.d.ts +4 -1
  8. package/dist/types/internal-urls/docs-index.d.ts +19 -0
  9. package/dist/types/markit/converters/docx.d.ts +6 -0
  10. package/dist/types/markit/converters/epub.d.ts +15 -0
  11. package/dist/types/markit/converters/pdf/columns.d.ts +35 -0
  12. package/dist/types/markit/converters/pdf/extract.d.ts +10 -0
  13. package/dist/types/markit/converters/pdf/grid.d.ts +25 -0
  14. package/dist/types/markit/converters/pdf/headers.d.ts +24 -0
  15. package/dist/types/markit/converters/pdf/index.d.ts +6 -0
  16. package/dist/types/markit/converters/pdf/render.d.ts +24 -0
  17. package/dist/types/markit/converters/pdf/types.d.ts +75 -0
  18. package/dist/types/markit/converters/pptx.d.ts +57 -0
  19. package/dist/types/markit/converters/xlsx.d.ts +25 -0
  20. package/dist/types/markit/index.d.ts +2 -0
  21. package/dist/types/markit/registry.d.ts +16 -0
  22. package/dist/types/markit/types.d.ts +30 -0
  23. package/dist/types/modes/print-mode.d.ts +2 -0
  24. package/dist/types/session/agent-session.d.ts +7 -8
  25. package/dist/types/session/auth-storage.d.ts +3 -2
  26. package/dist/types/session/yield-queue.d.ts +3 -1
  27. package/dist/types/tools/browser/attach.d.ts +1 -1
  28. package/dist/types/utils/markit.d.ts +0 -8
  29. package/dist/types/utils/mupdf-wasm-embed.d.ts +1 -0
  30. package/dist/types/utils/turndown.d.ts +15 -0
  31. package/dist/types/utils/zip.d.ts +119 -0
  32. package/package.json +20 -18
  33. package/scripts/build-binary.ts +7 -3
  34. package/scripts/bundle-dist.ts +28 -12
  35. package/scripts/embed-mupdf-wasm.ts +67 -0
  36. package/scripts/generate-docs-index.ts +48 -32
  37. package/scripts/omp +1 -1
  38. package/src/advisor/__tests__/advisor.test.ts +83 -0
  39. package/src/advisor/runtime.ts +16 -1
  40. package/src/cli/args.ts +3 -0
  41. package/src/cli/auth-broker-cli.ts +1 -3
  42. package/src/cli/auth-gateway-cli.ts +2 -5
  43. package/src/cli/flag-tables.ts +1 -0
  44. package/src/cli/update-cli.ts +63 -3
  45. package/src/commands/launch.ts +3 -0
  46. package/src/config/model-discovery.ts +20 -8
  47. package/src/config/models-config-schema.ts +8 -1
  48. package/src/debug/index.ts +44 -0
  49. package/src/debug/remote-debugger.ts +151 -0
  50. package/src/debug/report-bundle.ts +2 -1
  51. package/src/goals/runtime.ts +19 -7
  52. package/src/internal-urls/docs-index.generated.txt +2 -0
  53. package/src/internal-urls/docs-index.ts +102 -0
  54. package/src/internal-urls/omp-protocol.ts +10 -9
  55. package/src/main.ts +8 -0
  56. package/src/markit/NOTICE +32 -0
  57. package/src/markit/converters/docx.ts +56 -0
  58. package/src/markit/converters/epub.ts +136 -0
  59. package/src/markit/converters/mammoth.d.ts +24 -0
  60. package/src/markit/converters/pdf/columns.ts +103 -0
  61. package/src/markit/converters/pdf/extract.ts +574 -0
  62. package/src/markit/converters/pdf/grid.ts +780 -0
  63. package/src/markit/converters/pdf/headers.ts +106 -0
  64. package/src/markit/converters/pdf/index.ts +146 -0
  65. package/src/markit/converters/pdf/render.ts +501 -0
  66. package/src/markit/converters/pdf/types.ts +84 -0
  67. package/src/markit/converters/pptx.ts +325 -0
  68. package/src/markit/converters/xlsx.ts +173 -0
  69. package/src/markit/index.ts +2 -0
  70. package/src/markit/registry.ts +59 -0
  71. package/src/markit/types.ts +35 -0
  72. package/src/modes/components/snapcompact-shape-preview-doc.md +14 -7
  73. package/src/modes/components/snapcompact-shape-preview.ts +2 -2
  74. package/src/modes/controllers/input-controller.ts +29 -8
  75. package/src/modes/interactive-mode.ts +33 -12
  76. package/src/modes/print-mode.ts +5 -1
  77. package/src/prompts/advisor/advise-tool.md +3 -1
  78. package/src/prompts/advisor/system.md +55 -11
  79. package/src/sdk.ts +5 -9
  80. package/src/session/agent-session.ts +72 -42
  81. package/src/session/auth-storage.ts +2 -11
  82. package/src/session/yield-queue.ts +7 -1
  83. package/src/tools/browser/attach.ts +2 -2
  84. package/src/tools/fetch.ts +25 -60
  85. package/src/tools/read.ts +1 -1
  86. package/src/tools/search.ts +1 -6
  87. package/src/tools/write.ts +25 -65
  88. package/src/utils/markit.ts +25 -9
  89. package/src/utils/mupdf-wasm-embed.ts +12 -0
  90. package/src/utils/tools-manager.ts +2 -11
  91. package/src/utils/turndown.ts +83 -0
  92. package/src/{tools/archive-reader.ts → utils/zip.ts} +453 -83
  93. package/src/web/scrapers/types.ts +3 -46
  94. package/dist/types/internal-urls/docs-index.generated.d.ts +0 -2
  95. package/dist/types/tools/archive-reader.d.ts +0 -49
  96. package/src/internal-urls/docs-index.generated.ts +0 -120
@@ -20,12 +20,11 @@ import { CachedOutputBlock, markFramedBlockComponent } from "../tui/output-block
20
20
  import { webpExclusionForModel } from "../utils/image-loading";
21
21
  import { formatDimensionNote, resizeImage } from "../utils/image-resize";
22
22
  import { ensureTool } from "../utils/tools-manager";
23
+ import { type ArchiveFormat, listArchiveRoot, sniffArchiveFormat } from "../utils/zip";
23
24
  import { extractWithParallel, findParallelApiKey, getParallelExtractContent } from "../web/parallel";
24
- import { specialHandlers } from "../web/scrapers";
25
- import type { RenderResult } from "../web/scrapers/types";
25
+ import type { RenderResult, SpecialHandler } from "../web/scrapers/types";
26
26
  import { finalizeOutput, loadPage, looksLikeHtml, MAX_BYTES, MAX_OUTPUT_CHARS } from "../web/scrapers/types";
27
27
  import { convertWithMarkit, fetchBinary } from "../web/scrapers/utils";
28
- import { type ArchiveFormat, listArchiveRoot, sniffArchiveFormat } from "./archive-reader";
29
28
  import { applyListLimit } from "./list-limit";
30
29
  import { formatStyledArtifactReference, type OutputMeta } from "./output-meta";
31
30
  import { type LineRange, parseLineRanges } from "./path-utils";
@@ -51,34 +50,9 @@ const CONVERTIBLE_MIMES = new Set([
51
50
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
52
51
  "application/rtf",
53
52
  "application/epub+zip",
54
- "image/png",
55
- "image/jpeg",
56
- "image/gif",
57
- "image/webp",
58
- "audio/mpeg",
59
- "audio/wav",
60
- "audio/ogg",
61
53
  ]);
62
54
 
63
- const CONVERTIBLE_EXTENSIONS = new Set([
64
- ".pdf",
65
- ".doc",
66
- ".docx",
67
- ".ppt",
68
- ".pptx",
69
- ".xls",
70
- ".xlsx",
71
- ".rtf",
72
- ".epub",
73
- ".png",
74
- ".jpg",
75
- ".jpeg",
76
- ".gif",
77
- ".webp",
78
- ".mp3",
79
- ".wav",
80
- ".ogg",
81
- ]);
55
+ const CONVERTIBLE_EXTENSIONS = new Set([".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".rtf", ".epub"]);
82
56
 
83
57
  const NOTEBOOK_MIMES = new Set(["application/x-ipynb+json"]);
84
58
  const NOTEBOOK_EXTENSIONS = new Set([".ipynb"]);
@@ -1044,6 +1018,18 @@ async function tryRenderBinaryPayload(
1044
1018
  // Unified Special Handler Dispatch
1045
1019
  // =============================================================================
1046
1020
 
1021
+ let specialHandlersPromise: Promise<SpecialHandler[]> | undefined;
1022
+
1023
+ /**
1024
+ * Lazily load the site-specific scraper handlers. The scrapers barrel eagerly
1025
+ * imports ~80 site modules, none of which are needed until the first fetch that
1026
+ * requires a special handler, so we keep them out of the cold-startup graph.
1027
+ */
1028
+ function loadSpecialHandlers(): Promise<SpecialHandler[]> {
1029
+ specialHandlersPromise ??= import("../web/scrapers").then(m => m.specialHandlers);
1030
+ return specialHandlersPromise;
1031
+ }
1032
+
1047
1033
  /**
1048
1034
  * Try all special handlers
1049
1035
  */
@@ -1053,6 +1039,7 @@ async function handleSpecialUrls(
1053
1039
  signal: AbortSignal | undefined,
1054
1040
  storage: AgentStorage | null,
1055
1041
  ): Promise<FetchRenderResult | null> {
1042
+ const specialHandlers = await loadSpecialHandlers();
1056
1043
  for (const handler of specialHandlers) {
1057
1044
  if (signal?.aborted) {
1058
1045
  throw new ToolAbortError();
@@ -1144,45 +1131,25 @@ async function renderUrl(
1144
1131
  notes.push(
1145
1132
  `Image MIME type ${imageMimeType} is unsupported for inline model serialization; returning text metadata only`,
1146
1133
  );
1147
- const shouldTryConvertibleFallback = isConvertible(mime, extHint);
1148
- if (shouldTryConvertibleFallback) {
1149
- notes.push("Attempting binary conversion fallback for unsupported image MIME type");
1150
- } else {
1151
- notes.push("Falling back to textual rendering from initial response");
1152
- }
1153
- skipConvertibleBinaryRetry = !shouldTryConvertibleFallback;
1134
+ notes.push("Falling back to textual rendering from initial response");
1135
+ skipConvertibleBinaryRetry = true;
1154
1136
  } else {
1155
1137
  const binary = await fetchBinary(finalUrl, timeout, signal);
1156
1138
  if (binary.ok) {
1157
1139
  notes.push("Fetched image binary");
1158
- const conversionExtension = getExtensionHint(finalUrl, binary.contentDisposition) || extHint;
1159
- let convertedText: string | null = null;
1160
- const converted = await convertWithMarkit(binary.buffer, conversionExtension, timeout, signal);
1161
- if (converted.ok) {
1162
- if (converted.content.trim().length > 50) {
1163
- notes.push("Converted with markit");
1164
- convertedText = converted.content;
1165
- } else {
1166
- notes.push("markit conversion produced no usable output");
1167
- }
1168
- } else if (converted.error) {
1169
- notes.push(`markit conversion failed: ${converted.error}`);
1170
- } else {
1171
- notes.push("markit conversion failed");
1172
- }
1173
1140
 
1174
1141
  if (binary.buffer.byteLength > MAX_INLINE_IMAGE_SOURCE_BYTES) {
1175
1142
  notes.push(
1176
1143
  `Image exceeds inline source limit (${binary.buffer.byteLength} bytes > ${MAX_INLINE_IMAGE_SOURCE_BYTES} bytes)`,
1177
1144
  );
1178
1145
  const output = finalizeOutput(
1179
- convertedText ?? `Fetched image content (${imageMimeType}), but it is too large to inline render.`,
1146
+ `Fetched image content (${imageMimeType}), but it is too large to inline render.`,
1180
1147
  );
1181
1148
  return {
1182
1149
  url,
1183
1150
  finalUrl,
1184
1151
  contentType: imageMimeType,
1185
- method: convertedText ? "markit" : "image-too-large",
1152
+ method: "image-too-large",
1186
1153
  content: output.content,
1187
1154
  fetchedAt,
1188
1155
  truncated: output.truncated,
@@ -1199,15 +1166,13 @@ async function renderUrl(
1199
1166
  if (!isDecodedImage) {
1200
1167
  notes.push(`Fetched payload could not be decoded as ${imageMimeType}; returning text metadata only`);
1201
1168
  const output = finalizeOutput(
1202
- convertedText ??
1203
- rawContent ??
1204
- `Fetched payload was labeled ${imageMimeType}, but bytes were not a valid image.`,
1169
+ rawContent ?? `Fetched payload was labeled ${imageMimeType}, but bytes were not a valid image.`,
1205
1170
  );
1206
1171
  return {
1207
1172
  url,
1208
1173
  finalUrl,
1209
1174
  contentType: imageMimeType,
1210
- method: convertedText ? "markit" : "image-invalid",
1175
+ method: "image-invalid",
1211
1176
  content: output.content,
1212
1177
  fetchedAt,
1213
1178
  truncated: output.truncated,
@@ -1219,13 +1184,13 @@ async function renderUrl(
1219
1184
  `Image exceeds inline output limit after resize (${resized.buffer.length} bytes > ${MAX_INLINE_IMAGE_OUTPUT_BYTES} bytes)`,
1220
1185
  );
1221
1186
  const output = finalizeOutput(
1222
- convertedText ?? `Fetched image content (${imageMimeType}), but it is too large to inline render.`,
1187
+ `Fetched image content (${imageMimeType}), but it is too large to inline render.`,
1223
1188
  );
1224
1189
  return {
1225
1190
  url,
1226
1191
  finalUrl,
1227
1192
  contentType: imageMimeType,
1228
- method: convertedText ? "markit" : "image-too-large",
1193
+ method: "image-too-large",
1229
1194
  content: output.content,
1230
1195
  fetchedAt,
1231
1196
  truncated: output.truncated,
@@ -1234,7 +1199,7 @@ async function renderUrl(
1234
1199
  }
1235
1200
 
1236
1201
  const dimensionNote = formatDimensionNote(resized);
1237
- let imageSummary = convertedText ?? `Fetched image content (${resized.mimeType}).`;
1202
+ let imageSummary = `Fetched image content (${resized.mimeType}).`;
1238
1203
  if (dimensionNote) {
1239
1204
  imageSummary += `\n${dimensionNote}`;
1240
1205
  }
package/src/tools/read.ts CHANGED
@@ -48,8 +48,8 @@ import {
48
48
  webpExclusionForModel,
49
49
  } from "../utils/image-loading";
50
50
  import { convertFileWithMarkit } from "../utils/markit";
51
+ import { type ArchiveReader, formatArchiveEntryLines, openArchive, parseArchivePathCandidates } from "../utils/zip";
51
52
  import { buildDirectoryTree, type DirectoryTree } from "../workspace-tree";
52
- import { type ArchiveReader, formatArchiveEntryLines, openArchive, parseArchivePathCandidates } from "./archive-reader";
53
53
  import {
54
54
  type ConflictEntry,
55
55
  type ConflictScope,
@@ -28,13 +28,8 @@ import {
28
28
  uriHyperlink,
29
29
  } from "../tui";
30
30
  import { resolveFileDisplayMode } from "../utils/file-display-mode";
31
+ import { type ArchiveReader, type ExtractedArchiveFile, openArchive, parseArchivePathCandidates } from "../utils/zip";
31
32
  import type { ToolSession } from ".";
32
- import {
33
- type ArchiveReader,
34
- type ExtractedArchiveFile,
35
- openArchive,
36
- parseArchivePathCandidates,
37
- } from "./archive-reader";
38
33
  import { createFileRecorder, formatResultPath } from "./file-recorder";
39
34
  import { classifyGroupedLines, formatGroupedFiles, groupLineIndicesByBlank } from "./grouped-file-output";
40
35
  import { formatMatchLine } from "./match-line-format";
@@ -20,8 +20,14 @@ import writeDescription from "../prompts/tools/write.md" with { type: "text" };
20
20
  import type { ToolSession } from "../sdk";
21
21
  import { fileHyperlink, framedBlock, renderStatusLine } from "../tui";
22
22
  import { resolveFileDisplayMode } from "../utils/file-display-mode";
23
+ import {
24
+ type ArchiveMemberContent,
25
+ archiveFormatFromPath,
26
+ parseArchivePathCandidates,
27
+ readArchiveEntries,
28
+ writeArchive,
29
+ } from "../utils/zip";
23
30
  import { truncateForPrompt } from "./approval";
24
- import { parseArchivePathCandidates } from "./archive-reader";
25
31
  import { assertEditableFile } from "./auto-generated-guard";
26
32
  import {
27
33
  type ConflictEntry,
@@ -65,12 +71,6 @@ import { toolResult } from "./tool-result";
65
71
  const LOOSE_HASHLINE_HEADER_RE = /^\s*\[[^#\r\n]+#[^ \t\r\n]*\]\s*$/;
66
72
  const EXECUTABLE_NOTICE = "[Notice: Made executable via chmod +x]";
67
73
 
68
- let fflateModulePromise: Promise<typeof import("fflate")> | undefined;
69
- async function loadFflate(): Promise<typeof import("fflate")> {
70
- if (!fflateModulePromise) fflateModulePromise = import("fflate");
71
- return fflateModulePromise;
72
- }
73
-
74
74
  const writeSchema = type({
75
75
  path: type("string").describe("file path"),
76
76
  content: type("string").describe("file content"),
@@ -369,9 +369,10 @@ export class WriteTool implements AgentTool<typeof writeSchema, WriteToolDetails
369
369
  const finalPath = resolvedArchivePath.exists
370
370
  ? await fs.realpath(resolvedArchivePath.absolutePath).catch(() => resolvedArchivePath.absolutePath)
371
371
  : resolvedArchivePath.absolutePath;
372
- const lowerPath = finalPath.toLowerCase();
373
- const isZip = lowerPath.endsWith(".zip");
374
- const isGzip = lowerPath.endsWith(".tar.gz") || lowerPath.endsWith(".tgz");
372
+ // A realpath swap can land on a name without an archive extension; a
373
+ // whole-archive rewrite then defaults to an uncompressed tar, matching the
374
+ // previous `isZip`/`isGzip`/else fallthrough.
375
+ const format = archiveFormatFromPath(finalPath) ?? "tar";
375
376
  // Rewrites are whole-archive: write to a temp file and rename so a
376
377
  // crash/disk-full mid-write can't destroy the original archive.
377
378
  const tmpPath = `${finalPath}.tmp-${process.pid}`;
@@ -381,67 +382,26 @@ export class WriteTool implements AgentTool<typeof writeSchema, WriteToolDetails
381
382
  await fs.mkdir(parentDir, { recursive: true });
382
383
  }
383
384
 
384
- if (isZip) {
385
- const zipEntries: Record<string, Uint8Array> = {};
386
-
387
- if (resolvedArchivePath.exists) {
388
- try {
389
- const bytes = await Bun.file(resolvedArchivePath.absolutePath).bytes();
390
- const { unzipSync } = await loadFflate();
391
- const existing = unzipSync(new Uint8Array(bytes));
392
- for (const [entryPath, data] of Object.entries(existing)) {
393
- zipEntries[entryPath.replace(/\\/g, "/")] = data;
394
- }
395
- } catch (error) {
396
- throw new ToolError(error instanceof Error ? error.message : String(error));
397
- }
398
- }
399
-
400
- zipEntries[resolvedArchivePath.archiveSubPath] = new TextEncoder().encode(content);
401
-
385
+ const entries = new Map<string, ArchiveMemberContent>();
386
+ if (resolvedArchivePath.exists) {
402
387
  try {
403
- const { zipSync } = await loadFflate();
404
- const zipBuffer = zipSync(zipEntries);
405
- await Bun.write(tmpPath, zipBuffer);
406
- await fs.rename(tmpPath, finalPath);
407
- } catch (error) {
408
- await fs.rm(tmpPath, { force: true }).catch(() => {});
409
- throw new ToolError(error instanceof Error ? error.message : String(error));
410
- }
411
- } else {
412
- const archiveEntries: Record<string, string | File> = {};
413
- if (resolvedArchivePath.exists) {
414
- let archive: Bun.Archive;
415
- try {
416
- archive = new Bun.Archive(await Bun.file(resolvedArchivePath.absolutePath).bytes());
417
- } catch (error) {
418
- throw new ToolError(error instanceof Error ? error.message : String(error));
388
+ const existing = await readArchiveEntries({ bytes: await Bun.file(finalPath).bytes(), format });
389
+ for (const [entryPath, data] of existing) {
390
+ entries.set(entryPath, data);
419
391
  }
420
-
421
- let files: Map<string, File>;
422
- try {
423
- files = await archive.files();
424
- } catch (error) {
425
- throw new ToolError(error instanceof Error ? error.message : String(error));
426
- }
427
-
428
- for (const [entryPath, file] of files) {
429
- archiveEntries[entryPath.replace(/\\/g, "/")] = file;
430
- }
431
- }
432
-
433
- archiveEntries[resolvedArchivePath.archiveSubPath] = content;
434
-
435
- try {
436
- // `Bun.Archive.write` never infers compression from the extension;
437
- // request gzip explicitly so `.tar.gz`/`.tgz` stay compressed.
438
- await Bun.Archive.write(tmpPath, archiveEntries, isGzip ? { compress: "gzip" } : undefined);
439
- await fs.rename(tmpPath, finalPath);
440
392
  } catch (error) {
441
- await fs.rm(tmpPath, { force: true }).catch(() => {});
442
393
  throw new ToolError(error instanceof Error ? error.message : String(error));
443
394
  }
444
395
  }
396
+ entries.set(resolvedArchivePath.archiveSubPath, content);
397
+
398
+ try {
399
+ await writeArchive(tmpPath, format, entries);
400
+ await fs.rename(tmpPath, finalPath);
401
+ } catch (error) {
402
+ await fs.rm(tmpPath, { force: true }).catch(() => {});
403
+ throw new ToolError(error instanceof Error ? error.message : String(error));
404
+ }
445
405
 
446
406
  invalidateFsScanAfterWrite(resolvedArchivePath.absolutePath);
447
407
  const outputPath = `${formatPathRelativeToCwd(resolvedArchivePath.absolutePath, this.session.cwd)}:${
@@ -1,6 +1,7 @@
1
1
  import { logger, untilAborted } from "@oh-my-pi/pi-utils";
2
- import type { Markit, StreamInfo } from "markit-ai";
2
+ import type { Markit, StreamInfo } from "../markit";
3
3
  import { ToolAbortError } from "../tools/tool-errors";
4
+ import { loadEmbeddedMupdfWasm } from "./mupdf-wasm-embed";
4
5
 
5
6
  export interface MarkitConversionResult {
6
7
  content: string;
@@ -21,10 +22,7 @@ export interface MarkitFileConversionOptions {
21
22
  interface MuPdfWasmModuleConfig {
22
23
  print?: (...values: unknown[]) => void;
23
24
  printErr?: (...values: unknown[]) => void;
24
- }
25
-
26
- declare global {
27
- var $libmupdf_wasm_Module: MuPdfWasmModuleConfig | undefined;
25
+ wasmBinary?: Uint8Array;
28
26
  }
29
27
 
30
28
  function logMuPdfWasmOutput(stream: "stdout" | "stderr", values: unknown[]): void {
@@ -32,17 +30,35 @@ function logMuPdfWasmOutput(stream: "stdout" | "stderr", values: unknown[]): voi
32
30
  logger.debug("mupdf wasm output", { stream, message });
33
31
  }
34
32
 
33
+ // `$libmupdf_wasm_Module` is declared globally (as `any`) by the mupdf package.
34
+ // Install print hooks before the WASM module initializes so its stdout/stderr
35
+ // route to the file logger instead of corrupting the TUI.
35
36
  function installMuPdfWasmLogger(): void {
36
- const moduleConfig = globalThis.$libmupdf_wasm_Module ?? {};
37
- moduleConfig.print = (...values) => logMuPdfWasmOutput("stdout", values);
38
- moduleConfig.printErr = (...values) => logMuPdfWasmOutput("stderr", values);
37
+ const moduleConfig: MuPdfWasmModuleConfig = globalThis.$libmupdf_wasm_Module ?? {};
38
+ moduleConfig.print = (...values: unknown[]) => logMuPdfWasmOutput("stdout", values);
39
+ moduleConfig.printErr = (...values: unknown[]) => logMuPdfWasmOutput("stderr", values);
40
+ globalThis.$libmupdf_wasm_Module = moduleConfig;
41
+ }
42
+
43
+ // Hand the WASM module its bytes directly when the compiled binary embedded them
44
+ // (scripts/embed-mupdf-wasm.ts); a single-file binary has no node_modules for
45
+ // mupdf to read `mupdf-wasm.wasm` from. Source/npm builds get undefined here and
46
+ // mupdf loads its own wasm. Must run before the mupdf module evaluates.
47
+ function installEmbeddedMupdfWasm(): void {
48
+ const wasmBinary = loadEmbeddedMupdfWasm();
49
+ if (!wasmBinary) return;
50
+ const moduleConfig: MuPdfWasmModuleConfig = globalThis.$libmupdf_wasm_Module ?? {};
51
+ moduleConfig.wasmBinary = wasmBinary;
39
52
  globalThis.$libmupdf_wasm_Module = moduleConfig;
40
53
  }
41
54
 
42
55
  installMuPdfWasmLogger();
43
56
 
44
57
  let markit: () => Markit | Promise<Markit> = async () => {
45
- const promise = import("markit-ai").then(({ Markit }) => {
58
+ // Lazy: keep the document engine (mammoth/mupdf) off the startup
59
+ // import graph — it loads only when a document is first converted.
60
+ installEmbeddedMupdfWasm();
61
+ const promise = import("../markit").then(({ Markit }) => {
46
62
  const instance = new Markit();
47
63
  markit = () => instance;
48
64
  return instance;
@@ -0,0 +1,12 @@
1
+ // AUTOGENERATED -- managed by scripts/embed-mupdf-wasm.ts. Do not edit by hand.
2
+ //
3
+ // Compiled single-file binaries cannot let mupdf resolve its `mupdf-wasm.wasm`
4
+ // sibling from the read-only bunfs, so the binary build (scripts/build-binary.ts
5
+ // and scripts/ci-release-build-binaries.ts) regenerates this module to embed the
6
+ // wasm bytes via `with { type: "file" }` and copies the wasm next to it. Source
7
+ // checkouts, `bun test`, and the npm `dist/cli.js` bundle keep mupdf external and
8
+ // load the wasm from node_modules, so this placeholder returns undefined and the
9
+ // build resets back to it afterward.
10
+ export function loadEmbeddedMupdfWasm(): Uint8Array | undefined {
11
+ return undefined;
12
+ }
@@ -2,6 +2,7 @@ import * as fs from "node:fs";
2
2
  import * as os from "node:os";
3
3
  import * as path from "node:path";
4
4
  import { $which, APP_NAME, getToolsDir, logger, ptree, TempDir } from "@oh-my-pi/pi-utils";
5
+ import { extractArchive } from "./zip";
5
6
 
6
7
  const TOOLS_DIR = getToolsDir();
7
8
  const TOOL_DOWNLOAD_TIMEOUT_MS = 120_000;
@@ -220,17 +221,7 @@ async function downloadTool(tool: ToolName, signal?: AbortSignal): Promise<strin
220
221
  }
221
222
 
222
223
  try {
223
- const archive = new Bun.Archive(await Bun.file(archivePath).arrayBuffer());
224
- const files = await archive.files();
225
- const extractRoot = path.resolve(tmp.path());
226
-
227
- for (const [filePath, file] of files) {
228
- const outputPath = path.resolve(extractRoot, filePath);
229
- if (!outputPath.startsWith(extractRoot + path.sep)) {
230
- throw new Error(`Archive entry escapes extraction dir: ${filePath}`);
231
- }
232
- await Bun.write(outputPath, file);
233
- }
224
+ await extractArchive(archivePath, tmp.path());
234
225
  } catch (err) {
235
226
  throw new Error(`Failed to extract ${assetName}: ${err instanceof Error ? err.message : String(err)}`);
236
227
  }
@@ -0,0 +1,83 @@
1
+ import TurndownService from "turndown";
2
+ import { gfm } from "turndown-plugin-gfm";
3
+
4
+ type TurndownListParent = {
5
+ nodeName: string;
6
+ getAttribute(name: string): string | null;
7
+ children: ArrayLike<unknown>;
8
+ };
9
+
10
+ /**
11
+ * Build a Turndown instance configured for GFM with the fixes omp relies on:
12
+ * `~~strikethrough~~`, unescaped heading periods, and single-space list markers.
13
+ *
14
+ * Shared by the web scrapers (HTML → markdown) and the markit document engine
15
+ * (`src/markit`). The rule set must stay identical across both call sites.
16
+ */
17
+ export function createTurndown(): TurndownService {
18
+ const turndown = new TurndownService({
19
+ headingStyle: "atx",
20
+ codeBlockStyle: "fenced",
21
+ bulletListMarker: "-",
22
+ });
23
+ turndown.use(gfm);
24
+ // GFM spec uses ~~ (double tilde), not ~ (single)
25
+ turndown.addRule("strikethrough", {
26
+ filter: ["del", "s", "strike"],
27
+ replacement(content) {
28
+ return `~~${content}~~`;
29
+ },
30
+ });
31
+ // Unescape the backslash turndown inserts before periods in headings ("1." -> "1\.")
32
+ turndown.addRule("heading", {
33
+ filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
34
+ replacement(content, node) {
35
+ const level = Number(node.nodeName.charAt(1));
36
+ const prefix = "#".repeat(level);
37
+ const cleaned = content.replace(/\\([.])/g, "$1").trim();
38
+ return `\n\n${prefix} ${cleaned}\n\n`;
39
+ },
40
+ });
41
+ // Single space after the marker (turndown hardcodes three)
42
+ turndown.addRule("listItem", {
43
+ filter: "li",
44
+ replacement(content, node, options) {
45
+ const body = content.replace(/^\n+/, "").replace(/\n+$/, "\n").replace(/\n/gm, "\n ");
46
+ const parent = node.parentNode as unknown as TurndownListParent | null;
47
+ let prefix = `${options.bulletListMarker} `;
48
+ if (parent?.nodeName === "OL") {
49
+ const start = parent.getAttribute("start");
50
+ const index = Array.prototype.indexOf.call(parent.children, node);
51
+ prefix = `${(start ? Number(start) : 1) + index}. `;
52
+ }
53
+ return prefix + body + (node.nextSibling ? "\n" : "");
54
+ },
55
+ });
56
+ return turndown;
57
+ }
58
+
59
+ /**
60
+ * Normalize HTML tables so turndown-plugin-gfm can render them:
61
+ * - strip `<p>` tags inside `<td>`/`<th>` cells (joining paragraphs with a space)
62
+ * - wrap the first row in `<thead>` when missing
63
+ */
64
+ export function normalizeTablesHtml(html: string): string {
65
+ let result = html.replace(
66
+ /<(td|th)([^>]*)>([\s\S]*?)<\/(td|th)>/gi,
67
+ (_match, tag: string, attrs: string, inner: string, closeTag: string) => {
68
+ const stripped = inner
69
+ .replace(/^\s*<p>/i, "")
70
+ .replace(/<\/p>\s*$/i, "")
71
+ .replace(/<\/p>\s*<p>/gi, " ");
72
+ return `<${tag}${attrs}>${stripped}</${closeTag}>`;
73
+ },
74
+ );
75
+ result = result.replace(
76
+ /<table([^>]*)>\s*(?:<tbody>\s*)?(<tr[\s\S]*?<\/tr>)([\s\S]*?)<\/(?:tbody>\s*<\/)?table>/gi,
77
+ (_match, attrs: string, firstRow: string, rest: string) => {
78
+ const theadRow = firstRow.replace(/<td/gi, "<th").replace(/<\/td>/gi, "</th>");
79
+ return `<table${attrs}><thead>${theadRow}</thead><tbody>${rest}</tbody></table>`;
80
+ },
81
+ );
82
+ return result;
83
+ }