botholomew 0.13.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/chat/agent.ts +17 -4
- package/src/commands/context.ts +35 -9
- package/src/context/fetcher-errors.ts +8 -0
- package/src/context/fetcher.ts +96 -27
- package/src/context/markdown-converter.ts +186 -0
- package/src/context/store.ts +209 -36
- package/src/fs/sandbox.ts +18 -4
- package/src/tools/dir/create.ts +1 -1
- package/src/tools/dir/tree.ts +3 -2
- package/src/tools/file/copy.ts +1 -1
- package/src/tools/file/delete.ts +11 -2
- package/src/tools/file/edit.ts +1 -1
- package/src/tools/file/info.ts +3 -1
- package/src/tools/file/move.ts +1 -1
- package/src/tools/file/write.ts +1 -1
- package/src/tools/registry.ts +5 -0
- package/src/tools/tool.ts +5 -0
- package/src/tools/util/sleep.ts +77 -0
- package/src/tui/components/SleepProgress.tsx +70 -0
- package/src/tui/components/ToolCall.tsx +10 -0
- package/src/utils/frontmatter.ts +10 -2
package/package.json
CHANGED
package/src/chat/agent.ts
CHANGED
|
@@ -62,6 +62,7 @@ const CHAT_TOOL_NAMES = new Set([
|
|
|
62
62
|
"skill_edit",
|
|
63
63
|
"skill_search",
|
|
64
64
|
"skill_delete",
|
|
65
|
+
"sleep",
|
|
65
66
|
]);
|
|
66
67
|
|
|
67
68
|
export function getChatTools() {
|
|
@@ -364,6 +365,7 @@ export async function runChatTurn(input: {
|
|
|
364
365
|
projectDir,
|
|
365
366
|
config,
|
|
366
367
|
mcpxClient,
|
|
368
|
+
shouldAbort: session ? () => session.aborted : undefined,
|
|
367
369
|
});
|
|
368
370
|
const durationMs = Date.now() - start;
|
|
369
371
|
const stored = maybeStoreResult(toolUse.name, result.output);
|
|
@@ -411,6 +413,7 @@ interface ChatToolCallCtx {
|
|
|
411
413
|
projectDir: string;
|
|
412
414
|
config: Required<BotholomewConfig>;
|
|
413
415
|
mcpxClient: McpxClient | null;
|
|
416
|
+
shouldAbort?: () => boolean;
|
|
414
417
|
}
|
|
415
418
|
|
|
416
419
|
async function executeChatToolCall(
|
|
@@ -434,10 +437,20 @@ async function executeChatToolCall(
|
|
|
434
437
|
}
|
|
435
438
|
|
|
436
439
|
try {
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
440
|
+
// `sleep` deliberately yields for up to an hour; opening a DuckDB
|
|
441
|
+
// connection for that whole window would hold the instance-level file
|
|
442
|
+
// lock and block any worker that also wants the DB. Run it without a
|
|
443
|
+
// connection — the tool doesn't touch the DB.
|
|
444
|
+
const runWithoutDb = tool.name === "sleep";
|
|
445
|
+
const result = runWithoutDb
|
|
446
|
+
? await tool.execute(parsed.data, {
|
|
447
|
+
...baseCtx,
|
|
448
|
+
conn: undefined as unknown as ToolContext["conn"],
|
|
449
|
+
})
|
|
450
|
+
: await withDb(baseCtx.dbPath, (conn) => {
|
|
451
|
+
const ctx: ToolContext = { ...baseCtx, conn };
|
|
452
|
+
return tool.execute(parsed.data, ctx);
|
|
453
|
+
});
|
|
441
454
|
const isError =
|
|
442
455
|
typeof result === "object" && result !== null && "is_error" in result
|
|
443
456
|
? (result as { is_error: boolean }).is_error
|
package/src/commands/context.ts
CHANGED
|
@@ -18,6 +18,10 @@ import { withDb } from "../db/connection.ts";
|
|
|
18
18
|
import { indexStats } from "../db/embeddings.ts";
|
|
19
19
|
import { migrate } from "../db/schema.ts";
|
|
20
20
|
import { createMcpxClient } from "../mcpx/client.ts";
|
|
21
|
+
import {
|
|
22
|
+
type ContextFileMeta,
|
|
23
|
+
serializeContextFile,
|
|
24
|
+
} from "../utils/frontmatter.ts";
|
|
21
25
|
import { logger } from "../utils/logger.ts";
|
|
22
26
|
|
|
23
27
|
export function registerContextCommand(program: Command) {
|
|
@@ -46,21 +50,42 @@ export function registerContextCommand(program: Command) {
|
|
|
46
50
|
const dir = program.opts().dir;
|
|
47
51
|
const config = await loadConfig(dir);
|
|
48
52
|
const mcpxClient = await createMcpxClient(dir);
|
|
49
|
-
|
|
53
|
+
logger.info(`importing ${url}`);
|
|
50
54
|
try {
|
|
51
55
|
const fetched = await fetchUrl(url, config, mcpxClient, opts.prompt);
|
|
52
|
-
spinner.update({ text: "writing to context/" });
|
|
53
56
|
const dest = opts.path ?? deriveContextPath(url, fetched.source);
|
|
54
|
-
|
|
57
|
+
const meta: ContextFileMeta = {
|
|
58
|
+
source_url: url,
|
|
59
|
+
imported_at: new Date().toISOString(),
|
|
60
|
+
};
|
|
61
|
+
// Title falls back to the URL when fetcher couldn't extract one —
|
|
62
|
+
// skip it in that case to avoid duplicating source_url.
|
|
63
|
+
if (fetched.title && fetched.title !== url) {
|
|
64
|
+
meta.title = fetched.title;
|
|
65
|
+
}
|
|
66
|
+
const body = serializeContextFile(meta, fetched.content);
|
|
67
|
+
await writeContextFile(dir, dest, body, {
|
|
55
68
|
onConflict: opts.overwrite ? "overwrite" : "error",
|
|
56
69
|
});
|
|
57
|
-
|
|
58
|
-
|
|
70
|
+
logger.success(
|
|
71
|
+
`imported ${body.length} bytes → ${ansis.bold(`context/${dest}`)} (source: ${fetched.source ?? "http"})`,
|
|
72
|
+
);
|
|
73
|
+
|
|
74
|
+
// Reindex so the new file is searchable. reindexContext is
|
|
75
|
+
// incremental — files whose content_hash matches the index are
|
|
76
|
+
// skipped, so this only embeds the file we just wrote.
|
|
77
|
+
const dbPath = getDbPath(dir);
|
|
78
|
+
await withDb(dbPath, migrate);
|
|
79
|
+
const summary = await reindexContext(dir, config, dbPath, {
|
|
80
|
+
onProgress: (msg) => logger.dim(` ${msg}`),
|
|
59
81
|
});
|
|
82
|
+
logger.success(
|
|
83
|
+
`indexed: ${summary.added} added, ${summary.updated} updated, ${summary.unchanged} unchanged, ${summary.chunksWritten} chunks written`,
|
|
84
|
+
);
|
|
60
85
|
} catch (err) {
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
86
|
+
logger.error(
|
|
87
|
+
`import failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
88
|
+
);
|
|
64
89
|
process.exit(1);
|
|
65
90
|
} finally {
|
|
66
91
|
await mcpxClient?.close();
|
|
@@ -196,9 +221,10 @@ function renderTreeAnsi(
|
|
|
196
221
|
): string {
|
|
197
222
|
const lines: string[] = [];
|
|
198
223
|
const connector = isRoot ? "" : isLast ? "└── " : "├── ";
|
|
199
|
-
const
|
|
224
|
+
const base = node.is_directory
|
|
200
225
|
? ansis.blue(node.name === "." ? "context/" : `${node.name}/`)
|
|
201
226
|
: node.name;
|
|
227
|
+
const label = node.is_symlink ? `${base} ${ansis.cyan("→")}` : base;
|
|
202
228
|
lines.push(`${prefix}${connector}${label}`);
|
|
203
229
|
if (node.is_directory && node.children) {
|
|
204
230
|
const childPrefix = isRoot ? "" : prefix + (isLast ? " " : "│ ");
|
package/src/context/fetcher.ts
CHANGED
|
@@ -15,8 +15,16 @@ import { mcpSearchTool } from "../tools/mcp/search.ts";
|
|
|
15
15
|
import type { ToolContext } from "../tools/tool.ts";
|
|
16
16
|
import { type AnyToolDefinition, toAnthropicTool } from "../tools/tool.ts";
|
|
17
17
|
import { logger } from "../utils/logger.ts";
|
|
18
|
+
import { FetchFailureError } from "./fetcher-errors.ts";
|
|
19
|
+
import {
|
|
20
|
+
convertToMarkdown,
|
|
21
|
+
isMarkdownMimeType,
|
|
22
|
+
resolveEffectiveMimeType,
|
|
23
|
+
} from "./markdown-converter.ts";
|
|
18
24
|
import { stripHtmlTags } from "./url-utils.ts";
|
|
19
25
|
|
|
26
|
+
export { FetchFailureError } from "./fetcher-errors.ts";
|
|
27
|
+
|
|
20
28
|
const MAX_CONTENT_BYTES = 500_000;
|
|
21
29
|
const MAX_TURNS = 10;
|
|
22
30
|
const MAX_RESPONSE_TOKENS = 4_096;
|
|
@@ -36,29 +44,23 @@ export interface FetchedContent {
|
|
|
36
44
|
source: string | null;
|
|
37
45
|
}
|
|
38
46
|
|
|
39
|
-
export class FetchFailureError extends Error {
|
|
40
|
-
readonly userMessage: string;
|
|
41
|
-
constructor(message: string) {
|
|
42
|
-
super(message);
|
|
43
|
-
this.name = "FetchFailureError";
|
|
44
|
-
this.userMessage = message;
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
47
|
const FETCHER_SYSTEM_PROMPT = `You are a content fetcher. Your job is to find the right MCP tool to retrieve the content at the given URL, run it, and tell the harness which result to save.
|
|
49
48
|
|
|
50
49
|
**Important: the harness captures the full result of every mcp_exec call automatically.** You only see a short preview of each result so you can verify it looks reasonable. You do NOT need to read or copy the full content — you just identify which exec call to save.
|
|
51
50
|
|
|
52
|
-
|
|
51
|
+
**Format preference: markdown, in order of preference.**
|
|
52
|
+
1. When searching with mcp_search or mcp_list_tools, prefer tools whose names indicate markdown output: anything containing "markdown", "md", "AsMarkdown", "AsMd", "AsDocmd", or similar. For example, prefer "GoogleDocs_GetDocumentAsDocmd" over "GoogleDocs_GetDocumentAsHtml".
|
|
53
|
+
2. If no markdown-named variant exists, use mcp_info to inspect the tool's input schema for a "format", "mime_type", "output_format", or similar parameter and request "markdown" (or "md") when available.
|
|
54
|
+
3. If neither is possible, run the tool anyway. The harness will convert the captured content to markdown via a separate LLM call before saving — markdown-native tools are still preferred because they're cheaper and higher fidelity, but you do not have to find one.
|
|
53
55
|
|
|
54
56
|
Workflow:
|
|
55
|
-
1. Use mcp_search or mcp_list_tools to find the best tool for this URL (e.g., Google Docs tools for docs.google.com, Firecrawl for generic web pages, GitHub tools for github.com).
|
|
57
|
+
1. Use mcp_search or mcp_list_tools to find the best tool for this URL (e.g., Google Docs tools for docs.google.com, Firecrawl for generic web pages, GitHub tools for github.com). Apply the format preference above.
|
|
56
58
|
2. Use mcp_info to inspect the tool's input schema.
|
|
57
59
|
3. Call mcp_exec with the right arguments — request markdown format when supported.
|
|
58
|
-
4. Look at the preview returned by mcp_exec. If it looks like the right content, call accept_content with the exec_call_id (the tool_use_id of the mcp_exec call)
|
|
60
|
+
4. Look at the preview returned by mcp_exec. If it looks like the right content, call accept_content with the exec_call_id (the tool_use_id of the mcp_exec call), a sensible title, and the actual mime_type the tool returned (so the harness knows whether to convert).
|
|
59
61
|
|
|
60
62
|
Terminal tools:
|
|
61
|
-
- accept_content(exec_call_id, title, mime_type?) — save the
|
|
63
|
+
- accept_content(exec_call_id, title, mime_type?) — save the content captured from a previous mcp_exec call. The harness has the full content; you supply the id, title, and the source mime_type (e.g., "text/html", "application/json", "text/markdown"). The harness converts to markdown before storage when needed.
|
|
62
64
|
- request_http_fallback() — fall back to a basic HTTP fetch. Use only when no MCP tool can handle the URL after a genuine attempt. Tools like Firecrawl can handle most URLs, so don't give up on the first try.
|
|
63
65
|
- report_failure(message) — surface an actionable message to the user (e.g., "this Google Doc is private — share it with your service account", "Firecrawl is not authenticated"). Use only when there is a specific next step the user must take.`;
|
|
64
66
|
|
|
@@ -147,14 +149,14 @@ export async function fetchUrl(
|
|
|
147
149
|
|
|
148
150
|
if (!mcpxClient) {
|
|
149
151
|
logger.dim(" no MCPX client — using HTTP fallback");
|
|
150
|
-
return httpFallback(url);
|
|
152
|
+
return httpFallback(url, config);
|
|
151
153
|
}
|
|
152
154
|
|
|
153
155
|
const result = await runFetcherLoop(url, config, mcpxClient, promptAddition);
|
|
154
156
|
if (result) return result;
|
|
155
157
|
|
|
156
158
|
logger.dim(" agent signaled fallback — using HTTP");
|
|
157
|
-
return httpFallback(url);
|
|
159
|
+
return httpFallback(url, config);
|
|
158
160
|
}
|
|
159
161
|
|
|
160
162
|
async function runFetcherLoop(
|
|
@@ -292,14 +294,26 @@ async function runFetcherLoop(
|
|
|
292
294
|
});
|
|
293
295
|
continue;
|
|
294
296
|
}
|
|
295
|
-
const
|
|
297
|
+
const claimedMimeType = input.mime_type || cached.mimeType;
|
|
296
298
|
logger.dim(
|
|
297
|
-
` turn ${turn + 1}: accept_content: "${input.title}" (${cached.content.length} chars, ${
|
|
299
|
+
` turn ${turn + 1}: accept_content: "${input.title}" (${cached.content.length} chars, claimed ${claimedMimeType}, from ${cached.server}/${cached.tool})`,
|
|
300
|
+
);
|
|
301
|
+
const truncated = cached.content.slice(0, MAX_CONTENT_BYTES);
|
|
302
|
+
// Always normalize via the converter. MCP tools frequently mislabel
|
|
303
|
+
// format — e.g. Google Docs' "Docmd" tool claims text/markdown but
|
|
304
|
+
// returns a structured `[H1 ...]` annotation format. The converter
|
|
305
|
+
// prompt handles already-clean markdown by echoing it unchanged.
|
|
306
|
+
logger.dim(` normalizing → markdown`);
|
|
307
|
+
const finalContent = await convertToMarkdown(
|
|
308
|
+
truncated,
|
|
309
|
+
claimedMimeType,
|
|
310
|
+
url,
|
|
311
|
+
config,
|
|
298
312
|
);
|
|
299
313
|
return {
|
|
300
314
|
title: input.title,
|
|
301
|
-
content:
|
|
302
|
-
mimeType,
|
|
315
|
+
content: finalContent,
|
|
316
|
+
mimeType: "text/markdown",
|
|
303
317
|
sourceUrl: url,
|
|
304
318
|
source: cached.server,
|
|
305
319
|
};
|
|
@@ -405,7 +419,10 @@ async function runFetcherLoop(
|
|
|
405
419
|
return null;
|
|
406
420
|
}
|
|
407
421
|
|
|
408
|
-
export async function httpFallback(
|
|
422
|
+
export async function httpFallback(
|
|
423
|
+
url: string,
|
|
424
|
+
config: Required<BotholomewConfig> | null = null,
|
|
425
|
+
): Promise<FetchedContent> {
|
|
409
426
|
const response = await fetch(url, {
|
|
410
427
|
headers: { "User-Agent": "Botholomew/1.0" },
|
|
411
428
|
signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
|
|
@@ -416,7 +433,8 @@ export async function httpFallback(url: string): Promise<FetchedContent> {
|
|
|
416
433
|
}
|
|
417
434
|
|
|
418
435
|
const contentType = response.headers.get("content-type") || "";
|
|
419
|
-
const
|
|
436
|
+
const baseMimeType = contentType.split(";")[0]?.trim() || "text/plain";
|
|
437
|
+
const isHtml = baseMimeType === "text/html";
|
|
420
438
|
let text = await response.text();
|
|
421
439
|
|
|
422
440
|
let title = url;
|
|
@@ -425,21 +443,72 @@ export async function httpFallback(url: string): Promise<FetchedContent> {
|
|
|
425
443
|
if (titleMatch?.[1]) {
|
|
426
444
|
title = titleMatch[1].trim();
|
|
427
445
|
}
|
|
428
|
-
text = stripHtmlTags(text);
|
|
429
446
|
}
|
|
430
447
|
|
|
431
448
|
if (text.length > MAX_CONTENT_BYTES) {
|
|
432
449
|
text = text.slice(0, MAX_CONTENT_BYTES);
|
|
433
450
|
}
|
|
434
451
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
452
|
+
// No API key: we can't honestly produce markdown. Strip HTML tags so the
|
|
453
|
+
// saved file is at least readable, and label it text/plain so downstream
|
|
454
|
+
// consumers know it isn't real markdown. Other content types pass through.
|
|
455
|
+
if (!config?.anthropic_api_key) {
|
|
456
|
+
if (isHtml) {
|
|
457
|
+
return {
|
|
458
|
+
title,
|
|
459
|
+
content: stripHtmlTags(text),
|
|
460
|
+
mimeType: "text/plain",
|
|
461
|
+
sourceUrl: url,
|
|
462
|
+
source: null,
|
|
463
|
+
};
|
|
464
|
+
}
|
|
465
|
+
return {
|
|
466
|
+
title,
|
|
467
|
+
content: text,
|
|
468
|
+
mimeType: baseMimeType,
|
|
469
|
+
sourceUrl: url,
|
|
470
|
+
source: null,
|
|
471
|
+
};
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
// With an API key: convert anything non-text/non-markdown to markdown.
|
|
475
|
+
// Plain text short-circuits to avoid burning a conversion call on what's
|
|
476
|
+
// probably already a readable README/log/etc. text/markdown short-circuits
|
|
477
|
+
// too — but only after verifying the body actually looks like markdown.
|
|
478
|
+
// Some servers mislabel HTML as text/markdown.
|
|
479
|
+
const { mimeType: effectiveMimeType, sniffed } = resolveEffectiveMimeType(
|
|
480
|
+
baseMimeType,
|
|
481
|
+
text,
|
|
482
|
+
);
|
|
483
|
+
if (sniffed) {
|
|
484
|
+
logger.warn(
|
|
485
|
+
`server claimed ${baseMimeType} but body looks like ${effectiveMimeType} — converting anyway`,
|
|
486
|
+
);
|
|
487
|
+
}
|
|
488
|
+
if (
|
|
489
|
+
effectiveMimeType === "text/plain" ||
|
|
490
|
+
isMarkdownMimeType(effectiveMimeType)
|
|
491
|
+
) {
|
|
492
|
+
return {
|
|
493
|
+
title,
|
|
494
|
+
content: text,
|
|
495
|
+
mimeType: effectiveMimeType,
|
|
496
|
+
sourceUrl: url,
|
|
497
|
+
source: null,
|
|
498
|
+
};
|
|
499
|
+
}
|
|
438
500
|
|
|
501
|
+
logger.dim(` converting ${effectiveMimeType} → markdown`);
|
|
502
|
+
const converted = await convertToMarkdown(
|
|
503
|
+
text,
|
|
504
|
+
effectiveMimeType,
|
|
505
|
+
url,
|
|
506
|
+
config,
|
|
507
|
+
);
|
|
439
508
|
return {
|
|
440
509
|
title,
|
|
441
|
-
content:
|
|
442
|
-
mimeType,
|
|
510
|
+
content: converted,
|
|
511
|
+
mimeType: "text/markdown",
|
|
443
512
|
sourceUrl: url,
|
|
444
513
|
source: null,
|
|
445
514
|
};
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
2
|
+
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
3
|
+
import { logger } from "../utils/logger.ts";
|
|
4
|
+
import { FetchFailureError } from "./fetcher-errors.ts";
|
|
5
|
+
|
|
6
|
+
const CONVERTER_MAX_TOKENS = 16_384;
|
|
7
|
+
|
|
8
|
+
const CONVERTER_SYSTEM_PROMPT = `You normalize documents to clean, well-structured Markdown.
|
|
9
|
+
|
|
10
|
+
**If the input is already clean, valid Markdown, return it verbatim with no edits.** Look for ATX headings (#, ##), bullet/numbered lists, fenced code blocks, inline code, links in [text](url) form, blockquotes, GFM tables. If the structure is consistently markdown-shaped, echo it back unchanged.
|
|
11
|
+
|
|
12
|
+
Otherwise, convert it. The input mime_type is a hint, not a guarantee — verify the actual content. Common non-markdown formats to recognize and convert:
|
|
13
|
+
- **HTML** — strip tags, scripts, styles, navigation/footer chrome; preserve headings, paragraphs, lists, tables, links, code.
|
|
14
|
+
- **JSON / XML / YAML** — render the structure as readable Markdown (headings/lists for objects, tables where appropriate, fenced code blocks for inline values).
|
|
15
|
+
- **DocMD (Google Docs structured format)** — lines like \`[H1 1-31 HEADING_1 tabId=t.0 ...] Title text\` or \`[P5 884-937 PARAGRAPH ...] Body text\`. Strip the bracket annotations entirely; map H1→#, H2→##, H3→###, P→paragraph; preserve the trailing text content.
|
|
16
|
+
- **RTF, plain text with mixed structure, ad-hoc formats** — extract the semantic content, drop the noise.
|
|
17
|
+
|
|
18
|
+
Rules for the output:
|
|
19
|
+
- Preserve all semantic content: headings, paragraphs, lists, tables, links, inline code, code blocks, blockquotes.
|
|
20
|
+
- Use ATX headings (#, ##, ###), fenced code blocks (\`\`\`lang), GFM-style tables, and reference- or inline-style links — whichever is cleanest.
|
|
21
|
+
- Strip metadata headers/IDs that aren't part of the document body (e.g. \`@document_id: ...\`, \`@revision_id: ...\`).
|
|
22
|
+
- Output **only** the Markdown. No preamble ("Here is the converted markdown:"), no trailing commentary, no wrapping the entire output in a code fence.`;
|
|
23
|
+
|
|
24
|
+
/** Base mime types (no parameters, lower-case) that are treated as Markdown. */
const MARKDOWN_MIME_TYPES = new Set([
  "text/markdown",
  "text/x-markdown",
  "text/md",
]);

/**
 * Report whether a mime type string denotes Markdown.
 *
 * Anything after the first `;` (e.g. `; charset=utf-8`) is ignored, and the
 * comparison is case-insensitive and tolerant of surrounding whitespace.
 *
 * @param mimeType - Raw mime type, optionally carrying parameters.
 * @returns `true` when the base type is one of the known Markdown types.
 */
export function isMarkdownMimeType(mimeType: string): boolean {
  const [essence] = mimeType.split(";");
  const normalized = essence === undefined ? "" : essence.trim().toLowerCase();
  return MARKDOWN_MIME_TYPES.has(normalized);
}
|
|
34
|
+
|
|
35
|
+
/**
 * Sniff content for a non-markdown structure. Returns a mime type when the
 * content has unmistakable markers of HTML / XML / JSON, otherwise null.
 * Used to verify a claim of `text/markdown` — if the claimed type is wrong,
 * the caller wants to convert anyway.
 *
 * Markdown is a superset of plain text, so a null return ≠ "definitely
 * markdown". It just means no strong contradicting signal was found.
 *
 * Checks, in order:
 * 1. An HTML doctype / `<html>` root or an XML declaration at the very start.
 * 2. The whole (trimmed) content parsing as a JSON object or array.
 * 3. Dense tag markup in the first 4096 characters, ignoring anything inside
 *    fenced code blocks so that Markdown which merely *shows* an HTML sample
 *    is not mistaken for HTML.
 *
 * @param content - The document body to inspect.
 * @returns `"text/html"`, `"application/xml"`, `"application/json"`, or
 *   `null` when nothing contradicts a Markdown claim.
 */
export function sniffNonMarkdownMimeType(content: string): string | null {
  const head = content.trimStart().slice(0, 4096);
  if (!head) return null;

  if (/^<!doctype\s+html/i.test(head)) return "text/html";
  if (/^<html[\s>]/i.test(head)) return "text/html";
  if (/^<\?xml[\s?]/i.test(head)) return "application/xml";

  // JSON: parses as JSON top-to-bottom (use the full content, not the head).
  const trimmed = content.trim();
  if (
    (trimmed.startsWith("{") && trimmed.endsWith("}")) ||
    (trimmed.startsWith("[") && trimmed.endsWith("]"))
  ) {
    try {
      JSON.parse(trimmed);
      return "application/json";
    } catch {
      // Not JSON after all (e.g. a Markdown line like "[link]") — keep sniffing.
    }
  }

  // Heuristic HTML: dense tag markup. Markdown can contain occasional inline
  // HTML, so it is only flagged when tags dominate the sample. Fenced code
  // blocks are dropped first: tags inside them are quoted examples, not
  // document structure, and would otherwise flag real Markdown as HTML.
  let inFence = false;
  const proseLines: string[] = [];
  for (const line of head.split("\n")) {
    if (/^ {0,3}(?:`{3,}|~{3,})/.test(line)) {
      inFence = !inFence;
      continue;
    }
    if (!inFence) proseLines.push(line);
  }
  const prose = proseLines.join("\n");

  const tagMatches = prose.match(/<\/?[a-z][a-z0-9]*[\s/>]/gi) ?? [];
  if (tagMatches.length >= 10) {
    const charsPerTag = prose.length / tagMatches.length;
    if (charsPerTag < 80) return "text/html";
  }

  return null;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Pick the mime type that should drive conversion decisions.
 *
 * A non-Markdown claim is taken at face value. A Markdown claim is checked
 * against the content: when the body sniffs as some other format, the
 * sniffed type wins so the content gets converted rather than stored
 * under a wrong label.
 *
 * @param claimedMimeType - Mime type reported for the content.
 * @param content - The body to verify the claim against.
 * @returns The mime type to use, plus `sniffed: true` when it came from
 *   content sniffing instead of the claim.
 */
export function resolveEffectiveMimeType(
  claimedMimeType: string,
  content: string,
): { mimeType: string; sniffed: boolean } {
  // Only a Markdown claim is worth double-checking against the body.
  const detected = isMarkdownMimeType(claimedMimeType)
    ? sniffNonMarkdownMimeType(content)
    : null;

  return detected
    ? { mimeType: detected, sniffed: true }
    : { mimeType: claimedMimeType, sniffed: false };
}
|
|
93
|
+
|
|
94
|
+
/**
 * Remove a code fence that wraps the *entire* text, as models sometimes emit
 * despite being told not to (```markdown … ```).
 *
 * The text is only unwrapped when the fence is really a wrapper:
 * - an opener tagged `markdown` / `md` is always treated as a wrapper;
 * - a bare opener is treated as a wrapper unless the first fence line inside
 *   is itself bare. In that case the inner fence closes the opening block,
 *   meaning the document simply starts and ends with ordinary code blocks;
 *   stripping the outer fences would invert every code/prose boundary.
 *
 * @param text - Model output to clean up.
 * @returns The unwrapped inner content, or `text` unchanged when it is not
 *   wrapped (or the wrapper is empty).
 */
function stripLeadingMarkdownFence(text: string): string {
  const trimmed = text.trim();
  const fenceMatch = trimmed.match(
    /^```(markdown|md)?\s*\n([\s\S]*?)\n```\s*$/,
  );
  const inner = fenceMatch?.[2];
  if (!fenceMatch || !inner) return text;

  // Explicit markdown tag: unambiguous wrapper.
  if (fenceMatch[1] !== undefined) return inner;

  // Bare opener: look at the first fence line inside to decide whether the
  // opening fence is a wrapper or just the start of a normal code block.
  for (const line of inner.split("\n")) {
    const fenceLine = line.match(/^ {0,3}`{3,}(.*)$/);
    if (!fenceLine) continue;
    const infoString = (fenceLine[1] ?? "").trim();
    // A bare fence closes the block the opener started — not a wrapper.
    if (infoString === "") return text;
    break;
  }

  return inner;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Convert arbitrary content to Markdown via a single-shot LLM call.
 *
 * Does **not** short-circuit on `mimeType === "text/markdown"` — tools
 * frequently mislabel their output (e.g. Google Docs' "DocMD" tool returns
 * structured `[H1 ...]` annotations, not real markdown). The mime type is
 * passed in as a hint for the model; the model decides whether the content
 * is already markdown (echo unchanged) or needs converting.
 *
 * - Returns `content` unchanged when it is empty / whitespace-only (there is
 *   nothing to convert, and a model asked to convert nothing can only reply
 *   with commentary that would then be saved as the document).
 * - Returns `content` unchanged when no Anthropic API key is configured.
 * - Throws FetchFailureError when the response hits max_tokens (silently
 *   truncating the saved file would be worse than failing loudly).
 * - On any other error from the API call, logs a warning and returns the raw
 *   content so the import still produces *something* the user can edit.
 *
 * @param content - Raw document body to normalize.
 * @param mimeType - Claimed source format; only a hint for the model.
 * @param sourceUrl - Where the content came from; included in the prompt.
 * @param config - Resolved config; supplies the API key and model names.
 * @returns The Markdown text, or the original `content` on the fallbacks above.
 * @throws FetchFailureError when the conversion exceeds the token budget.
 */
export async function convertToMarkdown(
  content: string,
  mimeType: string,
  sourceUrl: string,
  config: Required<BotholomewConfig>,
): Promise<string> {
  // Blank input: skip the API call entirely.
  if (!content.trim()) return content;
  if (!config.anthropic_api_key) return content;

  const client = new Anthropic({ apiKey: config.anthropic_api_key });
  // Conversion is mechanical text-shaping — Haiku (the chunker model) is
  // plenty smart for this and ~5x faster than Opus on long documents.
  const model = config.chunker_model || config.model;

  try {
    const stream = client.messages.stream({
      model,
      max_tokens: CONVERTER_MAX_TOKENS,
      system: CONVERTER_SYSTEM_PROMPT,
      messages: [
        {
          role: "user",
          content: `Convert this ${mimeType} content to Markdown. Source URL: ${sourceUrl}\n\n${content}`,
        },
      ],
    });

    // Consume the stream only to report progress; the full text is read from
    // the final message below.
    let charsReceived = 0;
    let lastLogged = 0;
    const PROGRESS_INTERVAL_CHARS = 2_000;
    for await (const event of stream) {
      if (
        event.type === "content_block_delta" &&
        event.delta.type === "text_delta"
      ) {
        charsReceived += event.delta.text.length;
        if (charsReceived - lastLogged >= PROGRESS_INTERVAL_CHARS) {
          logger.dim(` ...converted ${charsReceived} chars`);
          lastLogged = charsReceived;
        }
      }
    }

    const final = await stream.finalMessage();

    // A max_tokens stop means the output was cut off mid-document.
    if (final.stop_reason === "max_tokens") {
      throw new FetchFailureError(
        `Markdown conversion exceeded token budget (max_tokens=${CONVERTER_MAX_TOKENS}). The source document is too large to convert in one pass — try fetching a smaller section or a tool that supports pagination.`,
      );
    }

    const text = final.content
      .flatMap((block) => (block.type === "text" ? [block.text] : []))
      .join("");

    if (!text.trim()) {
      logger.warn(
        "markdown conversion returned empty output — saving raw content",
      );
      return content;
    }

    return stripLeadingMarkdownFence(text);
  } catch (err) {
    // Budget overruns must surface to the caller; everything else degrades
    // to saving the raw content.
    if (err instanceof FetchFailureError) throw err;
    logger.warn(
      `markdown conversion failed (${err instanceof Error ? err.message : String(err)}) — saving raw content`,
    );
    return content;
  }
}
|