botholomew 0.16.4 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/README.md +46 -41
  2. package/package.json +4 -9
  3. package/src/chat/agent.ts +37 -40
  4. package/src/chat/session.ts +10 -10
  5. package/src/cli.ts +0 -2
  6. package/src/commands/capabilities.ts +35 -33
  7. package/src/commands/context.ts +133 -221
  8. package/src/commands/init.ts +22 -1
  9. package/src/commands/mcpx.ts +21 -8
  10. package/src/commands/nuke.ts +52 -15
  11. package/src/commands/prepare.ts +16 -13
  12. package/src/config/loader.ts +1 -8
  13. package/src/config/schemas.ts +6 -0
  14. package/src/constants.ts +16 -32
  15. package/src/init/index.ts +52 -27
  16. package/src/mcpx/client.ts +21 -5
  17. package/src/mem/client.ts +33 -0
  18. package/src/{context → prompts}/capabilities.ts +11 -7
  19. package/src/schedules/store.ts +1 -1
  20. package/src/tasks/store.ts +1 -1
  21. package/src/threads/store.ts +1 -1
  22. package/src/tools/capabilities/refresh.ts +1 -1
  23. package/src/tools/membot/adapter.ts +111 -0
  24. package/src/tools/membot/copy.ts +59 -0
  25. package/src/tools/membot/count_lines.ts +53 -0
  26. package/src/tools/membot/edit.ts +72 -0
  27. package/src/tools/membot/exists.ts +54 -0
  28. package/src/tools/membot/index.ts +26 -0
  29. package/src/tools/{context → membot}/pipe.ts +34 -32
  30. package/src/tools/registry.ts +6 -37
  31. package/src/tools/tool.ts +6 -8
  32. package/src/tui/App.tsx +3 -4
  33. package/src/tui/components/ContextPanel.tsx +109 -226
  34. package/src/tui/components/HelpPanel.tsx +2 -2
  35. package/src/tui/components/StatusBar.tsx +0 -6
  36. package/src/tui/components/ThreadPanel.tsx +8 -7
  37. package/src/tui/wrapDetail.ts +11 -0
  38. package/src/worker/heartbeat.ts +0 -20
  39. package/src/worker/index.ts +13 -13
  40. package/src/worker/llm.ts +7 -9
  41. package/src/worker/prompt.ts +25 -13
  42. package/src/worker/spawn.ts +1 -1
  43. package/src/worker/tick.ts +10 -9
  44. package/src/commands/db.ts +0 -119
  45. package/src/commands/with-db.ts +0 -22
  46. package/src/context/chunker.ts +0 -275
  47. package/src/context/embedder-impl.ts +0 -100
  48. package/src/context/embedder.ts +0 -9
  49. package/src/context/fetcher-errors.ts +0 -8
  50. package/src/context/fetcher.ts +0 -515
  51. package/src/context/locks.ts +0 -146
  52. package/src/context/markdown-converter.ts +0 -186
  53. package/src/context/reindex.ts +0 -198
  54. package/src/context/store.ts +0 -841
  55. package/src/context/url-utils.ts +0 -25
  56. package/src/db/connection.ts +0 -255
  57. package/src/db/doctor.ts +0 -235
  58. package/src/db/embeddings.ts +0 -317
  59. package/src/db/query.ts +0 -56
  60. package/src/db/schema.ts +0 -93
  61. package/src/db/sql/1-core_tables.sql +0 -53
  62. package/src/db/sql/10-dedupe_context_items.sql +0 -26
  63. package/src/db/sql/11-rebuild_hnsw.sql +0 -8
  64. package/src/db/sql/12-workers.sql +0 -66
  65. package/src/db/sql/13-drive-paths.sql +0 -47
  66. package/src/db/sql/14-drop_hnsw_index.sql +0 -8
  67. package/src/db/sql/15-fts_index.sql +0 -8
  68. package/src/db/sql/16-source_url.sql +0 -7
  69. package/src/db/sql/17-worker_log_path.sql +0 -3
  70. package/src/db/sql/18-reset_embeddings_for_local.sql +0 -39
  71. package/src/db/sql/19-disk_backed_index.sql +0 -36
  72. package/src/db/sql/2-logging_tables.sql +0 -24
  73. package/src/db/sql/20-drop_db_tables_for_files.sql +0 -19
  74. package/src/db/sql/3-daemon_state.sql +0 -5
  75. package/src/db/sql/4-unique_context_path.sql +0 -1
  76. package/src/db/sql/5-reset_embeddings_for_openai.sql +0 -1
  77. package/src/db/sql/6-vss_index.sql +0 -7
  78. package/src/db/sql/7-drop_embeddings_fk.sql +0 -23
  79. package/src/db/sql/8-task_output.sql +0 -1
  80. package/src/db/sql/9-source-type.sql +0 -1
  81. package/src/tools/context/read-large-result.ts +0 -33
  82. package/src/tools/dir/create.ts +0 -47
  83. package/src/tools/dir/size.ts +0 -77
  84. package/src/tools/dir/tree.ts +0 -124
  85. package/src/tools/file/copy.ts +0 -73
  86. package/src/tools/file/count-lines.ts +0 -54
  87. package/src/tools/file/delete.ts +0 -83
  88. package/src/tools/file/edit.ts +0 -76
  89. package/src/tools/file/exists.ts +0 -33
  90. package/src/tools/file/info.ts +0 -66
  91. package/src/tools/file/move.ts +0 -66
  92. package/src/tools/file/read.ts +0 -67
  93. package/src/tools/file/write.ts +0 -58
  94. package/src/tools/search/fuse.ts +0 -96
  95. package/src/tools/search/index.ts +0 -127
  96. package/src/tools/search/regexp.ts +0 -82
  97. package/src/tools/search/semantic.ts +0 -167
  98. /package/src/{db → utils}/uuid.ts +0 -0
@@ -1,515 +0,0 @@
1
- import type {
2
- Tool as AnthropicTool,
3
- MessageParam,
4
- ToolResultBlockParam,
5
- ToolUseBlock,
6
- } from "@anthropic-ai/sdk/resources/messages";
7
- import type { McpxClient } from "@evantahler/mcpx";
8
- import type { BotholomewConfig } from "../config/schemas.ts";
9
- import type { DbConnection } from "../db/connection.ts";
10
- import { mcpExecTool } from "../tools/mcp/exec.ts";
11
- import { mcpInfoTool } from "../tools/mcp/info.ts";
12
- import { mcpListToolsTool } from "../tools/mcp/list-tools.ts";
13
- import { mcpSearchTool } from "../tools/mcp/search.ts";
14
- import type { ToolContext } from "../tools/tool.ts";
15
- import { type AnyToolDefinition, toAnthropicTool } from "../tools/tool.ts";
16
- import { logger } from "../utils/logger.ts";
17
- import { createLlmClient } from "../worker/llm-client.ts";
18
- import { FetchFailureError } from "./fetcher-errors.ts";
19
- import {
20
- convertToMarkdown,
21
- isMarkdownMimeType,
22
- resolveEffectiveMimeType,
23
- } from "./markdown-converter.ts";
24
- import { stripHtmlTags } from "./url-utils.ts";
25
-
26
- export { FetchFailureError } from "./fetcher-errors.ts";
27
-
28
/**
 * Cap on how much fetched text is kept. It is applied with `String.slice`,
 * so despite the name it counts UTF-16 code units rather than bytes.
 */
const MAX_CONTENT_BYTES = 500_000;

/** Maximum number of LLM round-trips the fetcher loop makes before it gives up. */
const MAX_TURNS = 10;

/** `max_tokens` sent with every fetcher LLM request. */
const MAX_RESPONSE_TOKENS = 4_096;

/** How many characters of an mcp_exec result are shown to the LLM as a preview. */
const PREVIEW_CHARS = 2_000;

/** Timeout, in milliseconds, for the plain HTTP fallback request. */
const HTTP_TIMEOUT_MS = 30_000;
33
-
34
/** Result of fetching a URL, as produced by `fetchUrl` and `httpFallback`. */
export interface FetchedContent {
  /** Human-readable title for the content. */
  title: string;
  /** The fetched body text. */
  content: string;
  /** MIME type describing `content`. */
  mimeType: string;
  /** The URL the content was fetched from. */
  sourceUrl: string;
  /**
   * Name of the MCP server that produced the content (e.g. "google-docs",
   * "github", "firecrawl"), or `null` when the content came from the plain
   * HTTP fallback. `botholomew context import` can use it to pick a default
   * destination subdirectory.
   */
  source: string | null;
}
46
-
47
/**
 * System prompt for the fetcher agent. It explains the workflow (find an MCP
 * tool, inspect it, execute it, then accept the captured result), the
 * preference for markdown-producing tools, and the three terminal tools the
 * agent can call to end the loop.
 */
const FETCHER_SYSTEM_PROMPT = `You are a content fetcher. Your job is to find the right MCP tool to retrieve the content at the given URL, run it, and tell the harness which result to save.

**Important: the harness captures the full result of every mcp_exec call automatically.** You only see a short preview of each result so you can verify it looks reasonable. You do NOT need to read or copy the full content — you just identify which exec call to save.

**Format preference: markdown, in order of preference.**
1. When searching with mcp_search or mcp_list_tools, prefer tools whose names indicate markdown output: anything containing "markdown", "md", "AsMarkdown", "AsMd", "AsDocmd", or similar. For example, prefer "GoogleDocs_GetDocumentAsDocmd" over "GoogleDocs_GetDocumentAsHtml".
2. If no markdown-named variant exists, use mcp_info to inspect the tool's input schema for a "format", "mime_type", "output_format", or similar parameter and request "markdown" (or "md") when available.
3. If neither is possible, run the tool anyway. The harness will convert the captured content to markdown via a separate LLM call before saving — markdown-native tools are still preferred because they're cheaper and higher fidelity, but you do not have to find one.

Workflow:
1. Use mcp_search or mcp_list_tools to find the best tool for this URL (e.g., Google Docs tools for docs.google.com, Firecrawl for generic web pages, GitHub tools for github.com). Apply the format preference above.
2. Use mcp_info to inspect the tool's input schema.
3. Call mcp_exec with the right arguments — request markdown format when supported.
4. Look at the preview returned by mcp_exec. If it looks like the right content, call accept_content with the exec_call_id (the tool_use_id of the mcp_exec call), a sensible title, and the actual mime_type the tool returned (so the harness knows whether to convert).

Terminal tools:
- accept_content(exec_call_id, title, mime_type?) — save the content captured from a previous mcp_exec call. The harness has the full content; you supply the id, title, and the source mime_type (e.g., "text/html", "application/json", "text/markdown"). The harness converts to markdown before storage when needed.
- request_http_fallback() — fall back to a basic HTTP fetch. Use only when no MCP tool can handle the URL after a genuine attempt. Tools like Firecrawl can handle most URLs, so don't give up on the first try.
- report_failure(message) — surface an actionable message to the user (e.g., "this Google Doc is private — share it with your service account", "Firecrawl is not authenticated"). Use only when there is a specific next step the user must take.`;
66
-
67
/**
 * Terminal tool: the agent names a previous mcp_exec call (by its
 * tool_use_id) whose captured result should be saved, plus a title and an
 * optional MIME type. The content itself is never passed through this tool.
 */
const acceptContentTool: AnthropicTool = {
  name: "accept_content",
  description:
    "Save the full content captured by the harness from a previous mcp_exec call. You only need to supply the exec_call_id (the tool_use_id of that mcp_exec call) and a title — the harness already has the full content. Do NOT paste content here.",
  input_schema: {
    // The annotation on the constant already keeps this literal narrow.
    type: "object",
    properties: {
      exec_call_id: {
        type: "string",
        description:
          "The tool_use_id of the mcp_exec call whose result should be saved (the harness lists captured ids in mcp_exec previews).",
      },
      title: {
        type: "string",
        description:
          "A human-readable title for the content (e.g., the document title, or derived from the URL).",
      },
      mime_type: {
        type: "string",
        description: "MIME type of the content (defaults to text/markdown).",
      },
    },
    required: ["exec_call_id", "title"],
  },
};
92
-
93
/** Input payload of an `accept_content` tool call. */
interface AcceptContentInput {
  /** tool_use_id of the mcp_exec call whose captured result should be saved. */
  exec_call_id: string;
  /** Human-readable title for the saved content. */
  title: string;
  /** MIME type the agent claims for the captured content, if it supplied one. */
  mime_type?: string;
}
98
-
99
/**
 * Terminal tool: the agent gives up on MCP tools and asks the harness to do a
 * plain HTTP fetch instead. It takes no arguments.
 */
const requestHttpFallbackTool: AnthropicTool = {
  name: "request_http_fallback",
  description:
    "Fall back to a basic HTTP fetch. Use only when no MCP tool can handle the URL after a genuine attempt.",
  // The annotation on the constant already keeps the "object" literal narrow.
  input_schema: { type: "object", properties: {}, required: [] },
};
109
-
110
/**
 * Terminal tool: the agent reports that the fetch cannot succeed until the
 * user takes some action, and supplies the message to show them.
 */
const reportFailureTool: AnthropicTool = {
  name: "report_failure",
  description:
    "Report a fetch failure with an actionable message for the user (e.g., 'this Google Doc is private — share it with your service account'). Use only when there is a clear next step the user must take.",
  input_schema: {
    // The annotation on the constant already keeps this literal narrow.
    type: "object",
    properties: {
      message: {
        type: "string",
        description:
          "A clear, actionable, user-facing message explaining what the user needs to do to make this URL fetchable.",
      },
    },
    required: ["message"],
  },
};
126
-
127
/** Input payload of a `report_failure` tool call. */
interface ReportFailureInput {
  /** User-facing explanation of what must be done before the URL can be fetched. */
  message: string;
}
130
-
131
/**
 * The non-terminal MCP tools the fetcher agent may call, in the order they are
 * advertised to the model. Each definition has its own input type, so they
 * are erased to `AnyToolDefinition` to share one list.
 */
const mcpTools: AnyToolDefinition[] = [
  mcpListToolsTool,
  mcpSearchTool,
  mcpInfoTool,
  mcpExecTool,
].map((definition) => definition as unknown as AnyToolDefinition);
137
-
138
/**
 * Fetch the content at `url`.
 *
 * When an MCPX client is available the fetcher agent loop is tried first; if
 * there is no client, or the loop signals fallback by returning `null`, the
 * content is fetched with a plain HTTP request instead.
 *
 * @param url - The URL to fetch.
 * @param config - Resolved configuration; `anthropic_api_key` must be set.
 * @param mcpxClient - MCPX client used by the agent loop, or `null` to go
 *   straight to the HTTP fallback.
 * @param promptAddition - Optional extra guidance passed to the agent loop.
 * @returns The fetched content.
 * @throws Error when no Anthropic API key is configured. Errors raised by the
 *   agent loop or by the HTTP fallback propagate unchanged.
 */
export async function fetchUrl(
  url: string,
  config: Required<BotholomewConfig>,
  mcpxClient: McpxClient | null,
  promptAddition?: string,
): Promise<FetchedContent> {
  if (!config.anthropic_api_key) {
    throw new Error(
      "Anthropic API key is required for URL fetching. Set ANTHROPIC_API_KEY or configure it in config/config.json",
    );
  }

  if (mcpxClient) {
    const fetched = await runFetcherLoop(
      url,
      config,
      mcpxClient,
      promptAddition,
    );
    if (fetched) return fetched;
    logger.dim(" agent signaled fallback — using HTTP");
  } else {
    logger.dim(" no MCPX client — using HTTP fallback");
  }

  // Both the "no client" and the "agent gave up" paths end in a plain fetch.
  return httpFallback(url, config);
}
161
-
162
/**
 * Build the `tool_result` blocks for a turn that is rejected because of a bad
 * `accept_content` call.
 *
 * Every `tool_use` block of the assistant turn gets an answer: the rejected
 * call receives `reason`, and any other call issued in the same turn is told
 * it was not executed. Leaving a `tool_use` block unanswered would make the
 * next LLM request invalid.
 */
function rejectedTurnResults(
  toolUseBlocks: ToolUseBlock[],
  rejectedCallId: string,
  reason: string,
): ToolResultBlockParam[] {
  return toolUseBlocks.map((block) => ({
    type: "tool_result" as const,
    tool_use_id: block.id,
    content:
      block.id === rejectedCallId
        ? reason
        : "Not executed: the accept_content call in the same turn was rejected. Re-issue this call if it is still needed.",
    is_error: true,
  }));
}

/**
 * Drive the fetcher agent: let the LLM discover and run MCP tools for `url`
 * until it calls one of the terminal tools or `MAX_TURNS` is reached.
 *
 * Full `mcp_exec` results are kept in the harness, keyed by tool_use_id; the
 * LLM only sees a preview and refers to a result by id in `accept_content`.
 *
 * @param url - The URL to fetch.
 * @param config - Resolved configuration (model, API key, ...).
 * @param mcpxClient - MCPX client handed to the MCP tools.
 * @param promptAddition - Optional extra guidance appended to the user prompt.
 * @returns The accepted content converted to markdown, or `null` when the
 *   caller should fall back to plain HTTP (the agent asked for it, made no
 *   tool call, or ran out of turns).
 * @throws FetchFailureError when the agent calls `report_failure` or a
 *   response stops with `max_tokens`.
 */
async function runFetcherLoop(
  url: string,
  config: Required<BotholomewConfig>,
  mcpxClient: McpxClient,
  promptAddition?: string,
): Promise<FetchedContent | null> {
  const client = createLlmClient(config);

  // The fetcher passes no database connection or project directory to the
  // tools, so those fields are filled with placeholders.
  const toolCtx: ToolContext = {
    conn: null as unknown as DbConnection,
    dbPath: "",
    projectDir: "",
    config,
    mcpxClient,
  };

  const tools: AnthropicTool[] = [
    ...mcpTools.map(toAnthropicTool),
    acceptContentTool,
    requestHttpFallbackTool,
    reportFailureTool,
  ];

  // Cache of full mcp_exec results keyed by tool_use_id. `server` is retained
  // so the saved content can be attributed to a specific MCP service.
  const execResults = new Map<
    string,
    { server: string; tool: string; content: string; mimeType: string }
  >();

  const userPrompt = promptAddition
    ? `Fetch the content at: ${url}\n\nAdditional guidance:\n${promptAddition}`
    : `Fetch the content at: ${url}`;
  const messages: MessageParam[] = [{ role: "user", content: userPrompt }];

  for (let turn = 0; turn < MAX_TURNS; turn++) {
    const response = await client.messages.create({
      model: config.model,
      max_tokens: MAX_RESPONSE_TOKENS,
      system: FETCHER_SYSTEM_PROMPT,
      messages,
      tools,
    });

    // Log assistant text reasoning
    for (const block of response.content) {
      if (block.type === "text" && block.text.trim()) {
        logger.dim(` turn ${turn + 1}: ${block.text.trim()}`);
      }
    }

    if (response.stop_reason === "max_tokens") {
      throw new FetchFailureError(
        `The fetched document is too large to return in a single LLM response (hit max_tokens=${MAX_RESPONSE_TOKENS}). Try fetching a smaller section, a specific page, or a tool that supports pagination.`,
      );
    }

    const toolUseBlocks = response.content.filter(
      (block): block is ToolUseBlock => block.type === "tool_use",
    );

    if (toolUseBlocks.length === 0) {
      logger.dim(` turn ${turn + 1}: no tool calls — signaling fallback`);
      return null;
    }

    messages.push({ role: "assistant", content: response.content });

    // Terminal: report_failure surfaces an actionable message to the user.
    const failureCall = toolUseBlocks.find((b) => b.name === "report_failure");
    if (failureCall) {
      const input = failureCall.input as Partial<ReportFailureInput>;
      const message =
        typeof input.message === "string" && input.message.trim()
          ? input.message
          : "Fetch failed but the agent did not provide a message.";
      logger.dim(` turn ${turn + 1}: report_failure: ${message}`);
      throw new FetchFailureError(message);
    }

    // Terminal: request_http_fallback hands control back to the caller.
    const fallbackCall = toolUseBlocks.find(
      (b) => b.name === "request_http_fallback",
    );
    if (fallbackCall) {
      logger.dim(` turn ${turn + 1}: agent requested HTTP fallback`);
      return null;
    }

    // Terminal: accept_content looks up a captured exec result by id.
    const acceptCall = toolUseBlocks.find((b) => b.name === "accept_content");
    if (acceptCall) {
      const input = acceptCall.input as Partial<AcceptContentInput>;
      if (
        typeof input.exec_call_id !== "string" ||
        typeof input.title !== "string"
      ) {
        logger.dim(
          ` turn ${turn + 1}: accept_content missing required fields — asking agent to retry`,
        );
        // Answer every tool_use of this turn, not just the accept call.
        messages.push({
          role: "user",
          content: rejectedTurnResults(
            toolUseBlocks,
            acceptCall.id,
            "Invalid accept_content call: both 'exec_call_id' and 'title' are required strings.",
          ),
        });
        continue;
      }
      const cached = execResults.get(input.exec_call_id);
      if (!cached) {
        const validIds = [...execResults.keys()];
        logger.dim(
          ` turn ${turn + 1}: accept_content: unknown exec_call_id "${input.exec_call_id}"`,
        );
        // Answer every tool_use of this turn, not just the accept call.
        messages.push({
          role: "user",
          content: rejectedTurnResults(
            toolUseBlocks,
            acceptCall.id,
            `No mcp_exec call with id "${input.exec_call_id}" was captured. Captured ids: ${validIds.length ? validIds.join(", ") : "(none yet — run mcp_exec first)"}.`,
          ),
        });
        continue;
      }
      const claimedMimeType = input.mime_type || cached.mimeType;
      logger.dim(
        ` turn ${turn + 1}: accept_content: "${input.title}" (${cached.content.length} chars, claimed ${claimedMimeType}, from ${cached.server}/${cached.tool})`,
      );
      const truncated = cached.content.slice(0, MAX_CONTENT_BYTES);
      // Always normalize via the converter. MCP tools frequently mislabel
      // format — e.g. Google Docs' "Docmd" tool claims text/markdown but
      // returns a structured `[H1 ...]` annotation format. The converter
      // prompt handles already-clean markdown by echoing it unchanged.
      logger.dim(` normalizing → markdown`);
      const finalContent = await convertToMarkdown(
        truncated,
        claimedMimeType,
        url,
        config,
      );
      return {
        title: input.title,
        content: finalContent,
        mimeType: "text/markdown",
        sourceUrl: url,
        source: cached.server,
      };
    }

    // No terminal tool this turn: execute the MCP tools in parallel.
    const toolResults: ToolResultBlockParam[] = await Promise.all(
      toolUseBlocks.map(async (toolUse) => {
        // Log which tool the agent selected (and the underlying MCP
        // server/tool for mcp_exec).
        const toolInput = toolUse.input as Record<string, unknown>;
        if (toolUse.name === "mcp_exec") {
          logger.dim(
            ` turn ${turn + 1}: mcp_exec → ${toolInput.server}/${toolInput.tool}`,
          );
        } else {
          const args = JSON.stringify(toolInput).slice(0, 80);
          logger.dim(` turn ${turn + 1}: ${toolUse.name}(${args})`);
        }

        const toolDef = mcpTools.find((t) => t.name === toolUse.name);
        if (!toolDef) {
          return {
            type: "tool_result" as const,
            tool_use_id: toolUse.id,
            content: `Unknown tool: ${toolUse.name}`,
            is_error: true,
          };
        }

        try {
          const parsed = toolDef.inputSchema.safeParse(toolUse.input);
          if (!parsed.success) {
            return {
              type: "tool_result" as const,
              tool_use_id: toolUse.id,
              content: `Invalid input: ${parsed.error.message}`,
              is_error: true,
            };
          }
          const result = await toolDef.execute(parsed.data, toolCtx);
          if (result.is_error) {
            logger.dim(
              ` → error: ${JSON.stringify(result).slice(0, 160)}`,
            );
            return {
              type: "tool_result" as const,
              tool_use_id: toolUse.id,
              content: JSON.stringify(result),
              is_error: true,
            };
          }

          // For successful mcp_exec calls, capture the full content in the
          // harness and send only a preview to the LLM. The LLM accepts the
          // result by referring to its tool_use_id.
          if (toolUse.name === "mcp_exec") {
            const execResult = result as {
              result: string;
              is_error: boolean;
            };
            const content = execResult.result;
            execResults.set(toolUse.id, {
              server: String(toolInput.server),
              tool: String(toolInput.tool),
              content,
              mimeType: "text/markdown",
            });
            const preview =
              content.length > PREVIEW_CHARS
                ? `${content.slice(0, PREVIEW_CHARS)}\n\n[... ${content.length - PREVIEW_CHARS} more chars truncated. Full content (${content.length} chars total) is captured by the harness with exec_call_id="${toolUse.id}". Call accept_content with this id to save it.]`
                : `${content}\n\n[Full content (${content.length} chars) captured by the harness with exec_call_id="${toolUse.id}". Call accept_content with this id to save it.]`;
            logger.dim(
              ` → captured ${content.length} chars (id=${toolUse.id})`,
            );
            return {
              type: "tool_result" as const,
              tool_use_id: toolUse.id,
              content: preview,
            };
          }

          return {
            type: "tool_result" as const,
            tool_use_id: toolUse.id,
            content: JSON.stringify(result),
          };
        } catch (err) {
          // A throwing tool is reported back to the LLM, not to the caller.
          logger.dim(` → exception: ${err}`);
          return {
            type: "tool_result" as const,
            tool_use_id: toolUse.id,
            content: `Error: ${err}`,
            is_error: true,
          };
        }
      }),
    );

    messages.push({ role: "user", content: toolResults });
  }

  logger.dim(` max turns (${MAX_TURNS}) exceeded — signaling fallback`);
  return null;
}
421
-
422
/**
 * Fetch `url` with a plain HTTP GET and return its body, converting it to
 * markdown when an API key is available and the content needs it.
 *
 * - For HTML responses the title is taken from the `<title>` element;
 *   otherwise the URL is used as the title.
 * - The body is truncated to `MAX_CONTENT_BYTES` characters.
 * - Without an API key, HTML has its tags stripped and is labelled
 *   `text/plain`; every other type is returned unchanged.
 * - With an API key, plain text and markdown are returned as-is and anything
 *   else goes through `convertToMarkdown`.
 *
 * The media type from `Content-Type` is lowercased before use, because media
 * types are case-insensitive: `Text/HTML` must be handled like `text/html`.
 *
 * @param url - The URL to fetch.
 * @param config - Resolved configuration, or `null` when none is available.
 * @returns The fetched content with `source` set to `null`.
 * @throws Error when the response status is not OK. Network, timeout and
 *   conversion errors propagate unchanged.
 */
export async function httpFallback(
  url: string,
  config: Required<BotholomewConfig> | null = null,
): Promise<FetchedContent> {
  const response = await fetch(url, {
    headers: { "User-Agent": "Botholomew/1.0" },
    signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
  });

  if (!response.ok) {
    throw new Error(`HTTP ${response.status} ${response.statusText}: ${url}`);
  }

  const contentType = response.headers.get("content-type") || "";
  // Drop parameters such as "; charset=utf-8" and normalize the case so the
  // comparisons below (and in downstream consumers) are reliable.
  const baseMimeType =
    contentType.split(";")[0]?.trim().toLowerCase() || "text/plain";
  const isHtml = baseMimeType === "text/html";
  let text = await response.text();

  let title = url;
  if (isHtml) {
    const titleMatch = text.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    if (titleMatch?.[1]) {
      title = titleMatch[1].trim();
    }
  }

  if (text.length > MAX_CONTENT_BYTES) {
    text = text.slice(0, MAX_CONTENT_BYTES);
  }

  // No API key: we can't honestly produce markdown. Strip HTML tags so the
  // saved file is at least readable, and label it text/plain so downstream
  // consumers know it isn't real markdown. Other content types pass through.
  if (!config?.anthropic_api_key) {
    if (isHtml) {
      return {
        title,
        content: stripHtmlTags(text),
        mimeType: "text/plain",
        sourceUrl: url,
        source: null,
      };
    }
    return {
      title,
      content: text,
      mimeType: baseMimeType,
      sourceUrl: url,
      source: null,
    };
  }

  // With an API key: convert anything non-text/non-markdown to markdown.
  // Plain text short-circuits to avoid burning a conversion call on what's
  // probably already a readable README/log/etc. text/markdown short-circuits
  // too — but only after verifying the body actually looks like markdown.
  // Some servers mislabel HTML as text/markdown.
  const { mimeType: effectiveMimeType, sniffed } = resolveEffectiveMimeType(
    baseMimeType,
    text,
  );
  if (sniffed) {
    logger.warn(
      `server claimed ${baseMimeType} but body looks like ${effectiveMimeType} — converting anyway`,
    );
  }
  if (
    effectiveMimeType === "text/plain" ||
    isMarkdownMimeType(effectiveMimeType)
  ) {
    return {
      title,
      content: text,
      mimeType: effectiveMimeType,
      sourceUrl: url,
      source: null,
    };
  }

  logger.dim(` converting ${effectiveMimeType} → markdown`);
  const converted = await convertToMarkdown(
    text,
    effectiveMimeType,
    url,
    config,
  );
  return {
    title,
    content: converted,
    mimeType: "text/markdown",
    sourceUrl: url,
    source: null,
  };
}
@@ -1,146 +0,0 @@
1
- import { createHash } from "node:crypto";
2
- import { readdir, stat } from "node:fs/promises";
3
- import { join } from "node:path";
4
- import { CONTEXT_DIR, LOCKS_SUBDIR } from "../constants.ts";
5
- import {
6
- acquireLock,
7
- LockHeldError,
8
- readLockHolder,
9
- releaseLock,
10
- } from "../fs/atomic.ts";
11
-
12
- /**
13
- * Per-path mutex for `context/` mutations. Tasks/schedules already serialize
14
- * their own writes via O_EXCL lockfiles; this gives the same guarantee for
15
- * `context_write` / `context_edit` / `context_delete` / `context_mv` so two
16
- * tools (worker + chat, or two workers on the same path) can't race on
17
- * read-modify-write or rename ordering.
18
- *
19
- * Lockfiles live at `<projectDir>/context/.locks/<sha1(path)>.lock`. We hash
20
- * the path so the lock filename is bounded-length and slash-free, and so a
21
- * leading-dot path doesn't accidentally collide with `walk()`'s dotfile skip
22
- * in `src/context/store.ts`. The `.locks/` dir itself is invisible to
23
- * `context_list` (walk skips dot-prefixed names at every depth).
24
- */
25
-
26
// Backoff schedule used while waiting for a per-path lock. The un-jittered
// delay starts at ACQUIRE_BASE_BACKOFF_MS, doubles on every attempt and is
// capped at ACQUIRE_MAX_BACKOFF_MS; each delay is then scaled by a random
// factor in [0.5, 1.5). With the values below the 32 sleeps add up to
// 5 710 ms before jitter, so the total wait averages ~5.7 s and is bounded
// by ~8.6 s. That is comfortable for a small herd of concurrent writers
// (the per-path critical section is just a stat + tmp write + rename, on
// the order of 1-10 ms each), and short enough that a stuck holder surfaces
// to the caller instead of hanging an LLM tool call indefinitely.

/** How many times a contended acquire is retried before the error is surfaced. */
const ACQUIRE_RETRIES = 32;

/** Un-jittered delay, in milliseconds, before the first retry. */
const ACQUIRE_BASE_BACKOFF_MS = 10;

/** Upper bound, in milliseconds, on the un-jittered delay between retries. */
const ACQUIRE_MAX_BACKOFF_MS = 200;
34
-
35
/**
 * Directory that holds the per-path context lockfiles for a project.
 *
 * @param projectDir - Root directory of the project.
 * @returns `projectDir` joined with `CONTEXT_DIR` and `LOCKS_SUBDIR`.
 */
export function getContextLocksDir(projectDir: string): string {
  const segments = [projectDir, CONTEXT_DIR, LOCKS_SUBDIR];
  return join(...segments);
}
38
-
39
/**
 * Path of the lockfile that guards one context path.
 *
 * The file is named after the hex SHA-1 digest of `normalizedPath`, which
 * keeps the name fixed-length and free of path separators.
 *
 * @param projectDir - Root directory of the project.
 * @param normalizedPath - The context path being locked; it is hashed exactly
 *   as given, so callers must pass the same normalized form every time.
 * @returns `<locks dir>/<sha1(normalizedPath)>.lock`.
 */
export function contextLockPath(
  projectDir: string,
  normalizedPath: string,
): string {
  const hasher = createHash("sha1");
  hasher.update(normalizedPath);
  const fileName = `${hasher.digest("hex")}.lock`;
  return join(getContextLocksDir(projectDir), fileName);
}
46
-
47
/**
 * Run `fn` while holding the per-path context lock.
 *
 * Acquisition is retried with a capped, jittered exponential backoff while
 * another caller holds the lock (`LockHeldError`), up to `ACQUIRE_RETRIES`
 * retries — concurrent context tools are expected to converge, not surface
 * "try again" errors to the LLM. Any other acquisition error, and the
 * `LockHeldError` left after the last retry, is rethrown.
 *
 * Only the acquisition is retried. `fn` runs exactly once, and whatever it
 * throws propagates unchanged — even a `LockHeldError` from a nested lock —
 * so a failing callback is never silently re-executed. The lock is released
 * after `fn` settles, whether it succeeded or threw.
 *
 * @param projectDir - Root directory of the project.
 * @param normalizedPath - The context path to lock.
 * @param holderId - Stored in the lockfile body so the reaper (and humans
 *   inspecting `context/.locks/`) can identify the owner. Pass the worker id
 *   when called from a worker; chat sessions pass `"chat:<sessionId>"` or
 *   just `"chat"` — anything stable for the duration of the operation.
 * @param fn - The critical section.
 * @returns Whatever `fn` resolves to.
 */
export async function withContextLock<T>(
  projectDir: string,
  normalizedPath: string,
  holderId: string,
  fn: () => Promise<T>,
): Promise<T> {
  const lockPath = contextLockPath(projectDir, normalizedPath);

  // Phase 1: acquire, backing off while someone else holds the lock.
  for (let attempt = 0; ; attempt++) {
    try {
      await acquireLock(lockPath, holderId);
      break;
    } catch (err) {
      const contended = err instanceof LockHeldError;
      if (!contended || attempt >= ACQUIRE_RETRIES) throw err;
      const exp = Math.min(
        ACQUIRE_MAX_BACKOFF_MS,
        ACQUIRE_BASE_BACKOFF_MS * 2 ** attempt,
      );
      // Jitter in [0.5, 1.5) keeps competing writers from retrying in lockstep.
      const jittered = exp * (0.5 + Math.random());
      await new Promise<void>((res) => setTimeout(res, jittered));
    }
  }

  // Phase 2: run the critical section once; always release afterwards.
  try {
    return await fn();
  } finally {
    await releaseLock(lockPath);
  }
}
86
-
87
/**
 * Report whether the lockfile for `normalizedPath` currently exists on disk.
 *
 * Used by the reindex orphan-prune to skip paths that a worker is mid-write
 * on — without this guard the prune can drop the search-index rows of a
 * file that's about to land on disk.
 *
 * @param projectDir - Root directory of the project.
 * @param normalizedPath - The context path to check.
 * @returns `true` when the lockfile can be stat'ed, `false` when it does not
 *   exist.
 * @throws Any `stat` error other than `ENOENT`.
 */
export async function isContextPathLocked(
  projectDir: string,
  normalizedPath: string,
): Promise<boolean> {
  const lockFile = contextLockPath(projectDir, normalizedPath);
  try {
    await stat(lockFile);
  } catch (err) {
    // A missing lockfile is the normal "not locked" answer.
    if ((err as NodeJS.ErrnoException).code !== "ENOENT") throw err;
    return false;
  }
  return true;
}
105
-
106
/**
 * Reaper for context locks: scan the locks directory and release every
 * `*.lock` file whose holder is gone. Mirrors `reapOrphanLocks` in
 * `src/tasks/store.ts` so the worker reaper can clean stale context locks
 * left behind by a crashed worker.
 *
 * A lockfile is released when no holder can be read from it, or when
 * `isHolderAlive` reports that its holder is not alive.
 *
 * @param projectDir - Root directory of the project.
 * @param isHolderAlive - Receives the raw holder id and decides what counts
 *   as alive (typically: workers/<id>.json status === "running"). Ids that
 *   do not follow the worker convention (e.g. `"chat"`) are passed through
 *   as-is, so it is up to this callback to treat them as alive.
 * @returns The file names (not full paths) of the lockfiles that were
 *   released; empty when the locks directory does not exist.
 * @throws Any `readdir` error other than `ENOENT`.
 */
export async function reapOrphanContextLocks(
  projectDir: string,
  isHolderAlive: (holderId: string) => Promise<boolean>,
): Promise<string[]> {
  const locksDir = getContextLocksDir(projectDir);

  let entries: string[];
  try {
    entries = await readdir(locksDir);
  } catch (err) {
    // No locks directory means nothing was ever locked.
    if ((err as NodeJS.ErrnoException).code !== "ENOENT") throw err;
    return [];
  }

  const reaped: string[] = [];
  for (const entry of entries) {
    if (!entry.endsWith(".lock")) continue;
    const lockFile = join(locksDir, entry);
    const holder = await readLockHolder(lockFile);
    // The liveness callback is only consulted when a holder id was read.
    const orphaned = !holder || !(await isHolderAlive(holder));
    if (orphaned) {
      await releaseLock(lockFile);
      reaped.push(entry);
    }
  }
  return reaped;
}