botholomew 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/commands/context.ts +227 -66
- package/src/context/chunker.ts +98 -1
- package/src/context/fetcher.ts +436 -0
- package/src/context/url-utils.ts +48 -0
- package/src/db/context.ts +8 -2
- package/src/db/sql/9-source-type.sql +1 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "botholomew",
|
|
3
|
-
"version": "0.6.2",
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "An AI agent for knowledge work",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
"dependencies": {
|
|
25
25
|
"@anthropic-ai/sdk": "^0.88.0",
|
|
26
26
|
"@duckdb/node-api": "^1.5.2-r.1",
|
|
27
|
-
"@evantahler/mcpx": "0.18.
|
|
27
|
+
"@evantahler/mcpx": "0.18.6",
|
|
28
28
|
"ansis": "^4.2.0",
|
|
29
29
|
"commander": "^14.0.0",
|
|
30
30
|
"gray-matter": "^4.0.3",
|
package/src/commands/context.ts
CHANGED
|
@@ -8,11 +8,13 @@ import { loadConfig } from "../config/loader.ts";
|
|
|
8
8
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
9
9
|
import { generateDescription } from "../context/describer.ts";
|
|
10
10
|
import { embedSingle } from "../context/embedder.ts";
|
|
11
|
+
import { FetchFailureError, fetchUrl } from "../context/fetcher.ts";
|
|
11
12
|
import {
|
|
12
13
|
type PreparedIngestion,
|
|
13
14
|
prepareIngestion,
|
|
14
15
|
storeIngestion,
|
|
15
16
|
} from "../context/ingest.ts";
|
|
17
|
+
import { isUrl, urlToContextPath } from "../context/url-utils.ts";
|
|
16
18
|
import type { DbConnection } from "../db/connection.ts";
|
|
17
19
|
import {
|
|
18
20
|
type ContextItem,
|
|
@@ -24,6 +26,7 @@ import {
|
|
|
24
26
|
upsertContextItem,
|
|
25
27
|
} from "../db/context.ts";
|
|
26
28
|
import { getEmbeddingsForItem, hybridSearch } from "../db/embeddings.ts";
|
|
29
|
+
import { createMcpxClient } from "../mcpx/client.ts";
|
|
27
30
|
import { logger } from "../utils/logger.ts";
|
|
28
31
|
import {
|
|
29
32
|
registerContextToolSubcommands,
|
|
@@ -63,7 +66,7 @@ export function registerContextCommand(program: Command) {
|
|
|
63
66
|
return;
|
|
64
67
|
}
|
|
65
68
|
|
|
66
|
-
const header = `${ansis.bold("Path".padEnd(35))} ${"Title".padEnd(20)} ${"Description".padEnd(30)} ${"Type".padEnd(15)} ${"Updated".padEnd(18)} Indexed`;
|
|
69
|
+
const header = `${ansis.bold("ID".padEnd(36))} ${ansis.bold("Path".padEnd(35))} ${"Title".padEnd(20)} ${"Description".padEnd(30)} ${"Source".padEnd(6)} ${"Type".padEnd(15)} ${"Updated".padEnd(18)} Indexed`;
|
|
67
70
|
console.log(header);
|
|
68
71
|
console.log("-".repeat(header.length));
|
|
69
72
|
|
|
@@ -75,8 +78,13 @@ export function registerContextCommand(program: Command) {
|
|
|
75
78
|
const desc = item.description
|
|
76
79
|
? ansis.dim(item.description.slice(0, 29).padEnd(30))
|
|
77
80
|
: ansis.dim("".padEnd(30));
|
|
81
|
+
const source =
|
|
82
|
+
item.source_type === "url"
|
|
83
|
+
? ansis.cyan("url".padEnd(6))
|
|
84
|
+
: ansis.dim("file".padEnd(6));
|
|
85
|
+
const id = ansis.dim(item.id.padEnd(36));
|
|
78
86
|
console.log(
|
|
79
|
-
`${item.context_path.slice(0, 34).padEnd(35)} ${item.title.slice(0, 19).padEnd(20)} ${desc} ${item.mime_type.slice(0, 14).padEnd(15)} ${updated} ${indexed}`,
|
|
87
|
+
`${id} ${item.context_path.slice(0, 34).padEnd(35)} ${item.title.slice(0, 19).padEnd(20)} ${desc} ${source} ${item.mime_type.slice(0, 14).padEnd(15)} ${updated} ${indexed}`,
|
|
80
88
|
);
|
|
81
89
|
}
|
|
82
90
|
|
|
@@ -86,87 +94,178 @@ export function registerContextCommand(program: Command) {
|
|
|
86
94
|
|
|
87
95
|
ctx
|
|
88
96
|
.command("add <paths...>")
|
|
89
|
-
.description("Add files or
|
|
97
|
+
.description("Add files, directories, or URLs to context")
|
|
90
98
|
.option("--prefix <prefix>", "virtual path prefix", "/")
|
|
99
|
+
.option("--name <path>", "custom context path (single URL only)")
|
|
100
|
+
.option(
|
|
101
|
+
"--prompt-addition <text>",
|
|
102
|
+
"extra guidance for the URL fetcher agent (e.g., auth notes, tool hints)",
|
|
103
|
+
)
|
|
91
104
|
.action((paths: string[], opts) =>
|
|
92
105
|
withDb(program, async (conn, dir) => {
|
|
93
|
-
// Phase 1: Scan all paths
|
|
106
|
+
// Phase 1: Scan all paths — separate URLs from local files
|
|
94
107
|
const filesToAdd: { filePath: string; contextPath: string }[] = [];
|
|
95
|
-
const
|
|
108
|
+
const urlsToAdd: { url: string; contextPath: string }[] = [];
|
|
109
|
+
const spinner = createSpinner("Scanning paths...").start();
|
|
110
|
+
|
|
111
|
+
// Validate --name: only valid with a single URL
|
|
112
|
+
if (opts.name && (paths.length > 1 || !paths[0] || !isUrl(paths[0]))) {
|
|
113
|
+
spinner.error({
|
|
114
|
+
text: "--name can only be used with a single URL",
|
|
115
|
+
});
|
|
116
|
+
process.exit(1);
|
|
117
|
+
}
|
|
96
118
|
|
|
97
119
|
for (const path of paths) {
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
120
|
+
if (isUrl(path)) {
|
|
121
|
+
const contextPath =
|
|
122
|
+
opts.name || urlToContextPath(path, opts.prefix);
|
|
123
|
+
urlsToAdd.push({ url: path, contextPath });
|
|
124
|
+
} else {
|
|
125
|
+
const resolvedPath = resolve(path);
|
|
126
|
+
let info: Awaited<ReturnType<typeof stat>>;
|
|
127
|
+
try {
|
|
128
|
+
info = await stat(resolvedPath);
|
|
129
|
+
} catch {
|
|
130
|
+
spinner.error({ text: `Path not found: ${resolvedPath}` });
|
|
131
|
+
process.exit(1);
|
|
132
|
+
}
|
|
106
133
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
134
|
+
if (info.isDirectory()) {
|
|
135
|
+
const entries = await walkDirectory(resolvedPath);
|
|
136
|
+
for (const filePath of entries) {
|
|
137
|
+
const relativePath = filePath.slice(resolvedPath.length);
|
|
138
|
+
filesToAdd.push({
|
|
139
|
+
filePath,
|
|
140
|
+
contextPath: join(opts.prefix, relativePath),
|
|
141
|
+
});
|
|
142
|
+
}
|
|
143
|
+
} else {
|
|
111
144
|
filesToAdd.push({
|
|
112
|
-
filePath,
|
|
113
|
-
contextPath: join(opts.prefix,
|
|
145
|
+
filePath: resolvedPath,
|
|
146
|
+
contextPath: join(opts.prefix, basename(resolvedPath)),
|
|
114
147
|
});
|
|
115
148
|
}
|
|
116
|
-
} else {
|
|
117
|
-
filesToAdd.push({
|
|
118
|
-
filePath: resolvedPath,
|
|
119
|
-
contextPath: join(opts.prefix, basename(resolvedPath)),
|
|
120
|
-
});
|
|
121
149
|
}
|
|
122
150
|
}
|
|
123
151
|
|
|
152
|
+
const totalCount = filesToAdd.length + urlsToAdd.length;
|
|
124
153
|
spinner.success({
|
|
125
|
-
text: `Found ${filesToAdd.length} file(s)
|
|
154
|
+
text: `Found ${totalCount} item(s) to add (${filesToAdd.length} file(s), ${urlsToAdd.length} URL(s)).`,
|
|
126
155
|
});
|
|
127
156
|
|
|
128
157
|
// Phase 2: Load config and upsert DB records (batched, parallel LLM descriptions)
|
|
129
158
|
const config = await loadConfig(dir);
|
|
130
159
|
const CONCURRENCY = 10;
|
|
131
160
|
let addCompleted = 0;
|
|
132
|
-
const upsertSpinner = createSpinner(
|
|
133
|
-
`Adding and describing 0/${filesToAdd.length} files...`,
|
|
134
|
-
).start();
|
|
135
161
|
const itemIds: { id: string; contextPath: string }[] = [];
|
|
136
162
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
const
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
163
|
+
// Process local files (with spinner — these are quick, no chatty logs)
|
|
164
|
+
if (filesToAdd.length > 0) {
|
|
165
|
+
const fileSpinner = createSpinner(
|
|
166
|
+
`Adding and describing 0/${filesToAdd.length} file(s)...`,
|
|
167
|
+
).start();
|
|
168
|
+
|
|
169
|
+
for (let i = 0; i < filesToAdd.length; i += CONCURRENCY) {
|
|
170
|
+
const batch = filesToAdd.slice(i, i + CONCURRENCY);
|
|
171
|
+
const results = await Promise.all(
|
|
172
|
+
batch.map(async ({ filePath, contextPath }) => {
|
|
173
|
+
const result = await addFile(
|
|
174
|
+
conn,
|
|
175
|
+
filePath,
|
|
176
|
+
contextPath,
|
|
177
|
+
config,
|
|
178
|
+
);
|
|
179
|
+
addCompleted++;
|
|
180
|
+
fileSpinner.update({
|
|
181
|
+
text: `Adding and describing ${addCompleted}/${filesToAdd.length} file(s)...`,
|
|
182
|
+
});
|
|
183
|
+
return result ? { id: result, contextPath } : null;
|
|
184
|
+
}),
|
|
185
|
+
);
|
|
186
|
+
for (const r of results) {
|
|
187
|
+
if (r) itemIds.push(r);
|
|
188
|
+
}
|
|
151
189
|
}
|
|
190
|
+
|
|
191
|
+
fileSpinner.success({
|
|
192
|
+
text: `Added and described ${addCompleted} file(s).`,
|
|
193
|
+
});
|
|
152
194
|
}
|
|
153
195
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
196
|
+
// Process URLs (no spinner — agent logs would interleave; render cleanly instead)
|
|
197
|
+
if (urlsToAdd.length > 0) {
|
|
198
|
+
const mcpxClient = await createMcpxClient(dir);
|
|
199
|
+
if (!mcpxClient) {
|
|
200
|
+
logger.dim(
|
|
201
|
+
"No MCP servers configured — remote fetches will use basic HTTP.",
|
|
202
|
+
);
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
let urlIdx = 0;
|
|
206
|
+
let urlAdded = 0;
|
|
207
|
+
for (const { url, contextPath } of urlsToAdd) {
|
|
208
|
+
urlIdx++;
|
|
209
|
+
console.log(
|
|
210
|
+
`\n${ansis.bold(`[${urlIdx}/${urlsToAdd.length}]`)} ${ansis.cyan(url)}`,
|
|
211
|
+
);
|
|
212
|
+
const result = await addUrl(
|
|
213
|
+
conn,
|
|
214
|
+
config,
|
|
215
|
+
url,
|
|
216
|
+
contextPath,
|
|
217
|
+
mcpxClient,
|
|
218
|
+
opts.promptAddition,
|
|
219
|
+
);
|
|
220
|
+
if (result.ok) {
|
|
221
|
+
urlAdded++;
|
|
222
|
+
itemIds.push({ id: result.id, contextPath });
|
|
223
|
+
console.log(` ${ansis.green("✔")} stored at ${contextPath}`);
|
|
224
|
+
} else if (result.actionable) {
|
|
225
|
+
console.log(
|
|
226
|
+
` ${ansis.red("✗")} ${ansis.bold("action required:")}`,
|
|
227
|
+
);
|
|
228
|
+
for (const line of result.error.split("\n")) {
|
|
229
|
+
console.log(` ${ansis.yellow(line)}`);
|
|
230
|
+
}
|
|
231
|
+
} else {
|
|
232
|
+
console.log(
|
|
233
|
+
` ${ansis.red("✗")} failed to fetch: ${result.error}`,
|
|
234
|
+
);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
const urlSummary = `Added ${urlAdded}/${urlsToAdd.length} URL(s).`;
|
|
239
|
+
if (urlAdded === urlsToAdd.length) {
|
|
240
|
+
console.log(`\n${ansis.green("✔")} ${urlSummary}`);
|
|
241
|
+
} else if (urlAdded === 0) {
|
|
242
|
+
console.log(`\n${ansis.red("✗")} ${urlSummary}`);
|
|
243
|
+
} else {
|
|
244
|
+
console.log(`\n${ansis.yellow("⚠")} ${urlSummary}`);
|
|
245
|
+
}
|
|
246
|
+
}
|
|
157
247
|
|
|
158
248
|
// Phase 3: Chunk + embed in parallel (network I/O)
|
|
159
249
|
if (itemIds.length === 0 || !config.openai_api_key) {
|
|
160
250
|
if (!config.openai_api_key) {
|
|
161
251
|
logger.dim("Skipping embeddings (no OpenAI API key configured).");
|
|
162
252
|
}
|
|
163
|
-
|
|
164
|
-
|
|
253
|
+
const msg = `Added ${itemIds.length}/${totalCount} item(s), 0 chunks indexed.`;
|
|
254
|
+
if (itemIds.length === totalCount) {
|
|
255
|
+
logger.success(msg);
|
|
256
|
+
process.exit(0);
|
|
257
|
+
} else if (itemIds.length === 0) {
|
|
258
|
+
logger.error(msg);
|
|
259
|
+
process.exit(1);
|
|
260
|
+
} else {
|
|
261
|
+
logger.warn(msg);
|
|
262
|
+
process.exit(1);
|
|
263
|
+
}
|
|
165
264
|
}
|
|
166
265
|
|
|
167
266
|
let completed = 0;
|
|
168
267
|
const embedSpinner = createSpinner(
|
|
169
|
-
`Embedding 0/${itemIds.length}
|
|
268
|
+
`Embedding 0/${itemIds.length} items...`,
|
|
170
269
|
).start();
|
|
171
270
|
|
|
172
271
|
const prepared: PreparedIngestion[] = [];
|
|
@@ -177,7 +276,7 @@ export function registerContextCommand(program: Command) {
|
|
|
177
276
|
const result = await prepareIngestion(conn, id, config);
|
|
178
277
|
completed++;
|
|
179
278
|
embedSpinner.update({
|
|
180
|
-
text: `Embedding ${completed}/${itemIds.length}
|
|
279
|
+
text: `Embedding ${completed}/${itemIds.length} items...`,
|
|
181
280
|
});
|
|
182
281
|
return result;
|
|
183
282
|
}),
|
|
@@ -187,7 +286,7 @@ export function registerContextCommand(program: Command) {
|
|
|
187
286
|
}
|
|
188
287
|
}
|
|
189
288
|
embedSpinner.success({
|
|
190
|
-
text: `Embedded ${prepared.length}
|
|
289
|
+
text: `Embedded ${prepared.length} item(s).`,
|
|
191
290
|
});
|
|
192
291
|
|
|
193
292
|
// Phase 4: Store embeddings (sequential, fast DB writes)
|
|
@@ -204,8 +303,14 @@ export function registerContextCommand(program: Command) {
|
|
|
204
303
|
const parts: string[] = [];
|
|
205
304
|
if (filesAdded > 0) parts.push(`${filesAdded} added`);
|
|
206
305
|
if (filesUpdated > 0) parts.push(`${filesUpdated} updated`);
|
|
207
|
-
|
|
208
|
-
|
|
306
|
+
const summary = `${parts.join(", ")} — ${chunks} chunk(s) indexed (${itemIds.length}/${totalCount} item(s)).`;
|
|
307
|
+
if (itemIds.length === totalCount) {
|
|
308
|
+
logger.success(summary);
|
|
309
|
+
process.exit(0);
|
|
310
|
+
} else {
|
|
311
|
+
logger.warn(summary);
|
|
312
|
+
process.exit(1);
|
|
313
|
+
}
|
|
209
314
|
}),
|
|
210
315
|
);
|
|
211
316
|
|
|
@@ -310,7 +415,9 @@ export function registerContextCommand(program: Command) {
|
|
|
310
415
|
|
|
311
416
|
ctx
|
|
312
417
|
.command("refresh [path]")
|
|
313
|
-
.description(
|
|
418
|
+
.description(
|
|
419
|
+
"Re-import files from disk / re-fetch URLs and re-embed if content changed",
|
|
420
|
+
)
|
|
314
421
|
.option("--all", "refresh all items with a source path")
|
|
315
422
|
.action((path: string | undefined, opts: { all?: boolean }) =>
|
|
316
423
|
withDb(program, async (conn, dir) => {
|
|
@@ -333,7 +440,11 @@ export function registerContextCommand(program: Command) {
|
|
|
333
440
|
|
|
334
441
|
const config = await loadConfig(dir);
|
|
335
442
|
|
|
336
|
-
//
|
|
443
|
+
// Init MCPX client if any URL items need refreshing
|
|
444
|
+
const hasUrls = sourced.some((i) => i.source_type === "url");
|
|
445
|
+
const mcpxClient = hasUrls ? await createMcpxClient(dir) : null;
|
|
446
|
+
|
|
447
|
+
// Phase 1: Read files / fetch URLs, compare, and update DB
|
|
337
448
|
const spinner = createSpinner(
|
|
338
449
|
`Refreshing 0/${sourced.length} items...`,
|
|
339
450
|
).start();
|
|
@@ -348,13 +459,21 @@ export function registerContextCommand(program: Command) {
|
|
|
348
459
|
});
|
|
349
460
|
try {
|
|
350
461
|
const sourcePath = item.source_path as string;
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
462
|
+
let content: string;
|
|
463
|
+
|
|
464
|
+
if (item.source_type === "url") {
|
|
465
|
+
const fetched = await fetchUrl(sourcePath, config, mcpxClient);
|
|
466
|
+
content = fetched.content;
|
|
467
|
+
} else {
|
|
468
|
+
const bunFile = Bun.file(sourcePath);
|
|
469
|
+
if (!(await bunFile.exists())) {
|
|
470
|
+
missing++;
|
|
471
|
+
logger.warn(` Missing: ${item.source_path}`);
|
|
472
|
+
continue;
|
|
473
|
+
}
|
|
474
|
+
content = await bunFile.text();
|
|
356
475
|
}
|
|
357
|
-
|
|
476
|
+
|
|
358
477
|
if (content === item.content) {
|
|
359
478
|
unchanged++;
|
|
360
479
|
continue;
|
|
@@ -363,11 +482,11 @@ export function registerContextCommand(program: Command) {
|
|
|
363
482
|
updated++;
|
|
364
483
|
toReembed.push(item.id);
|
|
365
484
|
} catch (err) {
|
|
366
|
-
logger.warn(` Error
|
|
485
|
+
logger.warn(` Error refreshing ${item.source_path}: ${err}`);
|
|
367
486
|
}
|
|
368
487
|
}
|
|
369
488
|
spinner.success({
|
|
370
|
-
text: `Checked ${sourced.length}
|
|
489
|
+
text: `Checked ${sourced.length} item(s): ${updated} updated, ${unchanged} unchanged, ${missing} missing.`,
|
|
371
490
|
});
|
|
372
491
|
|
|
373
492
|
// Phase 2: Re-embed changed items
|
|
@@ -381,7 +500,7 @@ export function registerContextCommand(program: Command) {
|
|
|
381
500
|
const CONCURRENCY = 10;
|
|
382
501
|
let completed = 0;
|
|
383
502
|
const embedSpinner = createSpinner(
|
|
384
|
-
`Embedding 0/${toReembed.length}
|
|
503
|
+
`Embedding 0/${toReembed.length} item(s)...`,
|
|
385
504
|
).start();
|
|
386
505
|
|
|
387
506
|
const prepared: PreparedIngestion[] = [];
|
|
@@ -392,7 +511,7 @@ export function registerContextCommand(program: Command) {
|
|
|
392
511
|
const result = await prepareIngestion(conn, id, config);
|
|
393
512
|
completed++;
|
|
394
513
|
embedSpinner.update({
|
|
395
|
-
text: `Embedding ${completed}/${toReembed.length}
|
|
514
|
+
text: `Embedding ${completed}/${toReembed.length} item(s)...`,
|
|
396
515
|
});
|
|
397
516
|
return result;
|
|
398
517
|
}),
|
|
@@ -402,7 +521,7 @@ export function registerContextCommand(program: Command) {
|
|
|
402
521
|
}
|
|
403
522
|
}
|
|
404
523
|
embedSpinner.success({
|
|
405
|
-
text: `Embedded ${prepared.length}
|
|
524
|
+
text: `Embedded ${prepared.length} item(s).`,
|
|
406
525
|
});
|
|
407
526
|
|
|
408
527
|
let chunks = 0;
|
|
@@ -412,7 +531,7 @@ export function registerContextCommand(program: Command) {
|
|
|
412
531
|
}
|
|
413
532
|
|
|
414
533
|
logger.success(
|
|
415
|
-
`Refreshed ${updated}
|
|
534
|
+
`Refreshed ${updated} item(s), ${chunks} chunk(s) re-indexed.`,
|
|
416
535
|
);
|
|
417
536
|
}),
|
|
418
537
|
);
|
|
@@ -476,6 +595,48 @@ async function addFile(
|
|
|
476
595
|
}
|
|
477
596
|
}
|
|
478
597
|
|
|
598
|
+
/**
 * Outcome of {@link addUrl}.
 *
 * - `ok: true` — the URL was fetched and stored; `id` is the context item id.
 * - `ok: false` — nothing was stored; `error` is a printable message and
 *   `actionable` is true when the failure was a `FetchFailureError`, i.e. the
 *   message describes something the user can do about it.
 */
type AddUrlResult =
  | { ok: true; id: string }
  | { ok: false; error: string; actionable: boolean };

/**
 * Fetch a URL, generate a description for it, and upsert it into context.
 *
 * Never throws and never returns null: every failure (fetching, describing,
 * or writing to the DB) is caught and reported through the `ok: false`
 * variant of the result.
 *
 * @param conn - Database connection used for the upsert.
 * @param config - Fully-resolved project configuration.
 * @param url - The URL to fetch; also stored as the item's source path.
 * @param contextPath - Virtual path the item is stored under.
 * @param mcpxClient - Result of `createMcpxClient`, forwarded to `fetchUrl`.
 * @param promptAddition - Optional extra guidance forwarded to `fetchUrl`.
 * @returns The stored item's id on success, or an error description.
 */
async function addUrl(
  conn: DbConnection,
  config: Required<BotholomewConfig>,
  url: string,
  contextPath: string,
  mcpxClient: Awaited<ReturnType<typeof createMcpxClient>>,
  promptAddition?: string,
): Promise<AddUrlResult> {
  try {
    const fetched = await fetchUrl(url, config, mcpxClient, promptAddition);

    // There is no local filename for a URL, so the hostname stands in for it
    // when asking for a description.
    const description = await generateDescription(config, {
      filename: new URL(url).hostname,
      mimeType: fetched.mimeType,
      content: fetched.content,
    });

    const item = await upsertContextItem(conn, {
      title: fetched.title,
      description,
      content: fetched.content,
      mimeType: fetched.mimeType,
      sourceType: "url",
      sourcePath: url,
      contextPath,
      isTextual: true,
    });

    return { ok: true, id: item.id };
  } catch (err) {
    // FetchFailureError carries a message written for the user; anything else
    // is an unexpected error and is reported as its string form.
    if (err instanceof FetchFailureError) {
      return { ok: false, error: err.userMessage, actionable: true };
    }
    return { ok: false, error: String(err), actionable: false };
  }
}
|
|
639
|
+
|
|
479
640
|
async function walkDirectory(dirPath: string): Promise<string[]> {
|
|
480
641
|
const files: string[] = [];
|
|
481
642
|
const entries = await readdir(dirPath, { withFileTypes: true });
|
package/src/context/chunker.ts
CHANGED
|
@@ -9,6 +9,13 @@ export interface Chunk {
|
|
|
9
9
|
const SHORT_CONTENT_THRESHOLD = 200;
|
|
10
10
|
const LLM_TIMEOUT_MS = 10_000;
|
|
11
11
|
const DEFAULT_OVERLAP_LINES = 2;
|
|
12
|
+
// OpenAI's embedding endpoint caps inputs at 8192 tokens. The cl100k_base
// tokenizer averages ~4 chars/token on plain English but can drop to ~2
// chars/token on dense/code/non-ASCII content. Capping chunks at 15k chars
// means ~6k tokens at ~2.5 chars/token and ~7.5k tokens even at ~2
// chars/token, so we stay under the 8192-token limit with some headroom for
// the title/description prefix prepended at embed time.
|
|
18
|
+
const MAX_CHUNK_CHARS = 15_000;
|
|
12
19
|
|
|
13
20
|
const CHUNKER_TOOL_NAME = "return_chunks";
|
|
14
21
|
const CHUNKER_TOOL = {
|
|
@@ -41,6 +48,90 @@ const CHUNKER_TOOL = {
|
|
|
41
48
|
},
|
|
42
49
|
};
|
|
43
50
|
|
|
51
|
+
/**
 * Split text into pieces no larger than `maxChars`, preferring paragraph,
 * line, and finally hard-character boundaries.
 *
 * Lengths are measured in UTF-16 code units (`String.prototype.length`).
 * When several paragraphs are packed into one piece they are re-joined with
 * a single blank line, so longer runs of blank lines are not preserved.
 *
 * @param text - The text to split.
 * @param maxChars - Maximum length of each returned piece.
 * @returns The pieces in document order. Text that already fits is returned
 *   unchanged as a single-element array.
 * @throws RangeError if `text` has to be split but `maxChars` is not a number
 *   of at least 1 (such values previously made the slicing loop run forever).
 */
function splitText(text: string, maxChars: number): string[] {
  if (text.length <= maxChars) return [text];

  // `!(x >= 1)` also rejects NaN, which a plain `x < 1` would let through.
  if (!(maxChars >= 1)) {
    throw new RangeError(`maxChars must be at least 1 (received ${maxChars})`);
  }

  // Try paragraph splits first. An oversized paragraph contains no blank
  // line, so the recursive call falls through to line or hard splitting.
  const paragraphs = text.split(/\n\n+/);
  if (paragraphs.length > 1) {
    return packSegments(paragraphs, "\n\n", maxChars, (paragraph) =>
      splitText(paragraph, maxChars),
    );
  }

  // Fall back to line splits.
  const lines = text.split("\n");
  if (lines.length > 1) {
    return packSegments(lines, "\n", maxChars, (line) =>
      hardSliceText(line, maxChars),
    );
  }

  // Last resort: hard slice.
  return hardSliceText(text, maxChars);
}

/**
 * Greedily pack consecutive segments into pieces of at most `maxChars`,
 * joining them with `separator`; a segment that cannot fit on its own is
 * handed to `splitOversized` and its pieces are emitted as-is.
 */
function packSegments(
  segments: readonly string[],
  separator: string,
  maxChars: number,
  splitOversized: (segment: string) => string[],
): string[] {
  const out: string[] = [];
  let buf = "";
  for (const segment of segments) {
    const candidate = buf ? `${buf}${separator}${segment}` : segment;
    if (candidate.length <= maxChars) {
      buf = candidate;
      continue;
    }
    if (buf) out.push(buf);
    if (segment.length <= maxChars) {
      buf = segment;
    } else {
      // Pushed one by one rather than spread, so a very large number of
      // pieces cannot exceed the engine's argument-count limit.
      for (const piece of splitOversized(segment)) out.push(piece);
      buf = "";
    }
  }
  if (buf) out.push(buf);
  return out;
}

/**
 * Cut text into consecutive slices of at most `maxChars` UTF-16 code units,
 * moving a cut back by one unit when it would separate a surrogate pair.
 * With `maxChars` of 1 a pair cannot be kept together, so the size limit wins.
 */
function hardSliceText(text: string, maxChars: number): string[] {
  const step = Math.max(1, Math.floor(maxChars));
  const out: string[] = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + step, text.length);
    if (end < text.length && end - start > 1) {
      const before = text.charCodeAt(end - 1);
      const after = text.charCodeAt(end);
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      if (splitsPair) end -= 1;
    }
    out.push(text.slice(start, end));
    start = end;
  }
  return out;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Ensure no chunk is longer than `maxChars`.
 *
 * Chunks that already fit keep their content; longer ones are replaced, in
 * place, by the pieces `splitText` produces for them. Every returned chunk is
 * a new object whose `index` is its position in the returned array.
 *
 * @param chunks - Chunks in document order.
 * @param maxChars - Size limit per chunk; defaults to `MAX_CHUNK_CHARS`.
 * @returns A new, re-indexed array of chunks, each within the limit.
 */
export function enforceMaxChunkSize(
  chunks: Chunk[],
  maxChars = MAX_CHUNK_CHARS,
): Chunk[] {
  const contents = chunks.flatMap(({ content }) =>
    content.length <= maxChars ? [content] : splitText(content, maxChars),
  );
  return contents.map((content, index) => ({ index, content }));
}
|
|
134
|
+
|
|
44
135
|
/**
|
|
45
136
|
* Add overlapping lines from the end of each chunk to the start of the next.
|
|
46
137
|
* Improves retrieval when concepts span chunk boundaries.
|
|
@@ -137,5 +228,11 @@ export async function chunk(
|
|
|
137
228
|
}
|
|
138
229
|
|
|
139
230
|
const chunks = await chunkWithLLM(content, mimeType, config);
|
|
140
|
-
|
|
231
|
+
// Enforce a hard size cap before AND after overlap. The first pass handles
|
|
232
|
+
// oversize chunks from the LLM (common for docs with very long lines); the
|
|
233
|
+
// second pass handles the rare case where added overlap pushes a near-limit
|
|
234
|
+
// chunk over.
|
|
235
|
+
const sized = enforceMaxChunkSize(chunks);
|
|
236
|
+
const withOverlap = addOverlapToChunks(sized);
|
|
237
|
+
return enforceMaxChunkSize(withOverlap);
|
|
141
238
|
}
|
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
2
|
+
import type {
|
|
3
|
+
Tool as AnthropicTool,
|
|
4
|
+
MessageParam,
|
|
5
|
+
ToolResultBlockParam,
|
|
6
|
+
ToolUseBlock,
|
|
7
|
+
} from "@anthropic-ai/sdk/resources/messages";
|
|
8
|
+
import type { McpxClient } from "@evantahler/mcpx";
|
|
9
|
+
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
10
|
+
import type { DbConnection } from "../db/connection.ts";
|
|
11
|
+
import { mcpExecTool } from "../tools/mcp/exec.ts";
|
|
12
|
+
import { mcpInfoTool } from "../tools/mcp/info.ts";
|
|
13
|
+
import { mcpListToolsTool } from "../tools/mcp/list-tools.ts";
|
|
14
|
+
import { mcpSearchTool } from "../tools/mcp/search.ts";
|
|
15
|
+
import type { ToolContext } from "../tools/tool.ts";
|
|
16
|
+
import { type AnyToolDefinition, toAnthropicTool } from "../tools/tool.ts";
|
|
17
|
+
import { logger } from "../utils/logger.ts";
|
|
18
|
+
import { stripHtmlTags } from "./url-utils.ts";
|
|
19
|
+
|
|
20
|
+
// NOTE(review): presumably an upper bound on stored content size — the use
// site is outside this view; confirm.
const MAX_CONTENT_BYTES = 500_000;
// Maximum number of LLM round-trips the fetcher loop will make for one URL.
const MAX_TURNS = 10;
// `max_tokens` sent with each fetcher-loop request; hitting it is surfaced to
// the user as a FetchFailureError.
const MAX_RESPONSE_TOKENS = 4_096;
// NOTE(review): presumably the length of the mcp_exec result preview shown to
// the LLM (the system prompt mentions a short preview) — confirm at use site.
const PREVIEW_CHARS = 2_000;
// NOTE(review): presumably the timeout for the plain HTTP fallback fetch — the
// use site is outside this view; confirm.
const HTTP_TIMEOUT_MS = 30_000;
|
|
25
|
+
|
|
26
|
+
/** Result of a successful URL fetch, as returned by `fetchUrl`. */
export interface FetchedContent {
  /** Human-readable title for the fetched content. */
  title: string;
  /** The fetched content itself, as text. */
  content: string;
  /** MIME type reported for `content`. */
  mimeType: string;
  /** NOTE(review): presumably the URL the content was fetched from — confirm. */
  sourceUrl: string;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Error whose message is meant to be shown to the user as-is.
 *
 * The same text is available as both `message` and `userMessage`; callers
 * that want to display it read `userMessage`.
 */
export class FetchFailureError extends Error {
  name = "FetchFailureError";

  /** User-facing explanation; identical to the constructor argument. */
  readonly userMessage: string;

  /** @param message - Text to show to the user. */
  constructor(message: string) {
    super(message);
    this.userMessage = message;
  }
}
|
|
41
|
+
|
|
42
|
+
// System prompt sent with every request in the fetcher loop. It describes the
// workflow (find an MCP tool, inspect it, execute it, then pick a result) and
// the three terminal tools the agent can end with: accept_content,
// request_http_fallback and report_failure.
const FETCHER_SYSTEM_PROMPT = `You are a content fetcher. Your job is to find the right MCP tool to retrieve the content at the given URL, run it, and tell the harness which result to save.

**Important: the harness captures the full result of every mcp_exec call automatically.** You only see a short preview of each result so you can verify it looks reasonable. You do NOT need to read or copy the full content — you just identify which exec call to save.

Strongly prefer markdown output. Most MCP tools support a markdown/format parameter — use it when available.

Workflow:
1. Use mcp_search or mcp_list_tools to find the best tool for this URL (e.g., Google Docs tools for docs.google.com, Firecrawl for generic web pages, GitHub tools for github.com).
2. Use mcp_info to inspect the tool's input schema.
3. Call mcp_exec with the right arguments — request markdown format when supported.
4. Look at the preview returned by mcp_exec. If it looks like the right content, call accept_content with the exec_call_id (the tool_use_id of the mcp_exec call) and a sensible title.

Terminal tools:
- accept_content(exec_call_id, title, mime_type?) — save the full content captured from a previous mcp_exec call. The harness has the full content; you just supply the id, title, and optional mime_type (defaults to text/markdown).
- request_http_fallback() — fall back to a basic HTTP fetch. Use only when no MCP tool can handle the URL after a genuine attempt. Tools like Firecrawl can handle most URLs, so don't give up on the first try.
- report_failure(message) — surface an actionable message to the user (e.g., "this Google Doc is private — share it with your service account", "Firecrawl is not authenticated"). Use only when there is a specific next step the user must take.`;
|
|
58
|
+
|
|
59
|
+
// Tool schema for `accept_content`: the agent names a previous mcp_exec call
// by id and supplies a title (and optionally a MIME type); it does not pass
// the content itself.
const acceptContentTool: AnthropicTool = {
  name: "accept_content",
  description:
    "Save the full content captured by the harness from a previous mcp_exec call. You only need to supply the exec_call_id (the tool_use_id of that mcp_exec call) and a title — the harness already has the full content. Do NOT paste content here.",
  input_schema: {
    type: "object" as const,
    properties: {
      exec_call_id: {
        type: "string",
        description:
          "The tool_use_id of the mcp_exec call whose result should be saved (the harness lists captured ids in mcp_exec previews).",
      },
      title: {
        type: "string",
        description:
          "A human-readable title for the content (e.g., the document title, or derived from the URL).",
      },
      mime_type: {
        type: "string",
        description: "MIME type of the content (defaults to text/markdown).",
      },
    },
    // mime_type is intentionally optional.
    required: ["exec_call_id", "title"],
  },
};
|
|
84
|
+
|
|
85
|
+
/** Input of an `accept_content` tool call; mirrors the schema of `acceptContentTool`. */
interface AcceptContentInput {
  /** tool_use_id of the mcp_exec call whose captured result should be saved. */
  exec_call_id: string;
  /** Human-readable title for the saved content. */
  title: string;
  /** Optional MIME type; the tool description states it defaults to text/markdown. */
  mime_type?: string;
}
|
|
90
|
+
|
|
91
|
+
// Tool schema for `request_http_fallback`: takes no arguments; the agent
// calls it to ask for a basic HTTP fetch instead of an MCP tool.
const requestHttpFallbackTool: AnthropicTool = {
  name: "request_http_fallback",
  description:
    "Fall back to a basic HTTP fetch. Use only when no MCP tool can handle the URL after a genuine attempt.",
  input_schema: {
    type: "object" as const,
    properties: {},
    required: [],
  },
};
|
|
101
|
+
|
|
102
|
+
// Tool schema for `report_failure`: the agent supplies a single user-facing
// message describing what the user must do to make the URL fetchable.
const reportFailureTool: AnthropicTool = {
  name: "report_failure",
  description:
    "Report a fetch failure with an actionable message for the user (e.g., 'this Google Doc is private — share it with your service account'). Use only when there is a clear next step the user must take.",
  input_schema: {
    type: "object" as const,
    properties: {
      message: {
        type: "string",
        description:
          "A clear, actionable, user-facing message explaining what the user needs to do to make this URL fetchable.",
      },
    },
    required: ["message"],
  },
};
|
|
118
|
+
|
|
119
|
+
/** Input of a `report_failure` tool call; mirrors the schema of `reportFailureTool`. */
interface ReportFailureInput {
  /** Actionable, user-facing explanation of the failure. */
  message: string;
}
|
|
122
|
+
|
|
123
|
+
// The MCP tools offered to the fetcher agent (list, search, info, exec); each
// is converted with `toAnthropicTool` when the loop builds its tool list.
// NOTE(review): the double casts bypass type checking — presumably the
// concrete tool types are not assignable to AnyToolDefinition; confirm and
// prefer fixing the types over casting.
const mcpTools: AnyToolDefinition[] = [
  mcpListToolsTool as unknown as AnyToolDefinition,
  mcpSearchTool as unknown as AnyToolDefinition,
  mcpInfoTool as unknown as AnyToolDefinition,
  mcpExecTool as unknown as AnyToolDefinition,
];
|
|
129
|
+
|
|
130
|
+
/**
 * Fetch the content behind a URL.
 *
 * With an MCPX client the fetcher agent loop runs first; if it yields no
 * result, or if there is no client at all, the plain HTTP fallback is used.
 * Errors raised by the agent loop or the fallback propagate to the caller.
 *
 * @param url - The URL to fetch.
 * @param config - Fully-resolved configuration; `anthropic_api_key` must be set.
 * @param mcpxClient - MCPX client, or null when none is available.
 * @param promptAddition - Optional extra guidance passed to the agent loop.
 * @returns The fetched content.
 * @throws Error if no Anthropic API key is configured.
 */
export async function fetchUrl(
  url: string,
  config: Required<BotholomewConfig>,
  mcpxClient: McpxClient | null,
  promptAddition?: string,
): Promise<FetchedContent> {
  if (!config.anthropic_api_key) {
    throw new Error(
      "Anthropic API key is required for URL fetching. Set ANTHROPIC_API_KEY or configure it in .botholomew/config.json",
    );
  }

  if (mcpxClient) {
    const fetched = await runFetcherLoop(
      url,
      config,
      mcpxClient,
      promptAddition,
    );
    if (fetched) return fetched;
    logger.dim(" agent signaled fallback — using HTTP");
  } else {
    logger.dim(" no MCPX client — using HTTP fallback");
  }

  // Both "no client" and "agent gave up" end in the same plain HTTP fetch.
  return httpFallback(url);
}
|
|
153
|
+
|
|
154
|
+
/**
 * Builds an error `tool_result` for every `tool_use` block of a turn that is
 * being handed back to the agent because its `accept_content` call was unusable.
 *
 * The Messages API requires each `tool_use` block in an assistant message to be
 * answered by a `tool_result` in the following user message, so sibling calls
 * issued in the same turn must be answered too even though they were not run.
 */
function rejectAcceptContentTurn(
  toolUseBlocks: ToolUseBlock[],
  acceptCallId: string,
  acceptError: string,
): ToolResultBlockParam[] {
  return toolUseBlocks.map((block) => ({
    type: "tool_result" as const,
    tool_use_id: block.id,
    content:
      block.id === acceptCallId
        ? acceptError
        : `Not executed: the accept_content call issued in the same turn was rejected. Re-issue ${block.name} on its own if it is still needed.`,
    is_error: true,
  }));
}

/**
 * Runs the LLM tool-use loop that tries to fetch `url` through MCP tools.
 *
 * Each turn sends the conversation to the model and then handles the returned
 * tool calls in priority order: `report_failure`, `request_http_fallback`,
 * `accept_content`, and finally the MCP tools themselves.
 *
 * @param url - Address of the document to fetch.
 * @param config - Resolved configuration (API key and model are read from it).
 * @param mcpxClient - MCPX client handed to the MCP tools via the tool context.
 * @param promptAddition - Optional extra guidance appended to the user prompt.
 * @returns The accepted content, or `null` when the caller should fall back to
 *   plain HTTP (no tool calls, explicit fallback request, or `MAX_TURNS` used up).
 * @throws FetchFailureError when the model hits `max_tokens` or calls
 *   `report_failure`.
 */
async function runFetcherLoop(
  url: string,
  config: Required<BotholomewConfig>,
  mcpxClient: McpxClient,
  promptAddition?: string,
): Promise<FetchedContent | null> {
  const client = new Anthropic({ apiKey: config.anthropic_api_key });

  // NOTE(review): conn and projectDir are stubbed out here; presumably the MCP
  // tools never touch them — confirm before adding tools to this loop.
  const toolCtx: ToolContext = {
    conn: null as unknown as DbConnection,
    projectDir: "",
    config,
    mcpxClient,
  };

  const tools: AnthropicTool[] = [
    ...mcpTools.map(toAnthropicTool),
    acceptContentTool,
    requestHttpFallbackTool,
    reportFailureTool,
  ];

  // Cache of full mcp_exec results keyed by tool_use_id.
  // The LLM only sees a truncated preview; on accept_content it references
  // the id and the harness saves the captured content.
  const execResults = new Map<
    string,
    { server: string; tool: string; content: string; mimeType: string }
  >();

  const userPrompt = promptAddition
    ? `Fetch the content at: ${url}\n\nAdditional guidance:\n${promptAddition}`
    : `Fetch the content at: ${url}`;
  const messages: MessageParam[] = [{ role: "user", content: userPrompt }];

  for (let turn = 0; turn < MAX_TURNS; turn++) {
    const response = await client.messages.create({
      model: config.model,
      max_tokens: MAX_RESPONSE_TOKENS,
      system: FETCHER_SYSTEM_PROMPT,
      messages,
      tools,
    });

    // Log assistant text reasoning
    for (const block of response.content) {
      if (block.type === "text" && block.text.trim()) {
        logger.dim(` turn ${turn + 1}: ${block.text.trim()}`);
      }
    }

    if (response.stop_reason === "max_tokens") {
      throw new FetchFailureError(
        `The fetched document is too large to return in a single LLM response (hit max_tokens=${MAX_RESPONSE_TOKENS}). Try fetching a smaller section, a specific page, or a tool that supports pagination.`,
      );
    }

    const toolUseBlocks = response.content.filter(
      (block): block is ToolUseBlock => block.type === "tool_use",
    );

    if (toolUseBlocks.length === 0) {
      logger.dim(` turn ${turn + 1}: no tool calls — signaling fallback`);
      return null;
    }

    messages.push({ role: "assistant", content: response.content });

    // Check for report_failure first (terminal — surfaces actionable user message)
    const failureCall = toolUseBlocks.find((b) => b.name === "report_failure");
    if (failureCall) {
      const input = failureCall.input as Partial<ReportFailureInput>;
      const message =
        typeof input.message === "string" && input.message.trim()
          ? input.message
          : "Fetch failed but the agent did not provide a message.";
      logger.dim(` turn ${turn + 1}: report_failure: ${message}`);
      throw new FetchFailureError(message);
    }

    // Check for request_http_fallback (terminal)
    const fallbackCall = toolUseBlocks.find(
      (b) => b.name === "request_http_fallback",
    );
    if (fallbackCall) {
      logger.dim(` turn ${turn + 1}: agent requested HTTP fallback`);
      return null;
    }

    // Check for accept_content (terminal — looks up captured exec result)
    const acceptCall = toolUseBlocks.find((b) => b.name === "accept_content");
    if (acceptCall) {
      const input = acceptCall.input as Partial<AcceptContentInput>;
      if (
        typeof input.exec_call_id !== "string" ||
        typeof input.title !== "string"
      ) {
        logger.dim(
          ` turn ${turn + 1}: accept_content missing required fields — asking agent to retry`,
        );
        // Answer every tool_use of this turn, not just accept_content;
        // otherwise the next request carries unanswered tool_use ids.
        messages.push({
          role: "user",
          content: rejectAcceptContentTurn(
            toolUseBlocks,
            acceptCall.id,
            "Invalid accept_content call: both 'exec_call_id' and 'title' are required strings.",
          ),
        });
        continue;
      }
      const cached = execResults.get(input.exec_call_id);
      if (!cached) {
        const validIds = [...execResults.keys()];
        logger.dim(
          ` turn ${turn + 1}: accept_content: unknown exec_call_id "${input.exec_call_id}"`,
        );
        // Same as above: sibling tool calls (e.g. an mcp_exec issued in the
        // same turn) were not run and still need a tool_result.
        messages.push({
          role: "user",
          content: rejectAcceptContentTurn(
            toolUseBlocks,
            acceptCall.id,
            `No mcp_exec call with id "${input.exec_call_id}" was captured. Captured ids: ${validIds.length ? validIds.join(", ") : "(none yet — run mcp_exec first)"}.`,
          ),
        });
        continue;
      }
      // An explicit mime type from the agent wins over the one stored at capture time.
      const mimeType = input.mime_type || cached.mimeType;
      logger.dim(
        ` turn ${turn + 1}: accept_content: "${input.title}" (${cached.content.length} chars, ${mimeType}, from ${cached.server}/${cached.tool})`,
      );
      return {
        title: input.title,
        content: cached.content.slice(0, MAX_CONTENT_BYTES),
        mimeType,
        sourceUrl: url,
      };
    }

    // Execute non-terminal MCP tools in parallel
    const toolResults: ToolResultBlockParam[] = await Promise.all(
      toolUseBlocks.map(async (toolUse) => {
        // Log which tool the agent selected (and the underlying MCP server/tool for mcp_exec)
        const toolInput = toolUse.input as Record<string, unknown>;
        if (toolUse.name === "mcp_exec") {
          logger.dim(
            ` turn ${turn + 1}: mcp_exec → ${toolInput.server}/${toolInput.tool}`,
          );
        } else {
          const args = JSON.stringify(toolInput).slice(0, 80);
          logger.dim(` turn ${turn + 1}: ${toolUse.name}(${args})`);
        }

        const toolDef = mcpTools.find((t) => t.name === toolUse.name);
        if (!toolDef) {
          return {
            type: "tool_result" as const,
            tool_use_id: toolUse.id,
            content: `Unknown tool: ${toolUse.name}`,
            is_error: true,
          };
        }

        try {
          const parsed = toolDef.inputSchema.safeParse(toolUse.input);
          if (!parsed.success) {
            return {
              type: "tool_result" as const,
              tool_use_id: toolUse.id,
              content: `Invalid input: ${parsed.error.message}`,
              is_error: true,
            };
          }
          const result = await toolDef.execute(parsed.data, toolCtx);
          if (result.is_error) {
            logger.dim(
              ` → error: ${JSON.stringify(result).slice(0, 160)}`,
            );
            return {
              type: "tool_result" as const,
              tool_use_id: toolUse.id,
              content: JSON.stringify(result),
              is_error: true,
            };
          }

          // For successful mcp_exec calls, capture the full content in the
          // harness and send only a preview to the LLM. The LLM accepts the
          // result by referring to its tool_use_id.
          if (toolUse.name === "mcp_exec") {
            const execResult = result as {
              result: string;
              is_error: boolean;
            };
            const content = execResult.result;
            execResults.set(toolUse.id, {
              server: String(toolInput.server),
              tool: String(toolInput.tool),
              content,
              mimeType: "text/markdown",
            });
            const preview =
              content.length > PREVIEW_CHARS
                ? `${content.slice(0, PREVIEW_CHARS)}\n\n[... ${content.length - PREVIEW_CHARS} more chars truncated. Full content (${content.length} chars total) is captured by the harness with exec_call_id="${toolUse.id}". Call accept_content with this id to save it.]`
                : `${content}\n\n[Full content (${content.length} chars) captured by the harness with exec_call_id="${toolUse.id}". Call accept_content with this id to save it.]`;
            logger.dim(
              ` → captured ${content.length} chars (id=${toolUse.id})`,
            );
            return {
              type: "tool_result" as const,
              tool_use_id: toolUse.id,
              content: preview,
            };
          }

          return {
            type: "tool_result" as const,
            tool_use_id: toolUse.id,
            content: JSON.stringify(result),
          };
        } catch (err) {
          // A throwing tool is reported back to the agent instead of aborting the loop.
          logger.dim(` → exception: ${err}`);
          return {
            type: "tool_result" as const,
            tool_use_id: toolUse.id,
            content: `Error: ${err}`,
            is_error: true,
          };
        }
      }),
    );

    messages.push({ role: "user", content: toolResults });
  }

  logger.dim(` max turns (${MAX_TURNS}) exceeded — signaling fallback`);
  return null;
}
|
|
398
|
+
|
|
399
|
+
/**
 * Fetches `url` with a plain HTTP GET, without involving the agent.
 *
 * HTML responses (content-type containing `text/html`) have their `<title>`
 * used as the title, their tags stripped, and are reported as `text/markdown`.
 * Any other response is returned as-is with the URL as its title and the
 * content-type (minus parameters) as its mime type, or `text/plain` when the
 * header is absent. Content is cut to `MAX_CONTENT_BYTES` characters.
 *
 * @param url - Address to request.
 * @returns The downloaded content.
 * @throws Error when the response status is not OK; errors from `fetch`
 *   itself (including the timeout abort) propagate unchanged.
 */
export async function httpFallback(url: string): Promise<FetchedContent> {
  const response = await fetch(url, {
    headers: { "User-Agent": "Botholomew/1.0" },
    signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} ${response.statusText}: ${url}`);
  }

  const contentType = response.headers.get("content-type") || "";
  const body = await response.text();

  // Non-HTML payloads pass through untouched apart from the size cap.
  if (!contentType.includes("text/html")) {
    return {
      title: url,
      content: body.slice(0, MAX_CONTENT_BYTES),
      mimeType: contentType.split(";")[0] || "text/plain",
      sourceUrl: url,
    };
  }

  // The title is read from the raw markup, before the tags are stripped.
  const rawTitle = body.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1];
  return {
    title: rawTitle ? rawTitle.trim() : url,
    content: stripHtmlTags(body).slice(0, MAX_CONTENT_BYTES),
    mimeType: "text/markdown",
    sourceUrl: url,
  };
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
 * Reports whether `input` parses as an absolute URL using the `http:` or
 * `https:` scheme. Anything that fails to parse yields `false`.
 */
export function isUrl(input: string): boolean {
  let protocol: string;
  try {
    protocol = new URL(input).protocol;
  } catch {
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
|
|
12
|
+
|
|
13
|
+
/**
 * Maps a URL onto a virtual context path below `prefix`.
 *
 * The result is `{prefix}/{hostname}/{slug}.md`, where the slug is the URL's
 * pathname with its outer slashes removed, every character outside
 * `[a-zA-Z0-9-_.]` (inner slashes included) replaced by `-`, and runs of
 * dashes collapsed. The query string and fragment are not used. A URL without
 * a path maps to `{prefix}/{hostname}.md`.
 *
 * Example: `https://docs.google.com/document/d/abc123/edit` →
 * `{prefix}/docs.google.com/document-d-abc123-edit.md`
 *
 * Paths longer than 120 characters are cut to their first 114 characters
 * followed by `.md`.
 *
 * @throws TypeError when `url` cannot be parsed by the `URL` constructor.
 */
export function urlToContextPath(url: string, prefix: string): string {
  const { hostname, pathname } = new URL(url);

  const slugifiedPath = pathname
    .replace(/\/+$/, "")
    .replace(/^\/+/, "")
    .replace(/[^a-zA-Z0-9\-_.]/g, "-")
    .replace(/-{2,}/g, "-");

  const segments = [prefix.replace(/\/+$/, ""), hostname];
  if (slugifiedPath) {
    segments.push(slugifiedPath);
  }

  const candidate = `${segments.join("/")}.md`;
  return candidate.length > 120 ? `${candidate.slice(0, 114)}.md` : candidate;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Reduces an HTML document to its text.
 *
 * Script and style elements are dropped together with their contents, every
 * remaining tag is removed, runs of spaces/tabs become a single space, three
 * or more consecutive newlines become a blank line, and the result is trimmed.
 * HTML entities are left as they appear in the input.
 */
export function stripHtmlTags(html: string): string {
  // Order matters: script/style bodies must go before the generic tag pass,
  // or their contents would survive as text.
  const passes: ReadonlyArray<readonly [RegExp, string]> = [
    [/<script[\s\S]*?<\/script>/gi, ""],
    [/<style[\s\S]*?<\/style>/gi, ""],
    [/<[^>]*>/g, ""],
    [/[ \t]+/g, " "],
    [/\n{3,}/g, "\n\n"],
  ];

  let text = html;
  for (const [pattern, replacement] of passes) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
package/src/db/context.ts
CHANGED
|
@@ -9,6 +9,7 @@ export interface ContextItem {
|
|
|
9
9
|
content: string | null;
|
|
10
10
|
mime_type: string;
|
|
11
11
|
is_textual: boolean;
|
|
12
|
+
source_type: "file" | "url";
|
|
12
13
|
source_path: string | null;
|
|
13
14
|
context_path: string;
|
|
14
15
|
indexed_at: Date | null;
|
|
@@ -30,6 +31,7 @@ interface ContextItemRow {
|
|
|
30
31
|
content_blob: unknown;
|
|
31
32
|
mime_type: string;
|
|
32
33
|
is_textual: boolean;
|
|
34
|
+
source_type: string;
|
|
33
35
|
source_path: string | null;
|
|
34
36
|
context_path: string;
|
|
35
37
|
indexed_at: string | null;
|
|
@@ -45,6 +47,7 @@ function rowToContextItem(row: ContextItemRow): ContextItem {
|
|
|
45
47
|
content: row.content,
|
|
46
48
|
mime_type: row.mime_type,
|
|
47
49
|
is_textual: !!row.is_textual,
|
|
50
|
+
source_type: row.source_type as "file" | "url",
|
|
48
51
|
source_path: row.source_path,
|
|
49
52
|
context_path: row.context_path,
|
|
50
53
|
indexed_at: row.indexed_at ? new Date(row.indexed_at) : null,
|
|
@@ -61,6 +64,7 @@ export async function createContextItem(
|
|
|
61
64
|
title: string;
|
|
62
65
|
content?: string;
|
|
63
66
|
mimeType?: string;
|
|
67
|
+
sourceType?: "file" | "url";
|
|
64
68
|
sourcePath?: string;
|
|
65
69
|
contextPath: string;
|
|
66
70
|
description?: string;
|
|
@@ -69,8 +73,8 @@ export async function createContextItem(
|
|
|
69
73
|
): Promise<ContextItem> {
|
|
70
74
|
const id = uuidv7();
|
|
71
75
|
const row = await db.queryGet<ContextItemRow>(
|
|
72
|
-
`INSERT INTO context_items (id, title, description, content, mime_type, is_textual, source_path, context_path)
|
|
73
|
-
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
|
|
76
|
+
`INSERT INTO context_items (id, title, description, content, mime_type, is_textual, source_type, source_path, context_path)
|
|
77
|
+
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
|
|
74
78
|
RETURNING *`,
|
|
75
79
|
id,
|
|
76
80
|
params.title,
|
|
@@ -78,6 +82,7 @@ export async function createContextItem(
|
|
|
78
82
|
params.content ?? null,
|
|
79
83
|
params.mimeType ?? "text/plain",
|
|
80
84
|
params.isTextual !== false,
|
|
85
|
+
params.sourceType ?? "file",
|
|
81
86
|
params.sourcePath ?? null,
|
|
82
87
|
params.contextPath,
|
|
83
88
|
);
|
|
@@ -99,6 +104,7 @@ export async function upsertContextItem(
|
|
|
99
104
|
title: string;
|
|
100
105
|
content?: string;
|
|
101
106
|
mimeType?: string;
|
|
107
|
+
sourceType?: "file" | "url";
|
|
102
108
|
sourcePath?: string;
|
|
103
109
|
contextPath: string;
|
|
104
110
|
description?: string;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
-- Adds a source_type column to context_items; it defaults to 'file'.
ALTER TABLE context_items ADD COLUMN source_type TEXT DEFAULT 'file';
|