botholomew 0.8.10 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/chat/agent.ts +5 -3
- package/src/commands/context.ts +223 -373
- package/src/commands/tools.ts +100 -11
- package/src/context/describer.ts +3 -118
- package/src/context/drives.ts +110 -0
- package/src/context/fetcher.ts +11 -1
- package/src/context/ingest.ts +13 -10
- package/src/context/refresh.ts +39 -24
- package/src/context/url-utils.ts +0 -23
- package/src/db/context.ts +195 -119
- package/src/db/embeddings.ts +35 -16
- package/src/db/sql/13-drive-paths.sql +49 -0
- package/src/tools/context/list-drives.ts +36 -0
- package/src/tools/context/refresh.ts +41 -23
- package/src/tools/context/search.ts +8 -3
- package/src/tools/dir/create.ts +14 -11
- package/src/tools/dir/size.ts +3 -2
- package/src/tools/dir/tree.ts +57 -17
- package/src/tools/file/copy.ts +14 -8
- package/src/tools/file/count-lines.ts +6 -3
- package/src/tools/file/delete.ts +12 -5
- package/src/tools/file/edit.ts +5 -3
- package/src/tools/file/exists.ts +25 -3
- package/src/tools/file/info.ts +90 -18
- package/src/tools/file/move.ts +15 -16
- package/src/tools/file/read.ts +79 -5
- package/src/tools/file/write.ts +29 -12
- package/src/tools/registry.ts +2 -2
- package/src/tools/search/grep.ts +44 -11
- package/src/tools/search/semantic.ts +7 -3
- package/src/tui/components/ContextPanel.tsx +73 -35
- package/src/tui/markdown.ts +2 -3
- package/src/worker/prompt.ts +3 -2
- package/src/tools/dir/list.ts +0 -89
package/src/commands/tools.ts
CHANGED
|
@@ -3,6 +3,10 @@ import type { Command } from "commander";
|
|
|
3
3
|
import { z } from "zod";
|
|
4
4
|
import { loadConfig } from "../config/loader.ts";
|
|
5
5
|
import { getDbPath } from "../constants.ts";
|
|
6
|
+
import { parseDriveRef } from "../context/drives.ts";
|
|
7
|
+
import type { DbConnection } from "../db/connection.ts";
|
|
8
|
+
import { getContextItemById } from "../db/context.ts";
|
|
9
|
+
import { isUuid } from "../db/uuid.ts";
|
|
6
10
|
import { registerAllTools } from "../tools/registry.ts";
|
|
7
11
|
import {
|
|
8
12
|
type AnyToolDefinition,
|
|
@@ -43,7 +47,7 @@ export function registerSearchToolSubcommands(parent: Command) {
|
|
|
43
47
|
}
|
|
44
48
|
}
|
|
45
49
|
|
|
46
|
-
/** Derive CLI subcommand name from tool name: "context_read" → "read", "
|
|
50
|
+
/** Derive CLI subcommand name from tool name: "context_read" → "read", "context_create_dir" → "create-dir" */
|
|
47
51
|
function deriveSubName(toolName: string): string {
  // Strip the leading namespace: one or more non-underscore characters
  // followed by the first underscore. Names without such a prefix (no
  // underscore at all, or a leading underscore) pass through unchanged.
  const withoutNamespace = toolName.replace(/^[^_]+_/, "");
  // Every remaining underscore becomes a dash to form the CLI subcommand.
  return withoutNamespace.split("_").join("-");
}
|
|
@@ -64,7 +68,7 @@ function registerToolAsCLI(parent: Command, tool: AnyToolDefinition) {
|
|
|
64
68
|
for (const [key, schema] of Object.entries(shape)) {
|
|
65
69
|
const desc = schema.description ?? key;
|
|
66
70
|
const isOptional = schema.isOptional();
|
|
67
|
-
const unwrapped =
|
|
71
|
+
const unwrapped = unwrapSchema(schema);
|
|
68
72
|
|
|
69
73
|
if (isPositionalArg(key, tool.name)) {
|
|
70
74
|
positionals.push(isOptional ? `[${key}]` : `<${key}>`);
|
|
@@ -109,7 +113,14 @@ function registerToolAsCLI(parent: Command, tool: AnyToolDefinition) {
|
|
|
109
113
|
while (root.parent) root = root.parent;
|
|
110
114
|
return withDb(root, async (conn, dir) => {
|
|
111
115
|
try {
|
|
112
|
-
const input = buildInput(
|
|
116
|
+
const input = await buildInput(
|
|
117
|
+
tool,
|
|
118
|
+
positionals,
|
|
119
|
+
options,
|
|
120
|
+
shape,
|
|
121
|
+
args,
|
|
122
|
+
conn,
|
|
123
|
+
);
|
|
113
124
|
|
|
114
125
|
const ctx: ToolContext = {
|
|
115
126
|
conn,
|
|
@@ -129,7 +140,7 @@ function registerToolAsCLI(parent: Command, tool: AnyToolDefinition) {
|
|
|
129
140
|
});
|
|
130
141
|
}
|
|
131
142
|
|
|
132
|
-
function buildInput(
|
|
143
|
+
async function buildInput(
|
|
133
144
|
tool: AnyToolDefinition,
|
|
134
145
|
positionals: string[],
|
|
135
146
|
options: {
|
|
@@ -140,14 +151,36 @@ function buildInput(
|
|
|
140
151
|
}[],
|
|
141
152
|
shape: Record<string, z.ZodType>,
|
|
142
153
|
args: unknown[],
|
|
143
|
-
|
|
154
|
+
conn: DbConnection,
|
|
155
|
+
): Promise<Record<string, unknown>> {
|
|
144
156
|
const input: Record<string, unknown> = {};
|
|
145
157
|
|
|
146
|
-
// Positional args come first in Commander's action callback
|
|
158
|
+
// Positional args come first in Commander's action callback. Context tools
|
|
159
|
+
// carry `(drive, path)` or `(src_drive, src_path, …)` in their schema but
|
|
160
|
+
// accept a friendlier `drive:/path` or bare-UUID form as a single positional
|
|
161
|
+
// on the CLI.
|
|
147
162
|
for (let i = 0; i < positionals.length; i++) {
|
|
148
163
|
const key = positionals[i]?.replace(/[<>[\]]/g, "");
|
|
149
164
|
const value = args[i];
|
|
150
|
-
if (key
|
|
165
|
+
if (key === undefined || value === undefined) continue;
|
|
166
|
+
const splitTargets = driveRefSplitTargets(key, shape);
|
|
167
|
+
if (splitTargets && typeof value === "string") {
|
|
168
|
+
const parsed = parseDriveRef(value);
|
|
169
|
+
if (parsed) {
|
|
170
|
+
input[splitTargets.drive] = parsed.drive;
|
|
171
|
+
input[splitTargets.path] = parsed.path;
|
|
172
|
+
continue;
|
|
173
|
+
}
|
|
174
|
+
if (isUuid(value)) {
|
|
175
|
+
const item = await getContextItemById(conn, value);
|
|
176
|
+
if (item) {
|
|
177
|
+
input[splitTargets.drive] = item.drive;
|
|
178
|
+
input[splitTargets.path] = item.path;
|
|
179
|
+
continue;
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
input[key] = value;
|
|
151
184
|
}
|
|
152
185
|
|
|
153
186
|
// Options object is the last argument before the Command object
|
|
@@ -163,7 +196,7 @@ function buildInput(
|
|
|
163
196
|
|
|
164
197
|
const schemaForKey = shape[opt.key];
|
|
165
198
|
if (!schemaForKey) continue;
|
|
166
|
-
const unwrapped =
|
|
199
|
+
const unwrapped = unwrapSchema(schemaForKey);
|
|
167
200
|
|
|
168
201
|
// Parse JSON for array types
|
|
169
202
|
if (opt.isArray && typeof value === "string") {
|
|
@@ -192,6 +225,19 @@ function formatOutput(result: unknown, _toolName: string) {
|
|
|
192
225
|
if (typeof result === "object") {
|
|
193
226
|
const obj = result as Record<string, unknown>;
|
|
194
227
|
|
|
228
|
+
// Structured error shape: { is_error: true, message, next_action_hint? }
|
|
229
|
+
if (obj.is_error === true) {
|
|
230
|
+
const msg = typeof obj.message === "string" ? obj.message : "Error";
|
|
231
|
+
logger.error(msg);
|
|
232
|
+
if (
|
|
233
|
+
typeof obj.next_action_hint === "string" &&
|
|
234
|
+
obj.next_action_hint.length > 0
|
|
235
|
+
) {
|
|
236
|
+
console.log(ansis.dim(obj.next_action_hint));
|
|
237
|
+
}
|
|
238
|
+
process.exit(1);
|
|
239
|
+
}
|
|
240
|
+
|
|
195
241
|
// Special formatting for known output shapes
|
|
196
242
|
if ("tree" in obj && typeof obj.tree === "string") {
|
|
197
243
|
console.log(obj.tree);
|
|
@@ -217,6 +263,26 @@ function formatOutput(result: unknown, _toolName: string) {
|
|
|
217
263
|
return;
|
|
218
264
|
}
|
|
219
265
|
|
|
266
|
+
if ("drives" in obj && Array.isArray(obj.drives)) {
|
|
267
|
+
const drives = obj.drives as { drive: string; count: number }[];
|
|
268
|
+
if (drives.length === 0) {
|
|
269
|
+
if (typeof obj.hint === "string") console.log(ansis.dim(obj.hint));
|
|
270
|
+
return;
|
|
271
|
+
}
|
|
272
|
+
const widest = Math.max(...drives.map((d) => d.drive.length));
|
|
273
|
+
for (const d of drives) {
|
|
274
|
+
const label = `${d.drive}:/`.padEnd(widest + 2);
|
|
275
|
+
const plural = d.count === 1 ? "item" : "items";
|
|
276
|
+
console.log(
|
|
277
|
+
` ${ansis.cyan(label)} ${ansis.dim(`(${d.count} ${plural})`)}`,
|
|
278
|
+
);
|
|
279
|
+
}
|
|
280
|
+
if (typeof obj.hint === "string") {
|
|
281
|
+
console.log(`\n${ansis.dim(obj.hint)}`);
|
|
282
|
+
}
|
|
283
|
+
return;
|
|
284
|
+
}
|
|
285
|
+
|
|
220
286
|
if ("matches" in obj && Array.isArray(obj.matches)) {
|
|
221
287
|
for (const match of obj.matches) {
|
|
222
288
|
if (typeof match === "string") {
|
|
@@ -263,7 +329,6 @@ function isPositionalArg(key: string, toolName: string): boolean {
|
|
|
263
329
|
// These keys are treated as positional arguments
|
|
264
330
|
const positionalKeys: Record<string, string[]> = {
|
|
265
331
|
context_create_dir: ["path"],
|
|
266
|
-
context_list_dir: ["path"],
|
|
267
332
|
context_tree: ["path"],
|
|
268
333
|
context_dir_size: ["path"],
|
|
269
334
|
context_read: ["path"],
|
|
@@ -282,9 +347,33 @@ function isPositionalArg(key: string, toolName: string): boolean {
|
|
|
282
347
|
return positionalKeys[toolName]?.includes(key) ?? false;
|
|
283
348
|
}
|
|
284
349
|
|
|
285
|
-
function
|
|
350
|
+
function unwrapSchema(schema: z.ZodType): z.ZodType {
|
|
286
351
|
if (schema instanceof z.ZodOptional) {
|
|
287
|
-
return schema.unwrap() as z.ZodType;
|
|
352
|
+
return unwrapSchema(schema.unwrap() as z.ZodType);
|
|
353
|
+
}
|
|
354
|
+
if (schema instanceof z.ZodDefault) {
|
|
355
|
+
return unwrapSchema(schema.unwrap() as z.ZodType);
|
|
288
356
|
}
|
|
289
357
|
return schema;
|
|
290
358
|
}
|
|
359
|
+
|
|
360
|
+
/**
 * Map a positional CLI argument name onto the pair of schema fields that
 * receive the two halves of a drive reference.
 *
 * Recognised positionals and their targets:
 *   path → drive     / path
 *   src  → src_drive / src_path
 *   dst  → dst_drive / dst_path
 *
 * @param positionalKey Name of the positional argument being processed.
 * @param shape The tool's input schema shape, keyed by field name.
 * @returns The field names to write the drive and the path into, or null
 *   when the positional is not one of the recognised names or the schema
 *   lacks either target field.
 */
function driveRefSplitTargets(
  positionalKey: string,
  shape: Record<string, z.ZodType>,
): { drive: string; path: string } | null {
  let fieldPrefix: string;
  switch (positionalKey) {
    case "path":
      fieldPrefix = "";
      break;
    case "src":
      fieldPrefix = "src_";
      break;
    case "dst":
      fieldPrefix = "dst_";
      break;
    default:
      return null;
  }

  const targets = {
    drive: `${fieldPrefix}drive`,
    path: `${fieldPrefix}path`,
  };
  // Both halves must exist in the schema; otherwise the caller keeps the
  // positional value as a single field.
  return targets.drive in shape && targets.path in shape ? targets : null;
}
|
package/src/context/describer.ts
CHANGED
|
@@ -3,7 +3,6 @@ import type { BotholomewConfig } from "../config/schemas.ts";
|
|
|
3
3
|
import { logger } from "../utils/logger.ts";
|
|
4
4
|
|
|
5
5
|
const DESCRIBE_TOOL_NAME = "return_description";
|
|
6
|
-
const DESCRIBE_AND_PLACE_TOOL_NAME = "return_description_and_path";
|
|
7
6
|
|
|
8
7
|
const DESCRIBE_TOOL = {
|
|
9
8
|
name: DESCRIBE_TOOL_NAME,
|
|
@@ -21,28 +20,6 @@ const DESCRIBE_TOOL = {
|
|
|
21
20
|
},
|
|
22
21
|
};
|
|
23
22
|
|
|
24
|
-
const DESCRIBE_AND_PLACE_TOOL = {
|
|
25
|
-
name: DESCRIBE_AND_PLACE_TOOL_NAME,
|
|
26
|
-
description:
|
|
27
|
-
"Return a one-sentence description AND a suggested absolute folder path for this file.",
|
|
28
|
-
input_schema: {
|
|
29
|
-
type: "object" as const,
|
|
30
|
-
properties: {
|
|
31
|
-
description: {
|
|
32
|
-
type: "string",
|
|
33
|
-
description:
|
|
34
|
-
"A concise one-sentence summary of what this content is about.",
|
|
35
|
-
},
|
|
36
|
-
suggested_path: {
|
|
37
|
-
type: "string",
|
|
38
|
-
description:
|
|
39
|
-
"Absolute virtual-filesystem path (starts with /) where this file should live, including the filename. Prefer existing folders. Include a project/source disambiguator (e.g. /projects/<source-dir>/README.md) when the basename is likely to collide.",
|
|
40
|
-
},
|
|
41
|
-
},
|
|
42
|
-
required: ["description", "suggested_path"],
|
|
43
|
-
},
|
|
44
|
-
};
|
|
45
|
-
|
|
46
23
|
const TIMEOUT_MS = 10_000;
|
|
47
24
|
const MAX_CONTENT_CHARS = 8000;
|
|
48
25
|
const MAX_FILE_BYTES = 10 * 1024 * 1024; // 10 MB
|
|
@@ -56,35 +33,11 @@ const IMAGE_TYPES = new Set([
|
|
|
56
33
|
|
|
57
34
|
type ImageMediaType = "image/jpeg" | "image/png" | "image/gif" | "image/webp";
|
|
58
35
|
|
|
59
|
-
/**
|
|
60
|
-
* Build the message content array for the LLM description request.
|
|
61
|
-
* Attaches the file as an image or document block when possible.
|
|
62
|
-
*/
|
|
63
36
|
async function buildMessageContent(
|
|
64
37
|
opts: DescriberOpts,
|
|
65
|
-
includePlacement: boolean,
|
|
66
38
|
): Promise<Anthropic.Messages.ContentBlockParam[]> {
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
"",
|
|
70
|
-
"Also suggest an absolute folder path where this file should live in the virtual filesystem. Rules:",
|
|
71
|
-
"- Start with /",
|
|
72
|
-
"- Keep the basename close to the source filename",
|
|
73
|
-
"- STRONGLY prefer folders that already exist below — reuse them unless the new file is clearly unrelated to everything there. Do NOT invent a new folder that is a near-synonym of an existing one.",
|
|
74
|
-
"- Use at most 3 nested folders unless an existing folder already goes deeper",
|
|
75
|
-
"- If the basename is common (README.md, index.md, notes.md), include a project/source disambiguator from the source path",
|
|
76
|
-
opts.existingTree
|
|
77
|
-
? `\nExisting filesystem (folders end with /, files are listed under the folders they live in so you can see what kinds of documents are already there):\n${opts.existingTree}`
|
|
78
|
-
: "\nExisting filesystem: (empty — you are placing the first file)",
|
|
79
|
-
opts.sourcePath ? `\nSource filesystem path: ${opts.sourcePath}` : "",
|
|
80
|
-
]
|
|
81
|
-
.filter((s) => s.length > 0)
|
|
82
|
-
.join("\n")
|
|
83
|
-
: "";
|
|
84
|
-
|
|
85
|
-
const textPrompt = `Describe this file in one sentence. Be specific about what it contains, not generic.\n\nFilename: ${opts.filename}\nMIME type: ${opts.mimeType}${placementBlock ? `\n${placementBlock}` : ""}`;
|
|
86
|
-
|
|
87
|
-
// Text file — include content inline
|
|
39
|
+
const textPrompt = `Describe this file in one sentence. Be specific about what it contains, not generic.\n\nFilename: ${opts.filename}\nMIME type: ${opts.mimeType}`;
|
|
40
|
+
|
|
88
41
|
if (opts.content) {
|
|
89
42
|
const truncated =
|
|
90
43
|
opts.content.length > MAX_CONTENT_CHARS
|
|
@@ -93,7 +46,6 @@ async function buildMessageContent(
|
|
|
93
46
|
return [{ type: "text", text: `${textPrompt}\n\nContent:\n${truncated}` }];
|
|
94
47
|
}
|
|
95
48
|
|
|
96
|
-
// Binary file — try to attach if we have a file path
|
|
97
49
|
if (opts.filePath) {
|
|
98
50
|
const file = Bun.file(opts.filePath);
|
|
99
51
|
const size = file.size;
|
|
@@ -127,7 +79,6 @@ async function buildMessageContent(
|
|
|
127
79
|
}
|
|
128
80
|
}
|
|
129
81
|
|
|
130
|
-
// Fallback — describe from filename and MIME type only
|
|
131
82
|
return [
|
|
132
83
|
{
|
|
133
84
|
type: "text",
|
|
@@ -141,20 +92,6 @@ interface DescriberOpts {
|
|
|
141
92
|
mimeType: string;
|
|
142
93
|
content: string | null;
|
|
143
94
|
filePath?: string;
|
|
144
|
-
sourcePath?: string;
|
|
145
|
-
existingTree?: string;
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
/** Normalize and validate an LLM-suggested path. Returns null if invalid. */
|
|
149
|
-
export function sanitizeSuggestedPath(raw: string): string | null {
|
|
150
|
-
const trimmed = raw.trim();
|
|
151
|
-
if (!trimmed) return null;
|
|
152
|
-
if (!trimmed.startsWith("/")) return null;
|
|
153
|
-
if (trimmed.includes("..")) return null;
|
|
154
|
-
// Collapse repeated slashes, strip trailing slash (unless root).
|
|
155
|
-
const collapsed = trimmed.replace(/\/+/g, "/");
|
|
156
|
-
if (collapsed === "/") return null; // needs a filename
|
|
157
|
-
return collapsed.endsWith("/") ? collapsed.slice(0, -1) : collapsed;
|
|
158
95
|
}
|
|
159
96
|
|
|
160
97
|
/**
|
|
@@ -173,7 +110,7 @@ export async function generateDescription(
|
|
|
173
110
|
const client = new Anthropic({ apiKey: config.anthropic_api_key });
|
|
174
111
|
|
|
175
112
|
try {
|
|
176
|
-
const content = await buildMessageContent(opts
|
|
113
|
+
const content = await buildMessageContent(opts);
|
|
177
114
|
|
|
178
115
|
const response = await Promise.race([
|
|
179
116
|
client.messages.create({
|
|
@@ -201,55 +138,3 @@ export async function generateDescription(
|
|
|
201
138
|
return "";
|
|
202
139
|
}
|
|
203
140
|
}
|
|
204
|
-
|
|
205
|
-
/**
|
|
206
|
-
* Generate description + suggested_path in a single LLM call.
|
|
207
|
-
* Returns { description, suggested_path } on success, or null on failure.
|
|
208
|
-
*/
|
|
209
|
-
export async function generateDescriptionAndPath(
|
|
210
|
-
config: Required<BotholomewConfig>,
|
|
211
|
-
opts: DescriberOpts,
|
|
212
|
-
): Promise<{ description: string; suggested_path: string } | null> {
|
|
213
|
-
if (!config.anthropic_api_key) return null;
|
|
214
|
-
|
|
215
|
-
const client = new Anthropic({ apiKey: config.anthropic_api_key });
|
|
216
|
-
|
|
217
|
-
try {
|
|
218
|
-
const content = await buildMessageContent(opts, true);
|
|
219
|
-
|
|
220
|
-
const response = await Promise.race([
|
|
221
|
-
client.messages.create({
|
|
222
|
-
model: config.chunker_model,
|
|
223
|
-
max_tokens: 512,
|
|
224
|
-
tools: [DESCRIBE_AND_PLACE_TOOL],
|
|
225
|
-
tool_choice: { type: "tool", name: DESCRIBE_AND_PLACE_TOOL_NAME },
|
|
226
|
-
messages: [{ role: "user", content }],
|
|
227
|
-
}),
|
|
228
|
-
new Promise<never>((_, reject) =>
|
|
229
|
-
setTimeout(
|
|
230
|
-
() => reject(new Error("Description+path generation timeout")),
|
|
231
|
-
TIMEOUT_MS,
|
|
232
|
-
),
|
|
233
|
-
),
|
|
234
|
-
]);
|
|
235
|
-
|
|
236
|
-
const toolBlock = response.content.find((b) => b.type === "tool_use");
|
|
237
|
-
if (!toolBlock || toolBlock.type !== "tool_use") return null;
|
|
238
|
-
|
|
239
|
-
const input = toolBlock.input as {
|
|
240
|
-
description?: string;
|
|
241
|
-
suggested_path?: string;
|
|
242
|
-
};
|
|
243
|
-
const suggested = input.suggested_path
|
|
244
|
-
? sanitizeSuggestedPath(input.suggested_path)
|
|
245
|
-
: null;
|
|
246
|
-
if (!suggested) return null;
|
|
247
|
-
return {
|
|
248
|
-
description: input.description || "",
|
|
249
|
-
suggested_path: suggested,
|
|
250
|
-
};
|
|
251
|
-
} catch (err) {
|
|
252
|
-
logger.debug(`Description+path generation failed: ${err}`);
|
|
253
|
-
return null;
|
|
254
|
-
}
|
|
255
|
-
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Drives name the origin of a context item. Every item lives at a
|
|
3
|
+
* `(drive, path)` pair; the `drive:/path` string form is a display and CLI
|
|
4
|
+
* convention (single column queries use the two columns directly).
|
|
5
|
+
*
|
|
6
|
+
* Built-in drives:
|
|
7
|
+
* disk — local filesystem; path is the absolute filesystem path
|
|
8
|
+
* url — generic HTTP(S) URL; path is the full URL
|
|
9
|
+
* agent — agent-authored scratch; path is whatever the agent chose
|
|
10
|
+
* google-docs — Google Docs; path is `/<docId>`
|
|
11
|
+
* github — GitHub content; path is `/<owner>/<repo>/<rest>`
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/** Names of the built-in drives, kept as a readonly tuple of string literals. */
export const BUILT_IN_DRIVES = [
  "disk", // local filesystem
  "url", // generic HTTP(S) URL
  "agent", // agent-authored scratch
  "google-docs", // Google Docs
  "github", // GitHub content
] as const;
|
|
21
|
+
|
|
22
|
+
/** A `(drive, path)` pair identifying where a context item lives. */
export interface DriveTarget {
  /** Name of the drive the item belongs to (e.g. one of the built-in drives). */
  drive: string;
  /** Location of the item within that drive. */
  path: string;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Split a `drive:/path` reference into its `{ drive, path }` parts.
 *
 * The drive is everything before the first colon and must start with a
 * lowercase letter followed only by lowercase letters, digits, `_` or `-`.
 * The path is everything after that colon and must begin with `/`; it may
 * contain further colons and any other characters.
 *
 * @param ref Candidate reference string.
 * @returns The parsed pair, or null when `ref` is not in drive form.
 */
export function parseDriveRef(ref: string): DriveTarget | null {
  // The drive character class excludes ":", so the colon matched here is
  // always the first one in the string. `[\s\S]*` lets the path span
  // newlines.
  const match = /^([a-z][a-z0-9_-]*):(\/[\s\S]*)$/.exec(ref);
  if (match === null) return null;

  const [, drive, path] = match;
  if (drive === undefined || path === undefined) return null;
  return { drive, path };
}
|
|
37
|
+
|
|
38
|
+
/**
 * Render a `(drive, path)` pair in its `drive:path` string form for display
 * and CLI use. The path is appended as-is, so a path beginning with `/`
 * yields the familiar `drive:/path` shape.
 */
export function formatDriveRef(target: DriveTarget): string {
  const { drive, path } = target;
  return drive + ":" + path;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Choose the drive and path under which content fetched from `url` is stored.
 *
 * Resolution order:
 *   1. `google-docs` — host is docs.google.com, or the server hint mentions
 *      both "google" and "doc" — provided a document id can be extracted.
 *   2. `github` — host is github.com or raw.githubusercontent.com, or the
 *      server hint mentions "github" — provided a repository path can be
 *      extracted.
 *   3. `url` — everything else, including strings that do not parse as a
 *      URL; the path is the original string prefixed with `/`.
 *
 * @param url The source URL as given by the caller.
 * @param mcpxServerName Optional name of the MCP server that produced the
 *   content; matched case-insensitively as a hint for the drive.
 * @returns The `(drive, path)` pair for the content.
 */
export function detectDriveFromUrl(
  url: string,
  mcpxServerName?: string | null,
): DriveTarget {
  const genericTarget: DriveTarget = { drive: "url", path: `/${url}` };

  let location: URL;
  try {
    location = new URL(url);
  } catch {
    // Not a parseable URL: store it verbatim on the generic drive.
    return genericTarget;
  }

  const serverHint = (mcpxServerName ?? "").toLowerCase();
  const hostname = location.hostname.toLowerCase();

  const looksLikeGoogleDoc =
    hostname === "docs.google.com" ||
    (serverHint.includes("google") && serverHint.includes("doc"));
  if (looksLikeGoogleDoc) {
    const documentId = extractGoogleDocId(location);
    if (documentId) return { drive: "google-docs", path: `/${documentId}` };
  }

  const looksLikeGithub =
    hostname === "github.com" ||
    hostname === "raw.githubusercontent.com" ||
    serverHint.includes("github");
  if (looksLikeGithub) {
    const repoPath = extractGithubPath(location);
    if (repoPath) return { drive: "github", path: repoPath };
  }

  return genericTarget;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Pull the document id out of a Google Docs style URL, i.e. the path segment
 * that directly follows a `d` segment:
 *
 *   https://docs.google.com/document/d/<docId>/edit
 *   https://docs.google.com/spreadsheets/d/<docId>/edit
 *
 * @returns The id, or null when the path has no `/d/<segment>` part.
 */
function extractGoogleDocId(u: URL): string | null {
  const segments = u.pathname.split("/");
  // Start at 1 so the "d" segment is always preceded by a slash.
  for (let i = 1; i < segments.length - 1; i++) {
    if (segments[i] !== "d") continue;
    const candidate = segments[i + 1];
    // An empty segment means "/d//…"; keep scanning for a later match.
    if (candidate) return candidate;
  }
  return null;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Reduce a GitHub URL to a ref-independent `/<owner>/<repo>[/<file path>]`.
 *
 * Supported shapes:
 *   https://github.com/<owner>/<repo>
 *   https://github.com/<owner>/<repo>/blob/<ref>/<path...>
 *   https://github.com/<owner>/<repo>/tree/<ref>/<path...>
 *   https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path...>
 *
 * `<ref>` may be a single segment (branch, tag or commit) or the
 * fully-qualified form `refs/heads/<branch>` / `refs/tags/<tag>`. Either
 * way the ref is dropped, so both spellings of the same file map to the
 * same path. Any other github.com URL (issues, pulls, …) collapses to
 * `/<owner>/<repo>`.
 *
 * @param u Parsed URL to inspect.
 * @returns The normalised path, or null when the URL has no owner/repo pair.
 */
function extractGithubPath(u: URL): string | null {
  const [owner, repo, ...tail] = u.pathname.split("/").filter(Boolean);
  if (!owner || !repo) return null;

  let fileSegments: string[] = [];
  if (u.hostname === "raw.githubusercontent.com") {
    // tail: <ref>, ...path
    fileSegments = dropGitRef(tail);
  } else if (tail[0] === "blob" || tail[0] === "tree") {
    // tail: blob|tree, <ref>, ...path
    fileSegments = dropGitRef(tail.slice(1));
  }

  return ["", owner, repo, ...fileSegments].join("/");
}

/**
 * Remove the leading git ref from a list of path segments.
 *
 * A ref written as `refs/heads/<name>` or `refs/tags/<name>` spans three
 * segments; anything else is treated as a one-segment ref. Branch or tag
 * names that themselves contain `/` cannot be told apart from directories
 * here, so only their first segment is removed. A branch literally named
 * `refs` whose first directory is `heads` or `tags` would be misread as the
 * qualified form.
 */
function dropGitRef(segments: string[]): string[] {
  const isQualified =
    segments[0] === "refs" &&
    (segments[1] === "heads" || segments[1] === "tags");
  return segments.slice(isQualified ? 3 : 1);
}
|
package/src/context/fetcher.ts
CHANGED
|
@@ -15,6 +15,7 @@ import { mcpSearchTool } from "../tools/mcp/search.ts";
|
|
|
15
15
|
import type { ToolContext } from "../tools/tool.ts";
|
|
16
16
|
import { type AnyToolDefinition, toAnthropicTool } from "../tools/tool.ts";
|
|
17
17
|
import { logger } from "../utils/logger.ts";
|
|
18
|
+
import { detectDriveFromUrl } from "./drives.ts";
|
|
18
19
|
import { stripHtmlTags } from "./url-utils.ts";
|
|
19
20
|
|
|
20
21
|
const MAX_CONTENT_BYTES = 500_000;
|
|
@@ -28,6 +29,8 @@ export interface FetchedContent {
|
|
|
28
29
|
content: string;
|
|
29
30
|
mimeType: string;
|
|
30
31
|
sourceUrl: string;
|
|
32
|
+
drive: string;
|
|
33
|
+
path: string;
|
|
31
34
|
}
|
|
32
35
|
|
|
33
36
|
export class FetchFailureError extends Error {
|
|
@@ -176,7 +179,8 @@ async function runFetcherLoop(
|
|
|
176
179
|
|
|
177
180
|
// Cache of full mcp_exec results keyed by tool_use_id.
|
|
178
181
|
// The LLM only sees a truncated preview; on accept_content it references
|
|
179
|
-
// the id and the harness saves the captured content.
|
|
182
|
+
// the id and the harness saves the captured content. `server` is retained so
|
|
183
|
+
// we can attribute the save to a specific MCP service when routing to a drive.
|
|
180
184
|
const execResults = new Map<
|
|
181
185
|
string,
|
|
182
186
|
{ server: string; tool: string; content: string; mimeType: string }
|
|
@@ -289,11 +293,14 @@ async function runFetcherLoop(
|
|
|
289
293
|
logger.dim(
|
|
290
294
|
` turn ${turn + 1}: accept_content: "${input.title}" (${cached.content.length} chars, ${mimeType}, from ${cached.server}/${cached.tool})`,
|
|
291
295
|
);
|
|
296
|
+
const { drive, path } = detectDriveFromUrl(url, cached.server);
|
|
292
297
|
return {
|
|
293
298
|
title: input.title,
|
|
294
299
|
content: cached.content.slice(0, MAX_CONTENT_BYTES),
|
|
295
300
|
mimeType,
|
|
296
301
|
sourceUrl: url,
|
|
302
|
+
drive,
|
|
303
|
+
path,
|
|
297
304
|
};
|
|
298
305
|
}
|
|
299
306
|
|
|
@@ -428,10 +435,13 @@ export async function httpFallback(url: string): Promise<FetchedContent> {
|
|
|
428
435
|
? "text/markdown"
|
|
429
436
|
: contentType.split(";")[0] || "text/plain";
|
|
430
437
|
|
|
438
|
+
const { drive, path } = detectDriveFromUrl(url);
|
|
431
439
|
return {
|
|
432
440
|
title,
|
|
433
441
|
content: text,
|
|
434
442
|
mimeType,
|
|
435
443
|
sourceUrl: url,
|
|
444
|
+
drive,
|
|
445
|
+
path,
|
|
436
446
|
};
|
|
437
447
|
}
|
package/src/context/ingest.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
2
2
|
import type { DbConnection } from "../db/connection.ts";
|
|
3
|
-
import { getContextItem,
|
|
3
|
+
import { getContextItem, getContextItemById } from "../db/context.ts";
|
|
4
4
|
import { createEmbedding, deleteEmbeddingsForItem } from "../db/embeddings.ts";
|
|
5
5
|
import { logger } from "../utils/logger.ts";
|
|
6
6
|
import { chunk } from "./chunker.ts";
|
|
7
|
+
import { type DriveTarget, formatDriveRef } from "./drives.ts";
|
|
7
8
|
import { embed as defaultEmbed } from "./embedder.ts";
|
|
8
9
|
|
|
9
10
|
type IngestEmbedFn = (texts: string[]) => Promise<number[][]>;
|
|
@@ -12,7 +13,8 @@ export interface PreparedIngestion {
|
|
|
12
13
|
itemId: string;
|
|
13
14
|
title: string;
|
|
14
15
|
description: string;
|
|
15
|
-
|
|
16
|
+
drive: string;
|
|
17
|
+
path: string;
|
|
16
18
|
chunks: { index: number; content: string }[];
|
|
17
19
|
vectors: number[][];
|
|
18
20
|
}
|
|
@@ -27,7 +29,7 @@ export async function prepareIngestion(
|
|
|
27
29
|
config: Required<BotholomewConfig>,
|
|
28
30
|
embedFn?: IngestEmbedFn,
|
|
29
31
|
): Promise<PreparedIngestion | null> {
|
|
30
|
-
const item = await
|
|
32
|
+
const item = await getContextItemById(conn, itemId);
|
|
31
33
|
if (!item) {
|
|
32
34
|
logger.warn(`ingest: context item ${itemId} not found`);
|
|
33
35
|
return null;
|
|
@@ -52,11 +54,12 @@ export async function prepareIngestion(
|
|
|
52
54
|
const chunks = await chunk(item.content, item.mime_type, config);
|
|
53
55
|
if (chunks.length === 0) return null;
|
|
54
56
|
|
|
57
|
+
const ref = formatDriveRef(item);
|
|
55
58
|
const textsForEmbedding = chunks.map((c) => {
|
|
56
59
|
const parts: string[] = [];
|
|
57
60
|
if (item.title) parts.push(`Title: ${item.title}`);
|
|
58
61
|
if (item.description) parts.push(`Description: ${item.description}`);
|
|
59
|
-
|
|
62
|
+
parts.push(`Source: ${ref}`);
|
|
60
63
|
parts.push(c.content);
|
|
61
64
|
return parts.join("\n");
|
|
62
65
|
});
|
|
@@ -66,7 +69,8 @@ export async function prepareIngestion(
|
|
|
66
69
|
itemId,
|
|
67
70
|
title: item.title,
|
|
68
71
|
description: item.description,
|
|
69
|
-
|
|
72
|
+
drive: item.drive,
|
|
73
|
+
path: item.path,
|
|
70
74
|
chunks,
|
|
71
75
|
vectors,
|
|
72
76
|
};
|
|
@@ -102,7 +106,6 @@ export async function storeIngestion(
|
|
|
102
106
|
chunkContent: c.content,
|
|
103
107
|
title: prepared.title,
|
|
104
108
|
description: prepared.description,
|
|
105
|
-
sourcePath: prepared.sourcePath,
|
|
106
109
|
embedding: v,
|
|
107
110
|
});
|
|
108
111
|
}
|
|
@@ -144,17 +147,17 @@ export async function ingestContextItem(
|
|
|
144
147
|
}
|
|
145
148
|
|
|
146
149
|
/**
|
|
147
|
-
* Ingest a context item by its
|
|
150
|
+
* Ingest a context item by its (drive, path) pair.
|
|
148
151
|
*/
|
|
149
152
|
export async function ingestByPath(
|
|
150
153
|
conn: DbConnection,
|
|
151
|
-
|
|
154
|
+
target: DriveTarget,
|
|
152
155
|
config: Required<BotholomewConfig>,
|
|
153
156
|
embedFn?: IngestEmbedFn,
|
|
154
157
|
): Promise<number> {
|
|
155
|
-
const item = await
|
|
158
|
+
const item = await getContextItem(conn, target);
|
|
156
159
|
if (!item) {
|
|
157
|
-
logger.warn(`ingest: no item at
|
|
160
|
+
logger.warn(`ingest: no item at ${formatDriveRef(target)}`);
|
|
158
161
|
return 0;
|
|
159
162
|
}
|
|
160
163
|
return ingestContextItem(conn, item.id, config, embedFn);
|