botholomew 0.6.3 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,11 +8,13 @@ import { loadConfig } from "../config/loader.ts";
8
8
  import type { BotholomewConfig } from "../config/schemas.ts";
9
9
  import { generateDescription } from "../context/describer.ts";
10
10
  import { embedSingle } from "../context/embedder.ts";
11
+ import { FetchFailureError, fetchUrl } from "../context/fetcher.ts";
11
12
  import {
12
13
  type PreparedIngestion,
13
14
  prepareIngestion,
14
15
  storeIngestion,
15
16
  } from "../context/ingest.ts";
17
+ import { isUrl, urlToContextPath } from "../context/url-utils.ts";
16
18
  import type { DbConnection } from "../db/connection.ts";
17
19
  import {
18
20
  type ContextItem,
@@ -24,6 +26,7 @@ import {
24
26
  upsertContextItem,
25
27
  } from "../db/context.ts";
26
28
  import { getEmbeddingsForItem, hybridSearch } from "../db/embeddings.ts";
29
+ import { createMcpxClient } from "../mcpx/client.ts";
27
30
  import { logger } from "../utils/logger.ts";
28
31
  import {
29
32
  registerContextToolSubcommands,
@@ -63,7 +66,7 @@ export function registerContextCommand(program: Command) {
63
66
  return;
64
67
  }
65
68
 
66
- const header = `${ansis.bold("Path".padEnd(35))} ${"Title".padEnd(20)} ${"Description".padEnd(30)} ${"Type".padEnd(15)} ${"Updated".padEnd(18)} Indexed`;
69
+ const header = `${ansis.bold("ID".padEnd(36))} ${ansis.bold("Path".padEnd(35))} ${"Title".padEnd(20)} ${"Description".padEnd(30)} ${"Source".padEnd(6)} ${"Type".padEnd(15)} ${"Updated".padEnd(18)} Indexed`;
67
70
  console.log(header);
68
71
  console.log("-".repeat(header.length));
69
72
 
@@ -75,8 +78,13 @@ export function registerContextCommand(program: Command) {
75
78
  const desc = item.description
76
79
  ? ansis.dim(item.description.slice(0, 29).padEnd(30))
77
80
  : ansis.dim("".padEnd(30));
81
+ const source =
82
+ item.source_type === "url"
83
+ ? ansis.cyan("url".padEnd(6))
84
+ : ansis.dim("file".padEnd(6));
85
+ const id = ansis.dim(item.id.padEnd(36));
78
86
  console.log(
79
- `${item.context_path.slice(0, 34).padEnd(35)} ${item.title.slice(0, 19).padEnd(20)} ${desc} ${item.mime_type.slice(0, 14).padEnd(15)} ${updated} ${indexed}`,
87
+ `${id} ${item.context_path.slice(0, 34).padEnd(35)} ${item.title.slice(0, 19).padEnd(20)} ${desc} ${source} ${item.mime_type.slice(0, 14).padEnd(15)} ${updated} ${indexed}`,
80
88
  );
81
89
  }
82
90
 
@@ -86,87 +94,178 @@ export function registerContextCommand(program: Command) {
86
94
 
87
95
  ctx
88
96
  .command("add <paths...>")
89
- .description("Add files or directories to context")
97
+ .description("Add files, directories, or URLs to context")
90
98
  .option("--prefix <prefix>", "virtual path prefix", "/")
99
+ .option("--name <path>", "custom context path (single URL only)")
100
+ .option(
101
+ "--prompt-addition <text>",
102
+ "extra guidance for the URL fetcher agent (e.g., auth notes, tool hints)",
103
+ )
91
104
  .action((paths: string[], opts) =>
92
105
  withDb(program, async (conn, dir) => {
93
- // Phase 1: Scan all paths and validate they exist
106
+ // Phase 1: Scan all paths and separate URLs from local files
94
107
  const filesToAdd: { filePath: string; contextPath: string }[] = [];
95
- const spinner = createSpinner("Scanning files...").start();
108
+ const urlsToAdd: { url: string; contextPath: string }[] = [];
109
+ const spinner = createSpinner("Scanning paths...").start();
110
+
111
+ // Validate --name: only valid with a single URL
112
+ if (opts.name && (paths.length > 1 || !paths[0] || !isUrl(paths[0]))) {
113
+ spinner.error({
114
+ text: "--name can only be used with a single URL",
115
+ });
116
+ process.exit(1);
117
+ }
96
118
 
97
119
  for (const path of paths) {
98
- const resolvedPath = resolve(path);
99
- let info: Awaited<ReturnType<typeof stat>>;
100
- try {
101
- info = await stat(resolvedPath);
102
- } catch {
103
- spinner.error({ text: `Path not found: ${resolvedPath}` });
104
- process.exit(1);
105
- }
120
+ if (isUrl(path)) {
121
+ const contextPath =
122
+ opts.name || urlToContextPath(path, opts.prefix);
123
+ urlsToAdd.push({ url: path, contextPath });
124
+ } else {
125
+ const resolvedPath = resolve(path);
126
+ let info: Awaited<ReturnType<typeof stat>>;
127
+ try {
128
+ info = await stat(resolvedPath);
129
+ } catch {
130
+ spinner.error({ text: `Path not found: ${resolvedPath}` });
131
+ process.exit(1);
132
+ }
106
133
 
107
- if (info.isDirectory()) {
108
- const entries = await walkDirectory(resolvedPath);
109
- for (const filePath of entries) {
110
- const relativePath = filePath.slice(resolvedPath.length);
134
+ if (info.isDirectory()) {
135
+ const entries = await walkDirectory(resolvedPath);
136
+ for (const filePath of entries) {
137
+ const relativePath = filePath.slice(resolvedPath.length);
138
+ filesToAdd.push({
139
+ filePath,
140
+ contextPath: join(opts.prefix, relativePath),
141
+ });
142
+ }
143
+ } else {
111
144
  filesToAdd.push({
112
- filePath,
113
- contextPath: join(opts.prefix, relativePath),
145
+ filePath: resolvedPath,
146
+ contextPath: join(opts.prefix, basename(resolvedPath)),
114
147
  });
115
148
  }
116
- } else {
117
- filesToAdd.push({
118
- filePath: resolvedPath,
119
- contextPath: join(opts.prefix, basename(resolvedPath)),
120
- });
121
149
  }
122
150
  }
123
151
 
152
+ const totalCount = filesToAdd.length + urlsToAdd.length;
124
153
  spinner.success({
125
- text: `Found ${filesToAdd.length} file(s) to add.`,
154
+ text: `Found ${totalCount} item(s) to add (${filesToAdd.length} file(s), ${urlsToAdd.length} URL(s)).`,
126
155
  });
127
156
 
128
157
  // Phase 2: Load config and upsert DB records (batched, parallel LLM descriptions)
129
158
  const config = await loadConfig(dir);
130
159
  const CONCURRENCY = 10;
131
160
  let addCompleted = 0;
132
- const upsertSpinner = createSpinner(
133
- `Adding and describing 0/${filesToAdd.length} files...`,
134
- ).start();
135
161
  const itemIds: { id: string; contextPath: string }[] = [];
136
162
 
137
- for (let i = 0; i < filesToAdd.length; i += CONCURRENCY) {
138
- const batch = filesToAdd.slice(i, i + CONCURRENCY);
139
- const results = await Promise.all(
140
- batch.map(async ({ filePath, contextPath }) => {
141
- const result = await addFile(conn, filePath, contextPath, config);
142
- addCompleted++;
143
- upsertSpinner.update({
144
- text: `Adding and describing ${addCompleted}/${filesToAdd.length} files...`,
145
- });
146
- return result ? { id: result, contextPath } : null;
147
- }),
148
- );
149
- for (const r of results) {
150
- if (r) itemIds.push(r);
163
+ // Process local files (with spinner — these are quick, no chatty logs)
164
+ if (filesToAdd.length > 0) {
165
+ const fileSpinner = createSpinner(
166
+ `Adding and describing 0/${filesToAdd.length} file(s)...`,
167
+ ).start();
168
+
169
+ for (let i = 0; i < filesToAdd.length; i += CONCURRENCY) {
170
+ const batch = filesToAdd.slice(i, i + CONCURRENCY);
171
+ const results = await Promise.all(
172
+ batch.map(async ({ filePath, contextPath }) => {
173
+ const result = await addFile(
174
+ conn,
175
+ filePath,
176
+ contextPath,
177
+ config,
178
+ );
179
+ addCompleted++;
180
+ fileSpinner.update({
181
+ text: `Adding and describing ${addCompleted}/${filesToAdd.length} file(s)...`,
182
+ });
183
+ return result ? { id: result, contextPath } : null;
184
+ }),
185
+ );
186
+ for (const r of results) {
187
+ if (r) itemIds.push(r);
188
+ }
151
189
  }
190
+
191
+ fileSpinner.success({
192
+ text: `Added and described ${addCompleted} file(s).`,
193
+ });
152
194
  }
153
195
 
154
- upsertSpinner.success({
155
- text: `Added and described ${itemIds.length} file(s).`,
156
- });
196
+ // Process URLs (no spinner — agent logs would interleave; render cleanly instead)
197
+ if (urlsToAdd.length > 0) {
198
+ const mcpxClient = await createMcpxClient(dir);
199
+ if (!mcpxClient) {
200
+ logger.dim(
201
+ "No MCP servers configured — remote fetches will use basic HTTP.",
202
+ );
203
+ }
204
+
205
+ let urlIdx = 0;
206
+ let urlAdded = 0;
207
+ for (const { url, contextPath } of urlsToAdd) {
208
+ urlIdx++;
209
+ console.log(
210
+ `\n${ansis.bold(`[${urlIdx}/${urlsToAdd.length}]`)} ${ansis.cyan(url)}`,
211
+ );
212
+ const result = await addUrl(
213
+ conn,
214
+ config,
215
+ url,
216
+ contextPath,
217
+ mcpxClient,
218
+ opts.promptAddition,
219
+ );
220
+ if (result.ok) {
221
+ urlAdded++;
222
+ itemIds.push({ id: result.id, contextPath });
223
+ console.log(` ${ansis.green("✔")} stored at ${contextPath}`);
224
+ } else if (result.actionable) {
225
+ console.log(
226
+ ` ${ansis.red("✗")} ${ansis.bold("action required:")}`,
227
+ );
228
+ for (const line of result.error.split("\n")) {
229
+ console.log(` ${ansis.yellow(line)}`);
230
+ }
231
+ } else {
232
+ console.log(
233
+ ` ${ansis.red("✗")} failed to fetch: ${result.error}`,
234
+ );
235
+ }
236
+ }
237
+
238
+ const urlSummary = `Added ${urlAdded}/${urlsToAdd.length} URL(s).`;
239
+ if (urlAdded === urlsToAdd.length) {
240
+ console.log(`\n${ansis.green("✔")} ${urlSummary}`);
241
+ } else if (urlAdded === 0) {
242
+ console.log(`\n${ansis.red("✗")} ${urlSummary}`);
243
+ } else {
244
+ console.log(`\n${ansis.yellow("⚠")} ${urlSummary}`);
245
+ }
246
+ }
157
247
 
158
248
  // Phase 3: Chunk + embed in parallel (network I/O)
159
249
  if (itemIds.length === 0 || !config.openai_api_key) {
160
250
  if (!config.openai_api_key) {
161
251
  logger.dim("Skipping embeddings (no OpenAI API key configured).");
162
252
  }
163
- logger.success(`Added ${itemIds.length} file(s), 0 chunks indexed.`);
164
- process.exit(0);
253
+ const msg = `Added ${itemIds.length}/${totalCount} item(s), 0 chunks indexed.`;
254
+ if (itemIds.length === totalCount) {
255
+ logger.success(msg);
256
+ process.exit(0);
257
+ } else if (itemIds.length === 0) {
258
+ logger.error(msg);
259
+ process.exit(1);
260
+ } else {
261
+ logger.warn(msg);
262
+ process.exit(1);
263
+ }
165
264
  }
166
265
 
167
266
  let completed = 0;
168
267
  const embedSpinner = createSpinner(
169
- `Embedding 0/${itemIds.length} files...`,
268
+ `Embedding 0/${itemIds.length} items...`,
170
269
  ).start();
171
270
 
172
271
  const prepared: PreparedIngestion[] = [];
@@ -177,7 +276,7 @@ export function registerContextCommand(program: Command) {
177
276
  const result = await prepareIngestion(conn, id, config);
178
277
  completed++;
179
278
  embedSpinner.update({
180
- text: `Embedding ${completed}/${itemIds.length} files...`,
279
+ text: `Embedding ${completed}/${itemIds.length} items...`,
181
280
  });
182
281
  return result;
183
282
  }),
@@ -187,7 +286,7 @@ export function registerContextCommand(program: Command) {
187
286
  }
188
287
  }
189
288
  embedSpinner.success({
190
- text: `Embedded ${prepared.length} file(s).`,
289
+ text: `Embedded ${prepared.length} item(s).`,
191
290
  });
192
291
 
193
292
  // Phase 4: Store embeddings (sequential, fast DB writes)
@@ -204,8 +303,14 @@ export function registerContextCommand(program: Command) {
204
303
  const parts: string[] = [];
205
304
  if (filesAdded > 0) parts.push(`${filesAdded} added`);
206
305
  if (filesUpdated > 0) parts.push(`${filesUpdated} updated`);
207
- logger.success(`${parts.join(", ")} — ${chunks} chunk(s) indexed.`);
208
- process.exit(0);
306
+ const summary = `${parts.join(", ")} — ${chunks} chunk(s) indexed (${itemIds.length}/${totalCount} item(s)).`;
307
+ if (itemIds.length === totalCount) {
308
+ logger.success(summary);
309
+ process.exit(0);
310
+ } else {
311
+ logger.warn(summary);
312
+ process.exit(1);
313
+ }
209
314
  }),
210
315
  );
211
316
 
@@ -310,7 +415,9 @@ export function registerContextCommand(program: Command) {
310
415
 
311
416
  ctx
312
417
  .command("refresh [path]")
313
- .description("Re-import files from disk and re-embed if content changed")
418
+ .description(
419
+ "Re-import files from disk / re-fetch URLs and re-embed if content changed",
420
+ )
314
421
  .option("--all", "refresh all items with a source path")
315
422
  .action((path: string | undefined, opts: { all?: boolean }) =>
316
423
  withDb(program, async (conn, dir) => {
@@ -333,7 +440,11 @@ export function registerContextCommand(program: Command) {
333
440
 
334
441
  const config = await loadConfig(dir);
335
442
 
336
- // Phase 1: Read files from disk, compare, and update DB
443
+ // Init MCPX client if any URL items need refreshing
444
+ const hasUrls = sourced.some((i) => i.source_type === "url");
445
+ const mcpxClient = hasUrls ? await createMcpxClient(dir) : null;
446
+
447
+ // Phase 1: Read files / fetch URLs, compare, and update DB
337
448
  const spinner = createSpinner(
338
449
  `Refreshing 0/${sourced.length} items...`,
339
450
  ).start();
@@ -348,13 +459,21 @@ export function registerContextCommand(program: Command) {
348
459
  });
349
460
  try {
350
461
  const sourcePath = item.source_path as string;
351
- const bunFile = Bun.file(sourcePath);
352
- if (!(await bunFile.exists())) {
353
- missing++;
354
- logger.warn(` Missing: ${item.source_path}`);
355
- continue;
462
+ let content: string;
463
+
464
+ if (item.source_type === "url") {
465
+ const fetched = await fetchUrl(sourcePath, config, mcpxClient);
466
+ content = fetched.content;
467
+ } else {
468
+ const bunFile = Bun.file(sourcePath);
469
+ if (!(await bunFile.exists())) {
470
+ missing++;
471
+ logger.warn(` Missing: ${item.source_path}`);
472
+ continue;
473
+ }
474
+ content = await bunFile.text();
356
475
  }
357
- const content = await bunFile.text();
476
+
358
477
  if (content === item.content) {
359
478
  unchanged++;
360
479
  continue;
@@ -363,11 +482,11 @@ export function registerContextCommand(program: Command) {
363
482
  updated++;
364
483
  toReembed.push(item.id);
365
484
  } catch (err) {
366
- logger.warn(` Error reading ${item.source_path}: ${err}`);
485
+ logger.warn(` Error refreshing ${item.source_path}: ${err}`);
367
486
  }
368
487
  }
369
488
  spinner.success({
370
- text: `Checked ${sourced.length} file(s): ${updated} updated, ${unchanged} unchanged, ${missing} missing.`,
489
+ text: `Checked ${sourced.length} item(s): ${updated} updated, ${unchanged} unchanged, ${missing} missing.`,
371
490
  });
372
491
 
373
492
  // Phase 2: Re-embed changed items
@@ -381,7 +500,7 @@ export function registerContextCommand(program: Command) {
381
500
  const CONCURRENCY = 10;
382
501
  let completed = 0;
383
502
  const embedSpinner = createSpinner(
384
- `Embedding 0/${toReembed.length} files...`,
503
+ `Embedding 0/${toReembed.length} item(s)...`,
385
504
  ).start();
386
505
 
387
506
  const prepared: PreparedIngestion[] = [];
@@ -392,7 +511,7 @@ export function registerContextCommand(program: Command) {
392
511
  const result = await prepareIngestion(conn, id, config);
393
512
  completed++;
394
513
  embedSpinner.update({
395
- text: `Embedding ${completed}/${toReembed.length} files...`,
514
+ text: `Embedding ${completed}/${toReembed.length} item(s)...`,
396
515
  });
397
516
  return result;
398
517
  }),
@@ -402,7 +521,7 @@ export function registerContextCommand(program: Command) {
402
521
  }
403
522
  }
404
523
  embedSpinner.success({
405
- text: `Embedded ${prepared.length} file(s).`,
524
+ text: `Embedded ${prepared.length} item(s).`,
406
525
  });
407
526
 
408
527
  let chunks = 0;
@@ -412,7 +531,7 @@ export function registerContextCommand(program: Command) {
412
531
  }
413
532
 
414
533
  logger.success(
415
- `Refreshed ${updated} file(s), ${chunks} chunk(s) re-indexed.`,
534
+ `Refreshed ${updated} item(s), ${chunks} chunk(s) re-indexed.`,
416
535
  );
417
536
  }),
418
537
  );
@@ -476,6 +595,48 @@ async function addFile(
476
595
  }
477
596
  }
478
597
 
598
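+ /** Result of fetching a URL and upserting it into context via addUrl: the stored item's ID on success, or an error message on failure (flagged actionable when it comes from a FetchFailureError). */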
599
+ type AddUrlResult =
600
+ | { ok: true; id: string }
601
+ | { ok: false; error: string; actionable: boolean };
602
+
603
+ async function addUrl(
604
+ conn: DbConnection,
605
+ config: Required<BotholomewConfig>,
606
+ url: string,
607
+ contextPath: string,
608
+ mcpxClient: Awaited<ReturnType<typeof createMcpxClient>>,
609
+ promptAddition?: string,
610
+ ): Promise<AddUrlResult> {
611
+ try {
612
+ const fetched = await fetchUrl(url, config, mcpxClient, promptAddition);
613
+
614
+ const description = await generateDescription(config, {
615
+ filename: new URL(url).hostname,
616
+ mimeType: fetched.mimeType,
617
+ content: fetched.content,
618
+ });
619
+
620
+ const item = await upsertContextItem(conn, {
621
+ title: fetched.title,
622
+ description,
623
+ content: fetched.content,
624
+ mimeType: fetched.mimeType,
625
+ sourceType: "url",
626
+ sourcePath: url,
627
+ contextPath,
628
+ isTextual: true,
629
+ });
630
+
631
+ return { ok: true, id: item.id };
632
+ } catch (err) {
633
+ if (err instanceof FetchFailureError) {
634
+ return { ok: false, error: err.userMessage, actionable: true };
635
+ }
636
+ return { ok: false, error: String(err), actionable: false };
637
+ }
638
+ }
639
+
479
640
  async function walkDirectory(dirPath: string): Promise<string[]> {
480
641
  const files: string[] = [];
481
642
  const entries = await readdir(dirPath, { withFileTypes: true });
@@ -9,6 +9,13 @@ export interface Chunk {
9
9
  const SHORT_CONTENT_THRESHOLD = 200;
10
10
  const LLM_TIMEOUT_MS = 10_000;
11
11
  const DEFAULT_OVERLAP_LINES = 2;
12
+ // OpenAI's embedding endpoint caps inputs at 8192 tokens. The cl100k_base
13
+ // tokenizer averages ~4 chars/token on plain English but can drop to ~2
14
+ // chars/token on dense/code/non-ASCII content. We cap at 15k chars so even
15
+ // at a dense ~2.5 chars/token (~6k tokens) we stay well under the
16
+ // 8192-token limit, leaving headroom for the title/description prefix
17
+ // prepended at embed time.
18
+ const MAX_CHUNK_CHARS = 15_000;
12
19
 
13
20
  const CHUNKER_TOOL_NAME = "return_chunks";
14
21
  const CHUNKER_TOOL = {
@@ -41,6 +48,90 @@ const CHUNKER_TOOL = {
41
48
  },
42
49
  };
43
50
 
51
+ /**
52
+ * Split text into pieces no larger than `maxChars`, preferring paragraph,
53
+ * line, and finally hard-character boundaries.
54
+ */
55
+ function splitText(text: string, maxChars: number): string[] {
56
+ if (text.length <= maxChars) return [text];
57
+
58
+ // Try paragraph splits first.
59
+ const paragraphs = text.split(/\n\n+/);
60
+ if (paragraphs.length > 1) {
61
+ const out: string[] = [];
62
+ let buf = "";
63
+ for (const p of paragraphs) {
64
+ const candidate = buf ? `${buf}\n\n${p}` : p;
65
+ if (candidate.length <= maxChars) {
66
+ buf = candidate;
67
+ } else {
68
+ if (buf) out.push(buf);
69
+ if (p.length <= maxChars) {
70
+ buf = p;
71
+ } else {
72
+ out.push(...splitText(p, maxChars));
73
+ buf = "";
74
+ }
75
+ }
76
+ }
77
+ if (buf) out.push(buf);
78
+ return out;
79
+ }
80
+
81
+ // Fall back to line splits.
82
+ const lines = text.split("\n");
83
+ if (lines.length > 1) {
84
+ const out: string[] = [];
85
+ let buf = "";
86
+ for (const line of lines) {
87
+ const candidate = buf ? `${buf}\n${line}` : line;
88
+ if (candidate.length <= maxChars) {
89
+ buf = candidate;
90
+ } else {
91
+ if (buf) out.push(buf);
92
+ if (line.length <= maxChars) {
93
+ buf = line;
94
+ } else {
95
+ // Single line longer than maxChars — slice it.
96
+ for (let i = 0; i < line.length; i += maxChars) {
97
+ out.push(line.slice(i, i + maxChars));
98
+ }
99
+ buf = "";
100
+ }
101
+ }
102
+ }
103
+ if (buf) out.push(buf);
104
+ return out;
105
+ }
106
+
107
+ // Last resort: hard slice.
108
+ const out: string[] = [];
109
+ for (let i = 0; i < text.length; i += maxChars) {
110
+ out.push(text.slice(i, i + maxChars));
111
+ }
112
+ return out;
113
+ }
114
+
115
+ /**
116
+ * Re-chunk any chunks larger than `maxChars`, preserving order and reindexing.
117
+ */
118
+ export function enforceMaxChunkSize(
119
+ chunks: Chunk[],
120
+ maxChars = MAX_CHUNK_CHARS,
121
+ ): Chunk[] {
122
+ const out: Chunk[] = [];
123
+ for (const c of chunks) {
124
+ if (c.content.length <= maxChars) {
125
+ out.push({ index: out.length, content: c.content });
126
+ continue;
127
+ }
128
+ for (const piece of splitText(c.content, maxChars)) {
129
+ out.push({ index: out.length, content: piece });
130
+ }
131
+ }
132
+ return out;
133
+ }
134
+
44
135
  /**
45
136
  * Add overlapping lines from the end of each chunk to the start of the next.
46
137
  * Improves retrieval when concepts span chunk boundaries.
@@ -137,5 +228,11 @@ export async function chunk(
137
228
  }
138
229
 
139
230
  const chunks = await chunkWithLLM(content, mimeType, config);
140
- return addOverlapToChunks(chunks);
231
+ // Enforce a hard size cap before AND after overlap. The first pass handles
232
+ // oversize chunks from the LLM (common for docs with very long lines); the
233
+ // second pass handles the rare case where added overlap pushes a near-limit
234
+ // chunk over.
235
+ const sized = enforceMaxChunkSize(chunks);
236
+ const withOverlap = addOverlapToChunks(sized);
237
+ return enforceMaxChunkSize(withOverlap);
141
238
  }