botholomew 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/package.json +2 -2
  2. package/src/chat/session.ts +2 -2
  3. package/src/commands/context.ts +53 -42
  4. package/src/commands/daemon.ts +1 -1
  5. package/src/commands/schedule.ts +1 -1
  6. package/src/commands/task.ts +2 -1
  7. package/src/commands/thread.ts +6 -40
  8. package/src/commands/with-db.ts +2 -2
  9. package/src/constants.ts +1 -1
  10. package/src/context/chunker.ts +23 -46
  11. package/src/context/describer.ts +146 -0
  12. package/src/context/ingest.ts +27 -25
  13. package/src/daemon/index.ts +51 -5
  14. package/src/daemon/llm.ts +90 -13
  15. package/src/daemon/prompt.ts +3 -4
  16. package/src/daemon/schedules.ts +7 -1
  17. package/src/daemon/tick.ts +17 -5
  18. package/src/db/connection.ts +102 -40
  19. package/src/db/context.ts +120 -94
  20. package/src/db/embeddings.ts +55 -77
  21. package/src/db/query.ts +11 -0
  22. package/src/db/schedules.ts +27 -28
  23. package/src/db/schema.ts +9 -9
  24. package/src/db/sql/1-core_tables.sql +11 -11
  25. package/src/db/sql/2-logging_tables.sql +3 -3
  26. package/src/db/sql/3-daemon_state.sql +2 -2
  27. package/src/db/sql/6-vss_index.sql +1 -0
  28. package/src/db/sql/7-drop_embeddings_fk.sql +24 -0
  29. package/src/db/sql/8-task_output.sql +1 -0
  30. package/src/db/tasks.ts +89 -78
  31. package/src/db/threads.ts +52 -41
  32. package/src/init/index.ts +2 -2
  33. package/src/tools/file/move.ts +5 -3
  34. package/src/tools/file/write.ts +2 -30
  35. package/src/tools/search/semantic.ts +7 -4
  36. package/src/tools/task/list.ts +2 -0
  37. package/src/tools/task/view.ts +2 -0
  38. package/src/tui/App.tsx +20 -3
  39. package/src/tui/components/SchedulePanel.tsx +389 -0
  40. package/src/tui/components/TabBar.tsx +3 -2
  41. package/src/tui/components/TaskPanel.tsx +6 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "botholomew",
3
- "version": "0.5.0",
3
+ "version": "0.6.1",
4
4
  "description": "An AI agent for knowledge work",
5
5
  "type": "module",
6
6
  "bin": {
@@ -23,8 +23,8 @@
23
23
  },
24
24
  "dependencies": {
25
25
  "@anthropic-ai/sdk": "^0.88.0",
26
+ "@duckdb/node-api": "^1.5.2-r.1",
26
27
  "@evantahler/mcpx": "0.18.3",
27
- "@sqliteai/sqlite-vector": "^0.9.95",
28
28
  "ansis": "^4.2.0",
29
29
  "commander": "^14.0.0",
30
30
  "gray-matter": "^4.0.3",
package/src/chat/session.ts CHANGED
@@ -44,8 +44,8 @@ export async function startChatSession(
44
44
  );
45
45
  }
46
46
 
47
- const conn = getConnection(getDbPath(projectDir));
48
- migrate(conn);
47
+ const conn = await getConnection(getDbPath(projectDir));
48
+ await migrate(conn);
49
49
 
50
50
  let threadId: string;
51
51
  const messages: MessageParam[] = [];
package/src/commands/context.ts CHANGED
@@ -5,6 +5,8 @@ import type { Command } from "commander";
5
5
  import { isText } from "istextorbinary";
6
6
  import { createSpinner } from "nanospinner";
7
7
  import { loadConfig } from "../config/loader.ts";
8
+ import type { BotholomewConfig } from "../config/schemas.ts";
9
+ import { generateDescription } from "../context/describer.ts";
8
10
  import { embedSingle } from "../context/embedder.ts";
9
11
  import {
10
12
  type PreparedIngestion,
@@ -14,18 +16,14 @@ import {
14
16
  import type { DbConnection } from "../db/connection.ts";
15
17
  import {
16
18
  type ContextItem,
17
- createContextItem,
18
19
  deleteContextItemByPath,
19
20
  getContextItemByPath,
20
21
  listContextItems,
21
22
  listContextItemsByPrefix,
22
23
  updateContextItem,
24
+ upsertContextItem,
23
25
  } from "../db/context.ts";
24
- import {
25
- getEmbeddingsForItem,
26
- hybridSearch,
27
- initVectorSearch,
28
- } from "../db/embeddings.ts";
26
+ import { getEmbeddingsForItem, hybridSearch } from "../db/embeddings.ts";
29
27
  import { logger } from "../utils/logger.ts";
30
28
  import { withDb } from "./with-db.ts";
31
29
 
@@ -61,7 +59,7 @@ export function registerContextCommand(program: Command) {
61
59
  return;
62
60
  }
63
61
 
64
- const header = `${ansis.bold("Path".padEnd(40))} ${"Title".padEnd(25)} ${"Type".padEnd(20)} ${"Updated".padEnd(18)} Indexed`;
62
+ const header = `${ansis.bold("Path".padEnd(35))} ${"Title".padEnd(20)} ${"Description".padEnd(30)} ${"Type".padEnd(15)} ${"Updated".padEnd(18)} Indexed`;
65
63
  console.log(header);
66
64
  console.log("-".repeat(header.length));
67
65
 
@@ -70,8 +68,11 @@ export function registerContextCommand(program: Command) {
70
68
  ? ansis.green("yes")
71
69
  : ansis.dim("no");
72
70
  const updated = ansis.dim(fmtDate(item.updated_at).padEnd(18));
71
+ const desc = item.description
72
+ ? ansis.dim(item.description.slice(0, 29).padEnd(30))
73
+ : ansis.dim("".padEnd(30));
73
74
  console.log(
74
- `${item.context_path.padEnd(40)} ${item.title.slice(0, 24).padEnd(25)} ${item.mime_type.slice(0, 19).padEnd(20)} ${updated} ${indexed}`,
75
+ `${item.context_path.slice(0, 34).padEnd(35)} ${item.title.slice(0, 19).padEnd(20)} ${desc} ${item.mime_type.slice(0, 14).padEnd(15)} ${updated} ${indexed}`,
75
76
  );
76
77
  }
77
78
 
@@ -91,6 +92,7 @@ export function registerContextCommand(program: Command) {
91
92
  }
92
93
 
93
94
  console.log(ansis.bold(item.title));
95
+ if (item.description) console.log(` Description: ${item.description}`);
94
96
  console.log(` Path: ${item.context_path}`);
95
97
  console.log(` MIME type: ${item.mime_type}`);
96
98
  if (item.source_path) console.log(` Source: ${item.source_path}`);
@@ -150,18 +152,34 @@ export function registerContextCommand(program: Command) {
150
152
  text: `Found ${filesToAdd.length} file(s) to add.`,
151
153
  });
152
154
 
153
- // Phase 2: Load config and upsert DB records (sequential, fast)
155
+ // Phase 2: Load config and upsert DB records (batched, parallel LLM descriptions)
154
156
  const config = await loadConfig(dir);
157
+ const CONCURRENCY = 10;
158
+ let addCompleted = 0;
155
159
  const upsertSpinner = createSpinner(
156
- "Adding files to database...",
160
+ `Adding and describing 0/${filesToAdd.length} files...`,
157
161
  ).start();
158
162
  const itemIds: { id: string; contextPath: string }[] = [];
159
- for (const { filePath, contextPath } of filesToAdd) {
160
- const result = await addFile(conn, filePath, contextPath);
161
- if (result) itemIds.push({ id: result, contextPath });
163
+
164
+ for (let i = 0; i < filesToAdd.length; i += CONCURRENCY) {
165
+ const batch = filesToAdd.slice(i, i + CONCURRENCY);
166
+ const results = await Promise.all(
167
+ batch.map(async ({ filePath, contextPath }) => {
168
+ const result = await addFile(conn, filePath, contextPath, config);
169
+ addCompleted++;
170
+ upsertSpinner.update({
171
+ text: `Adding and describing ${addCompleted}/${filesToAdd.length} files...`,
172
+ });
173
+ return result ? { id: result, contextPath } : null;
174
+ }),
175
+ );
176
+ for (const r of results) {
177
+ if (r) itemIds.push(r);
178
+ }
162
179
  }
180
+
163
181
  upsertSpinner.success({
164
- text: `Added ${itemIds.length} file(s) to database.`,
182
+ text: `Added and described ${itemIds.length} file(s).`,
165
183
  });
166
184
 
167
185
  // Phase 3: Chunk + embed in parallel (network I/O)
@@ -173,7 +191,6 @@ export function registerContextCommand(program: Command) {
173
191
  process.exit(0);
174
192
  }
175
193
 
176
- const CONCURRENCY = 10;
177
194
  let completed = 0;
178
195
  const embedSpinner = createSpinner(
179
196
  `Embedding 0/${itemIds.length} files...`,
@@ -205,7 +222,7 @@ export function registerContextCommand(program: Command) {
205
222
  let filesAdded = 0;
206
223
  let filesUpdated = 0;
207
224
  for (const p of prepared) {
208
- const result = storeIngestion(conn, p);
225
+ const result = await storeIngestion(conn, p);
209
226
  chunks += result.chunks;
210
227
  if (result.isUpdate) filesUpdated++;
211
228
  else filesAdded++;
@@ -226,9 +243,8 @@ export function registerContextCommand(program: Command) {
226
243
  .action((query, opts) =>
227
244
  withDb(program, async (conn, dir) => {
228
245
  const config = await loadConfig(dir);
229
- initVectorSearch(conn);
230
246
  const queryVec = await embedSingle(query, config);
231
- const results = hybridSearch(conn, query, queryVec, opts.topK);
247
+ const results = await hybridSearch(conn, query, queryVec, opts.topK);
232
248
 
233
249
  if (results.length === 0) {
234
250
  logger.dim("No results found.");
@@ -280,7 +296,7 @@ export function registerContextCommand(program: Command) {
280
296
  return;
281
297
  }
282
298
 
283
- const embeddings = getEmbeddingsForItem(conn, item.id);
299
+ const embeddings = await getEmbeddingsForItem(conn, item.id);
284
300
 
285
301
  console.log(ansis.bold(item.title));
286
302
  console.log(` Path: ${item.context_path}`);
@@ -411,7 +427,7 @@ export function registerContextCommand(program: Command) {
411
427
 
412
428
  let chunks = 0;
413
429
  for (const p of prepared) {
414
- const result = storeIngestion(conn, p);
430
+ const result = await storeIngestion(conn, p);
415
431
  chunks += result.chunks;
416
432
  }
417
433
 
@@ -443,36 +459,31 @@ async function addFile(
443
459
  conn: DbConnection,
444
460
  filePath: string,
445
461
  contextPath: string,
462
+ config: Required<BotholomewConfig>,
446
463
  ): Promise<string | null> {
447
464
  try {
448
465
  const bunFile = Bun.file(filePath);
449
466
  const mimeType = bunFile.type.split(";")[0] || "application/octet-stream";
450
467
  const filename = basename(filePath);
451
468
  const textual = isText(filename) !== false;
452
-
453
469
  const content = textual ? await bunFile.text() : null;
454
470
 
455
- const existing = await getContextItemByPath(conn, contextPath);
456
- let item: ContextItem;
457
-
458
- if (existing) {
459
- const updated = await updateContextItem(conn, existing.id, {
460
- title: filename,
461
- content: content ?? undefined,
462
- mime_type: mimeType,
463
- });
464
- if (!updated) throw new Error(`Failed to update: ${contextPath}`);
465
- item = updated;
466
- } else {
467
- item = await createContextItem(conn, {
468
- title: filename,
469
- content: content ?? undefined,
470
- mimeType,
471
- sourcePath: filePath,
472
- contextPath,
473
- isTextual: textual,
474
- });
475
- }
471
+ const description = await generateDescription(config, {
472
+ filename,
473
+ mimeType,
474
+ content,
475
+ filePath,
476
+ });
477
+
478
+ const item = await upsertContextItem(conn, {
479
+ title: filename,
480
+ description,
481
+ content: content ?? undefined,
482
+ mimeType,
483
+ sourcePath: filePath,
484
+ contextPath,
485
+ isTextual: textual,
486
+ });
476
487
 
477
488
  return textual && content ? item.id : null;
478
489
  } catch (err) {
package/src/commands/daemon.ts CHANGED
@@ -12,7 +12,7 @@ export function registerDaemonCommand(program: Command) {
12
12
  .action(async () => {
13
13
  const dir = program.opts().dir;
14
14
  const { startDaemon } = await import("../daemon/index.ts");
15
- await startDaemon(dir);
15
+ await startDaemon(dir, { foreground: true });
16
16
  });
17
17
 
18
18
  daemon
package/src/commands/schedule.ts CHANGED
@@ -163,7 +163,7 @@ function enabledColor(enabled: boolean): string {
163
163
  }
164
164
 
165
165
  function printSchedule(s: Schedule) {
166
- const id = ansis.dim(s.id.slice(0, 8));
166
+ const id = ansis.dim(s.id);
167
167
  const lastRun = s.last_run_at
168
168
  ? s.last_run_at.toISOString()
169
169
  : ansis.dim("never");
package/src/commands/task.ts CHANGED
@@ -155,7 +155,7 @@ function priorityColor(priority: Task["priority"]): string {
155
155
  }
156
156
 
157
157
  function printTask(t: Task) {
158
- const id = ansis.dim(t.id.slice(0, 8));
158
+ const id = ansis.dim(t.id);
159
159
  console.log(
160
160
  ` ${id} ${statusColor(t.status)} ${priorityColor(t.priority)} ${t.name}`,
161
161
  );
@@ -168,6 +168,7 @@ function printTaskDetail(t: Task) {
168
168
  console.log(` Priority: ${priorityColor(t.priority)}`);
169
169
  if (t.description) console.log(` Description: ${t.description}`);
170
170
  if (t.waiting_reason) console.log(` Waiting: ${t.waiting_reason}`);
171
+ if (t.output) console.log(` Output: ${t.output}`);
171
172
  if (t.claimed_by) console.log(` Claimed by: ${t.claimed_by}`);
172
173
  if (t.blocked_by.length > 0)
173
174
  console.log(` Blocked by: ${t.blocked_by.join(", ")}`);
package/src/commands/thread.ts CHANGED
@@ -1,6 +1,5 @@
1
1
  import ansis from "ansis";
2
2
  import type { Command } from "commander";
3
- import type { DbConnection } from "../db/connection.ts";
4
3
  import type { Interaction, Thread } from "../db/threads.ts";
5
4
  import {
6
5
  deleteThread,
@@ -48,12 +47,7 @@ export function registerThreadCommand(program: Command) {
48
47
  )
49
48
  .action((id, opts) =>
50
49
  withDb(program, async (conn) => {
51
- const resolvedId = await resolveThreadId(conn, id);
52
- if (!resolvedId) {
53
- logger.error(`Thread not found: ${id}`);
54
- process.exit(1);
55
- }
56
- const result = await getThread(conn, resolvedId);
50
+ const result = await getThread(conn, id);
57
51
  if (!result) {
58
52
  logger.error(`Thread not found: ${id}`);
59
53
  process.exit(1);
@@ -72,17 +66,12 @@ export function registerThreadCommand(program: Command) {
72
66
  .description("Delete a thread and its interactions")
73
67
  .action((id) =>
74
68
  withDb(program, async (conn) => {
75
- const resolvedId = await resolveThreadId(conn, id);
76
- if (!resolvedId) {
77
- logger.error(`Thread not found: ${id}`);
78
- process.exit(1);
79
- }
80
- const deleted = await deleteThread(conn, resolvedId);
69
+ const deleted = await deleteThread(conn, id);
81
70
  if (!deleted) {
82
71
  logger.error(`Thread not found: ${id}`);
83
72
  process.exit(1);
84
73
  }
85
- logger.success(`Deleted thread: ${resolvedId}`);
74
+ logger.success(`Deleted thread: ${id}`);
86
75
  }),
87
76
  );
88
77
 
@@ -94,12 +83,7 @@ export function registerThreadCommand(program: Command) {
94
83
  withDb(program, async (conn) => {
95
84
  let resolvedId: string;
96
85
  if (id) {
97
- const found = await resolveThreadId(conn, id);
98
- if (!found) {
99
- logger.error(`Thread not found: ${id}`);
100
- process.exit(1);
101
- }
102
- resolvedId = found;
86
+ resolvedId = id;
103
87
  } else {
104
88
  const active = await getActiveThread(conn);
105
89
  if (!active) {
@@ -130,7 +114,7 @@ export function registerThreadCommand(program: Command) {
130
114
 
131
115
  const pollMs = opts.interval ?? 500;
132
116
  logger.info(
133
- `Following thread ${ansis.dim(resolvedId.slice(0, 8))}... (Ctrl+C to stop)`,
117
+ `Following thread ${ansis.dim(resolvedId)}... (Ctrl+C to stop)`,
134
118
  );
135
119
 
136
120
  const interval = setInterval(async () => {
@@ -168,24 +152,6 @@ export function registerThreadCommand(program: Command) {
168
152
  );
169
153
  }
170
154
 
171
- async function resolveThreadId(
172
- conn: DbConnection,
173
- idPrefix: string,
174
- ): Promise<string | null> {
175
- if (idPrefix.length >= 36) return idPrefix;
176
- const all = await listThreads(conn);
177
- const matches = all.filter((t) => t.id.startsWith(idPrefix));
178
- if (matches.length === 1) {
179
- const match = matches[0] as Thread;
180
- return match.id;
181
- }
182
- if (matches.length === 0) return null;
183
- logger.error(
184
- `Ambiguous thread prefix "${idPrefix}" matches ${matches.length} threads`,
185
- );
186
- process.exit(1);
187
- }
188
-
189
155
  function typeColor(type: Thread["type"]): string {
190
156
  switch (type) {
191
157
  case "daemon_tick":
@@ -213,7 +179,7 @@ function roleColor(role: Interaction["role"]): string {
213
179
  }
214
180
 
215
181
  function printThread(t: Thread) {
216
- const id = ansis.dim(t.id.slice(0, 8));
182
+ const id = ansis.dim(t.id);
217
183
  const title = t.title || ansis.dim("(untitled)");
218
184
  console.log(` ${id} ${typeColor(t.type)} ${statusLabel(t)} ${title}`);
219
185
  }
package/src/commands/with-db.ts CHANGED
@@ -13,8 +13,8 @@ export async function withDb<T>(
13
13
  fn: (conn: DbConnection, dir: string) => Promise<T>,
14
14
  ): Promise<T> {
15
15
  const dir = program.opts().dir;
16
- const conn = getConnection(getDbPath(dir));
17
- migrate(conn);
16
+ const conn = await getConnection(getDbPath(dir));
17
+ await migrate(conn);
18
18
  try {
19
19
  return await fn(conn, dir);
20
20
  } finally {
package/src/constants.ts CHANGED
@@ -12,7 +12,7 @@ export const DEFAULTS = {
12
12
  UPDATE_CHECK_INTERVAL_MS: 24 * 60 * 60 * 1000, // 24 hours
13
13
  UPDATE_CHECK_TIMEOUT_MS: 5_000,
14
14
  } as const;
15
- export const DB_FILENAME = "data.sqlite";
15
+ export const DB_FILENAME = "data.duckdb";
16
16
  export const PID_FILENAME = "daemon.pid";
17
17
  export const LOG_FILENAME = "daemon.log";
18
18
  export const CONFIG_FILENAME = "config.json";
package/src/context/chunker.ts CHANGED
@@ -1,16 +1,14 @@
1
1
  import Anthropic from "@anthropic-ai/sdk";
2
2
  import type { BotholomewConfig } from "../config/schemas.ts";
3
- import { logger } from "../utils/logger.ts";
4
3
 
5
4
  export interface Chunk {
6
5
  index: number;
7
6
  content: string;
8
7
  }
9
8
 
10
- const DEFAULT_WINDOW_CHARS = 2000;
11
- const DEFAULT_OVERLAP_CHARS = 200;
12
9
  const SHORT_CONTENT_THRESHOLD = 200;
13
10
  const LLM_TIMEOUT_MS = 10_000;
11
+ const DEFAULT_OVERLAP_LINES = 2;
14
12
 
15
13
  const CHUNKER_TOOL_NAME = "return_chunks";
16
14
  const CHUNKER_TOOL = {
@@ -44,42 +42,23 @@ const CHUNKER_TOOL = {
44
42
  };
45
43
 
46
44
  /**
47
- * Deterministic sliding-window chunker.
48
- * Splits content into overlapping windows of approximately `windowChars` characters,
49
- * breaking at newlines when possible.
45
+ * Add overlapping lines from the end of each chunk to the start of the next.
46
+ * Improves retrieval when concepts span chunk boundaries.
50
47
  */
51
- export function chunkWithSlidingWindow(
52
- content: string,
53
- windowChars = DEFAULT_WINDOW_CHARS,
54
- overlapChars = DEFAULT_OVERLAP_CHARS,
48
+ export function addOverlapToChunks(
49
+ chunks: Chunk[],
50
+ overlapLines = DEFAULT_OVERLAP_LINES,
55
51
  ): Chunk[] {
56
- if (content.length <= windowChars) {
57
- return [{ index: 0, content }];
58
- }
59
-
60
- const chunks: Chunk[] = [];
61
- let start = 0;
62
- let index = 0;
63
-
64
- while (start < content.length) {
65
- let end = Math.min(start + windowChars, content.length);
66
-
67
- // Try to break at a newline near the end of the window
68
- if (end < content.length) {
69
- const lastNewline = content.lastIndexOf("\n", end);
70
- if (lastNewline > start + windowChars / 2) {
71
- end = lastNewline + 1;
72
- }
73
- }
74
-
75
- chunks.push({ index, content: content.slice(start, end) });
76
- index++;
77
-
78
- if (end >= content.length) break;
79
- start = end - overlapChars;
80
- }
81
-
82
- return chunks;
52
+ if (chunks.length <= 1 || overlapLines <= 0) return chunks;
53
+
54
+ return chunks.map((c, i) => {
55
+ if (i === 0) return { ...c };
56
+ const prevChunk = chunks[i - 1];
57
+ if (!prevChunk) return { ...c };
58
+ const prevLines = prevChunk.content.split("\n");
59
+ const overlap = prevLines.slice(-overlapLines).join("\n");
60
+ return { ...c, content: `${overlap}\n${c.content}` };
61
+ });
83
62
  }
84
63
 
85
64
  /**
@@ -139,7 +118,7 @@ ${content}`,
139
118
  }
140
119
 
141
120
  /**
142
- * Chunk content using LLM when possible, falling back to sliding window.
121
+ * Chunk content using the LLM chunker.
143
122
  * Short content (<200 chars) is returned as a single chunk.
144
123
  */
145
124
  export async function chunk(
@@ -151,14 +130,12 @@ export async function chunk(
151
130
  return [{ index: 0, content }];
152
131
  }
153
132
 
154
- // Only try LLM chunking if we have an API key
155
- if (config.anthropic_api_key) {
156
- try {
157
- return await chunkWithLLM(content, mimeType, config);
158
- } catch (err) {
159
- logger.debug(`LLM chunking failed, using sliding window: ${err}`);
160
- }
133
+ if (!config.anthropic_api_key) {
134
+ throw new Error(
135
+ "Anthropic API key is required for chunking. Set anthropic_api_key in config.",
136
+ );
161
137
  }
162
138
 
163
- return chunkWithSlidingWindow(content);
139
+ const chunks = await chunkWithLLM(content, mimeType, config);
140
+ return addOverlapToChunks(chunks);
164
141
  }
package/src/context/describer.ts ADDED
@@ -0,0 +1,146 @@
1
+ import Anthropic from "@anthropic-ai/sdk";
2
+ import type { BotholomewConfig } from "../config/schemas.ts";
3
+ import { logger } from "../utils/logger.ts";
4
+
5
+ const DESCRIBE_TOOL_NAME = "return_description";
6
+ const DESCRIBE_TOOL = {
7
+ name: DESCRIBE_TOOL_NAME,
8
+ description: "Return a one-sentence description of this content.",
9
+ input_schema: {
10
+ type: "object" as const,
11
+ properties: {
12
+ description: {
13
+ type: "string",
14
+ description:
15
+ "A concise one-sentence summary of what this content is about.",
16
+ },
17
+ },
18
+ required: ["description"],
19
+ },
20
+ };
21
+
22
+ const TIMEOUT_MS = 10_000;
23
+ const MAX_CONTENT_CHARS = 8000;
24
+ const MAX_FILE_BYTES = 10 * 1024 * 1024; // 10 MB
25
+
26
+ const IMAGE_TYPES = new Set([
27
+ "image/jpeg",
28
+ "image/png",
29
+ "image/gif",
30
+ "image/webp",
31
+ ]);
32
+
33
+ type ImageMediaType = "image/jpeg" | "image/png" | "image/gif" | "image/webp";
34
+
35
+ /**
36
+ * Build the message content array for the LLM description request.
37
+ * Attaches the file as an image or document block when possible.
38
+ */
39
+ async function buildMessageContent(
40
+ opts: DescriberOpts,
41
+ ): Promise<Anthropic.Messages.ContentBlockParam[]> {
42
+ const textPrompt = `Describe this file in one sentence. Be specific about what it contains, not generic.\n\nFilename: ${opts.filename}\nMIME type: ${opts.mimeType}`;
43
+
44
+ // Text file — include content inline
45
+ if (opts.content) {
46
+ const truncated =
47
+ opts.content.length > MAX_CONTENT_CHARS
48
+ ? `${opts.content.slice(0, MAX_CONTENT_CHARS)}\n... (truncated)`
49
+ : opts.content;
50
+ return [{ type: "text", text: `${textPrompt}\n\nContent:\n${truncated}` }];
51
+ }
52
+
53
+ // Binary file — try to attach if we have a file path
54
+ if (opts.filePath) {
55
+ const file = Bun.file(opts.filePath);
56
+ const size = file.size;
57
+
58
+ if (size > 0 && size <= MAX_FILE_BYTES) {
59
+ const data = Buffer.from(await file.arrayBuffer()).toString("base64");
60
+
61
+ if (IMAGE_TYPES.has(opts.mimeType)) {
62
+ return [
63
+ {
64
+ type: "image",
65
+ source: {
66
+ type: "base64",
67
+ media_type: opts.mimeType as ImageMediaType,
68
+ data,
69
+ },
70
+ },
71
+ { type: "text", text: textPrompt },
72
+ ];
73
+ }
74
+
75
+ if (opts.mimeType === "application/pdf") {
76
+ return [
77
+ {
78
+ type: "document",
79
+ source: { type: "base64", media_type: "application/pdf", data },
80
+ },
81
+ { type: "text", text: textPrompt },
82
+ ];
83
+ }
84
+ }
85
+ }
86
+
87
+ // Fallback — describe from filename and MIME type only
88
+ return [
89
+ {
90
+ type: "text",
91
+ text: `${textPrompt}\n\n(Binary file — no content preview available)`,
92
+ },
93
+ ];
94
+ }
95
+
96
+ interface DescriberOpts {
97
+ filename: string;
98
+ mimeType: string;
99
+ content: string | null;
100
+ filePath?: string;
101
+ }
102
+
103
+ /**
104
+ * Generate a short description of a file using the LLM.
105
+ * For textual files, summarises the content.
106
+ * For binary files, attaches images/PDFs directly or describes from metadata.
107
+ */
108
+ export async function generateDescription(
109
+ config: Required<BotholomewConfig>,
110
+ opts: DescriberOpts,
111
+ ): Promise<string> {
112
+ if (!config.anthropic_api_key) {
113
+ return "";
114
+ }
115
+
116
+ const client = new Anthropic({ apiKey: config.anthropic_api_key });
117
+
118
+ try {
119
+ const content = await buildMessageContent(opts);
120
+
121
+ const response = await Promise.race([
122
+ client.messages.create({
123
+ model: config.chunker_model,
124
+ max_tokens: 256,
125
+ tools: [DESCRIBE_TOOL],
126
+ tool_choice: { type: "tool", name: DESCRIBE_TOOL_NAME },
127
+ messages: [{ role: "user", content }],
128
+ }),
129
+ new Promise<never>((_, reject) =>
130
+ setTimeout(
131
+ () => reject(new Error("Description generation timeout")),
132
+ TIMEOUT_MS,
133
+ ),
134
+ ),
135
+ ]);
136
+
137
+ const toolBlock = response.content.find((b) => b.type === "tool_use");
138
+ if (!toolBlock || toolBlock.type !== "tool_use") return "";
139
+
140
+ const input = toolBlock.input as { description: string };
141
+ return input.description || "";
142
+ } catch (err) {
143
+ logger.debug(`Description generation failed: ${err}`);
144
+ return "";
145
+ }
146
+ }