botholomew 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/chat/session.ts +2 -2
- package/src/commands/context.ts +53 -42
- package/src/commands/daemon.ts +1 -1
- package/src/commands/schedule.ts +1 -1
- package/src/commands/task.ts +2 -1
- package/src/commands/thread.ts +6 -40
- package/src/commands/with-db.ts +2 -2
- package/src/constants.ts +1 -1
- package/src/context/chunker.ts +23 -46
- package/src/context/describer.ts +146 -0
- package/src/context/ingest.ts +27 -25
- package/src/daemon/index.ts +51 -5
- package/src/daemon/llm.ts +80 -12
- package/src/daemon/prompt.ts +3 -4
- package/src/daemon/schedules.ts +7 -1
- package/src/daemon/tick.ts +17 -5
- package/src/db/connection.ts +102 -40
- package/src/db/context.ts +120 -94
- package/src/db/embeddings.ts +55 -77
- package/src/db/query.ts +11 -0
- package/src/db/schedules.ts +27 -28
- package/src/db/schema.ts +9 -9
- package/src/db/sql/1-core_tables.sql +11 -11
- package/src/db/sql/2-logging_tables.sql +3 -3
- package/src/db/sql/3-daemon_state.sql +2 -2
- package/src/db/sql/6-vss_index.sql +1 -0
- package/src/db/sql/7-drop_embeddings_fk.sql +24 -0
- package/src/db/sql/8-task_output.sql +1 -0
- package/src/db/tasks.ts +89 -78
- package/src/db/threads.ts +52 -41
- package/src/init/index.ts +2 -2
- package/src/tools/file/move.ts +5 -3
- package/src/tools/file/write.ts +2 -30
- package/src/tools/search/semantic.ts +7 -4
- package/src/tools/task/list.ts +2 -0
- package/src/tools/task/view.ts +2 -0
- package/src/tui/App.tsx +20 -3
- package/src/tui/components/SchedulePanel.tsx +389 -0
- package/src/tui/components/TabBar.tsx +3 -2
- package/src/tui/components/TaskPanel.tsx +6 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "botholomew",
|
|
3
|
-
"version": "0.5.0",
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"description": "An AI agent for knowledge work",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -23,8 +23,8 @@
|
|
|
23
23
|
},
|
|
24
24
|
"dependencies": {
|
|
25
25
|
"@anthropic-ai/sdk": "^0.88.0",
|
|
26
|
+
"@duckdb/node-api": "^1.5.2-r.1",
|
|
26
27
|
"@evantahler/mcpx": "0.18.3",
|
|
27
|
-
"@sqliteai/sqlite-vector": "^0.9.95",
|
|
28
28
|
"ansis": "^4.2.0",
|
|
29
29
|
"commander": "^14.0.0",
|
|
30
30
|
"gray-matter": "^4.0.3",
|
package/src/chat/session.ts
CHANGED
|
@@ -44,8 +44,8 @@ export async function startChatSession(
|
|
|
44
44
|
);
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
-
const conn = getConnection(getDbPath(projectDir));
|
|
48
|
-
migrate(conn);
|
|
47
|
+
const conn = await getConnection(getDbPath(projectDir));
|
|
48
|
+
await migrate(conn);
|
|
49
49
|
|
|
50
50
|
let threadId: string;
|
|
51
51
|
const messages: MessageParam[] = [];
|
package/src/commands/context.ts
CHANGED
|
@@ -5,6 +5,8 @@ import type { Command } from "commander";
|
|
|
5
5
|
import { isText } from "istextorbinary";
|
|
6
6
|
import { createSpinner } from "nanospinner";
|
|
7
7
|
import { loadConfig } from "../config/loader.ts";
|
|
8
|
+
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
9
|
+
import { generateDescription } from "../context/describer.ts";
|
|
8
10
|
import { embedSingle } from "../context/embedder.ts";
|
|
9
11
|
import {
|
|
10
12
|
type PreparedIngestion,
|
|
@@ -14,18 +16,14 @@ import {
|
|
|
14
16
|
import type { DbConnection } from "../db/connection.ts";
|
|
15
17
|
import {
|
|
16
18
|
type ContextItem,
|
|
17
|
-
createContextItem,
|
|
18
19
|
deleteContextItemByPath,
|
|
19
20
|
getContextItemByPath,
|
|
20
21
|
listContextItems,
|
|
21
22
|
listContextItemsByPrefix,
|
|
22
23
|
updateContextItem,
|
|
24
|
+
upsertContextItem,
|
|
23
25
|
} from "../db/context.ts";
|
|
24
|
-
import {
|
|
25
|
-
getEmbeddingsForItem,
|
|
26
|
-
hybridSearch,
|
|
27
|
-
initVectorSearch,
|
|
28
|
-
} from "../db/embeddings.ts";
|
|
26
|
+
import { getEmbeddingsForItem, hybridSearch } from "../db/embeddings.ts";
|
|
29
27
|
import { logger } from "../utils/logger.ts";
|
|
30
28
|
import { withDb } from "./with-db.ts";
|
|
31
29
|
|
|
@@ -61,7 +59,7 @@ export function registerContextCommand(program: Command) {
|
|
|
61
59
|
return;
|
|
62
60
|
}
|
|
63
61
|
|
|
64
|
-
const header = `${ansis.bold("Path".padEnd(
|
|
62
|
+
const header = `${ansis.bold("Path".padEnd(35))} ${"Title".padEnd(20)} ${"Description".padEnd(30)} ${"Type".padEnd(15)} ${"Updated".padEnd(18)} Indexed`;
|
|
65
63
|
console.log(header);
|
|
66
64
|
console.log("-".repeat(header.length));
|
|
67
65
|
|
|
@@ -70,8 +68,11 @@ export function registerContextCommand(program: Command) {
|
|
|
70
68
|
? ansis.green("yes")
|
|
71
69
|
: ansis.dim("no");
|
|
72
70
|
const updated = ansis.dim(fmtDate(item.updated_at).padEnd(18));
|
|
71
|
+
const desc = item.description
|
|
72
|
+
? ansis.dim(item.description.slice(0, 29).padEnd(30))
|
|
73
|
+
: ansis.dim("".padEnd(30));
|
|
73
74
|
console.log(
|
|
74
|
-
`${item.context_path.padEnd(
|
|
75
|
+
`${item.context_path.slice(0, 34).padEnd(35)} ${item.title.slice(0, 19).padEnd(20)} ${desc} ${item.mime_type.slice(0, 14).padEnd(15)} ${updated} ${indexed}`,
|
|
75
76
|
);
|
|
76
77
|
}
|
|
77
78
|
|
|
@@ -91,6 +92,7 @@ export function registerContextCommand(program: Command) {
|
|
|
91
92
|
}
|
|
92
93
|
|
|
93
94
|
console.log(ansis.bold(item.title));
|
|
95
|
+
if (item.description) console.log(` Description: ${item.description}`);
|
|
94
96
|
console.log(` Path: ${item.context_path}`);
|
|
95
97
|
console.log(` MIME type: ${item.mime_type}`);
|
|
96
98
|
if (item.source_path) console.log(` Source: ${item.source_path}`);
|
|
@@ -150,18 +152,34 @@ export function registerContextCommand(program: Command) {
|
|
|
150
152
|
text: `Found ${filesToAdd.length} file(s) to add.`,
|
|
151
153
|
});
|
|
152
154
|
|
|
153
|
-
// Phase 2: Load config and upsert DB records (
|
|
155
|
+
// Phase 2: Load config and upsert DB records (batched, parallel LLM descriptions)
|
|
154
156
|
const config = await loadConfig(dir);
|
|
157
|
+
const CONCURRENCY = 10;
|
|
158
|
+
let addCompleted = 0;
|
|
155
159
|
const upsertSpinner = createSpinner(
|
|
156
|
-
|
|
160
|
+
`Adding and describing 0/${filesToAdd.length} files...`,
|
|
157
161
|
).start();
|
|
158
162
|
const itemIds: { id: string; contextPath: string }[] = [];
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
163
|
+
|
|
164
|
+
for (let i = 0; i < filesToAdd.length; i += CONCURRENCY) {
|
|
165
|
+
const batch = filesToAdd.slice(i, i + CONCURRENCY);
|
|
166
|
+
const results = await Promise.all(
|
|
167
|
+
batch.map(async ({ filePath, contextPath }) => {
|
|
168
|
+
const result = await addFile(conn, filePath, contextPath, config);
|
|
169
|
+
addCompleted++;
|
|
170
|
+
upsertSpinner.update({
|
|
171
|
+
text: `Adding and describing ${addCompleted}/${filesToAdd.length} files...`,
|
|
172
|
+
});
|
|
173
|
+
return result ? { id: result, contextPath } : null;
|
|
174
|
+
}),
|
|
175
|
+
);
|
|
176
|
+
for (const r of results) {
|
|
177
|
+
if (r) itemIds.push(r);
|
|
178
|
+
}
|
|
162
179
|
}
|
|
180
|
+
|
|
163
181
|
upsertSpinner.success({
|
|
164
|
-
text: `Added ${itemIds.length} file(s)
|
|
182
|
+
text: `Added and described ${itemIds.length} file(s).`,
|
|
165
183
|
});
|
|
166
184
|
|
|
167
185
|
// Phase 3: Chunk + embed in parallel (network I/O)
|
|
@@ -173,7 +191,6 @@ export function registerContextCommand(program: Command) {
|
|
|
173
191
|
process.exit(0);
|
|
174
192
|
}
|
|
175
193
|
|
|
176
|
-
const CONCURRENCY = 10;
|
|
177
194
|
let completed = 0;
|
|
178
195
|
const embedSpinner = createSpinner(
|
|
179
196
|
`Embedding 0/${itemIds.length} files...`,
|
|
@@ -205,7 +222,7 @@ export function registerContextCommand(program: Command) {
|
|
|
205
222
|
let filesAdded = 0;
|
|
206
223
|
let filesUpdated = 0;
|
|
207
224
|
for (const p of prepared) {
|
|
208
|
-
const result = storeIngestion(conn, p);
|
|
225
|
+
const result = await storeIngestion(conn, p);
|
|
209
226
|
chunks += result.chunks;
|
|
210
227
|
if (result.isUpdate) filesUpdated++;
|
|
211
228
|
else filesAdded++;
|
|
@@ -226,9 +243,8 @@ export function registerContextCommand(program: Command) {
|
|
|
226
243
|
.action((query, opts) =>
|
|
227
244
|
withDb(program, async (conn, dir) => {
|
|
228
245
|
const config = await loadConfig(dir);
|
|
229
|
-
initVectorSearch(conn);
|
|
230
246
|
const queryVec = await embedSingle(query, config);
|
|
231
|
-
const results = hybridSearch(conn, query, queryVec, opts.topK);
|
|
247
|
+
const results = await hybridSearch(conn, query, queryVec, opts.topK);
|
|
232
248
|
|
|
233
249
|
if (results.length === 0) {
|
|
234
250
|
logger.dim("No results found.");
|
|
@@ -280,7 +296,7 @@ export function registerContextCommand(program: Command) {
|
|
|
280
296
|
return;
|
|
281
297
|
}
|
|
282
298
|
|
|
283
|
-
const embeddings = getEmbeddingsForItem(conn, item.id);
|
|
299
|
+
const embeddings = await getEmbeddingsForItem(conn, item.id);
|
|
284
300
|
|
|
285
301
|
console.log(ansis.bold(item.title));
|
|
286
302
|
console.log(` Path: ${item.context_path}`);
|
|
@@ -411,7 +427,7 @@ export function registerContextCommand(program: Command) {
|
|
|
411
427
|
|
|
412
428
|
let chunks = 0;
|
|
413
429
|
for (const p of prepared) {
|
|
414
|
-
const result = storeIngestion(conn, p);
|
|
430
|
+
const result = await storeIngestion(conn, p);
|
|
415
431
|
chunks += result.chunks;
|
|
416
432
|
}
|
|
417
433
|
|
|
@@ -443,36 +459,31 @@ async function addFile(
|
|
|
443
459
|
conn: DbConnection,
|
|
444
460
|
filePath: string,
|
|
445
461
|
contextPath: string,
|
|
462
|
+
config: Required<BotholomewConfig>,
|
|
446
463
|
): Promise<string | null> {
|
|
447
464
|
try {
|
|
448
465
|
const bunFile = Bun.file(filePath);
|
|
449
466
|
const mimeType = bunFile.type.split(";")[0] || "application/octet-stream";
|
|
450
467
|
const filename = basename(filePath);
|
|
451
468
|
const textual = isText(filename) !== false;
|
|
452
|
-
|
|
453
469
|
const content = textual ? await bunFile.text() : null;
|
|
454
470
|
|
|
455
|
-
const
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
sourcePath: filePath,
|
|
472
|
-
contextPath,
|
|
473
|
-
isTextual: textual,
|
|
474
|
-
});
|
|
475
|
-
}
|
|
471
|
+
const description = await generateDescription(config, {
|
|
472
|
+
filename,
|
|
473
|
+
mimeType,
|
|
474
|
+
content,
|
|
475
|
+
filePath,
|
|
476
|
+
});
|
|
477
|
+
|
|
478
|
+
const item = await upsertContextItem(conn, {
|
|
479
|
+
title: filename,
|
|
480
|
+
description,
|
|
481
|
+
content: content ?? undefined,
|
|
482
|
+
mimeType,
|
|
483
|
+
sourcePath: filePath,
|
|
484
|
+
contextPath,
|
|
485
|
+
isTextual: textual,
|
|
486
|
+
});
|
|
476
487
|
|
|
477
488
|
return textual && content ? item.id : null;
|
|
478
489
|
} catch (err) {
|
package/src/commands/daemon.ts
CHANGED
|
@@ -12,7 +12,7 @@ export function registerDaemonCommand(program: Command) {
|
|
|
12
12
|
.action(async () => {
|
|
13
13
|
const dir = program.opts().dir;
|
|
14
14
|
const { startDaemon } = await import("../daemon/index.ts");
|
|
15
|
-
await startDaemon(dir);
|
|
15
|
+
await startDaemon(dir, { foreground: true });
|
|
16
16
|
});
|
|
17
17
|
|
|
18
18
|
daemon
|
package/src/commands/schedule.ts
CHANGED
|
@@ -163,7 +163,7 @@ function enabledColor(enabled: boolean): string {
|
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
function printSchedule(s: Schedule) {
|
|
166
|
-
const id = ansis.dim(s.id
|
|
166
|
+
const id = ansis.dim(s.id);
|
|
167
167
|
const lastRun = s.last_run_at
|
|
168
168
|
? s.last_run_at.toISOString()
|
|
169
169
|
: ansis.dim("never");
|
package/src/commands/task.ts
CHANGED
|
@@ -155,7 +155,7 @@ function priorityColor(priority: Task["priority"]): string {
|
|
|
155
155
|
}
|
|
156
156
|
|
|
157
157
|
function printTask(t: Task) {
|
|
158
|
-
const id = ansis.dim(t.id
|
|
158
|
+
const id = ansis.dim(t.id);
|
|
159
159
|
console.log(
|
|
160
160
|
` ${id} ${statusColor(t.status)} ${priorityColor(t.priority)} ${t.name}`,
|
|
161
161
|
);
|
|
@@ -168,6 +168,7 @@ function printTaskDetail(t: Task) {
|
|
|
168
168
|
console.log(` Priority: ${priorityColor(t.priority)}`);
|
|
169
169
|
if (t.description) console.log(` Description: ${t.description}`);
|
|
170
170
|
if (t.waiting_reason) console.log(` Waiting: ${t.waiting_reason}`);
|
|
171
|
+
if (t.output) console.log(` Output: ${t.output}`);
|
|
171
172
|
if (t.claimed_by) console.log(` Claimed by: ${t.claimed_by}`);
|
|
172
173
|
if (t.blocked_by.length > 0)
|
|
173
174
|
console.log(` Blocked by: ${t.blocked_by.join(", ")}`);
|
package/src/commands/thread.ts
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import ansis from "ansis";
|
|
2
2
|
import type { Command } from "commander";
|
|
3
|
-
import type { DbConnection } from "../db/connection.ts";
|
|
4
3
|
import type { Interaction, Thread } from "../db/threads.ts";
|
|
5
4
|
import {
|
|
6
5
|
deleteThread,
|
|
@@ -48,12 +47,7 @@ export function registerThreadCommand(program: Command) {
|
|
|
48
47
|
)
|
|
49
48
|
.action((id, opts) =>
|
|
50
49
|
withDb(program, async (conn) => {
|
|
51
|
-
const
|
|
52
|
-
if (!resolvedId) {
|
|
53
|
-
logger.error(`Thread not found: ${id}`);
|
|
54
|
-
process.exit(1);
|
|
55
|
-
}
|
|
56
|
-
const result = await getThread(conn, resolvedId);
|
|
50
|
+
const result = await getThread(conn, id);
|
|
57
51
|
if (!result) {
|
|
58
52
|
logger.error(`Thread not found: ${id}`);
|
|
59
53
|
process.exit(1);
|
|
@@ -72,17 +66,12 @@ export function registerThreadCommand(program: Command) {
|
|
|
72
66
|
.description("Delete a thread and its interactions")
|
|
73
67
|
.action((id) =>
|
|
74
68
|
withDb(program, async (conn) => {
|
|
75
|
-
const
|
|
76
|
-
if (!resolvedId) {
|
|
77
|
-
logger.error(`Thread not found: ${id}`);
|
|
78
|
-
process.exit(1);
|
|
79
|
-
}
|
|
80
|
-
const deleted = await deleteThread(conn, resolvedId);
|
|
69
|
+
const deleted = await deleteThread(conn, id);
|
|
81
70
|
if (!deleted) {
|
|
82
71
|
logger.error(`Thread not found: ${id}`);
|
|
83
72
|
process.exit(1);
|
|
84
73
|
}
|
|
85
|
-
logger.success(`Deleted thread: ${
|
|
74
|
+
logger.success(`Deleted thread: ${id}`);
|
|
86
75
|
}),
|
|
87
76
|
);
|
|
88
77
|
|
|
@@ -94,12 +83,7 @@ export function registerThreadCommand(program: Command) {
|
|
|
94
83
|
withDb(program, async (conn) => {
|
|
95
84
|
let resolvedId: string;
|
|
96
85
|
if (id) {
|
|
97
|
-
|
|
98
|
-
if (!found) {
|
|
99
|
-
logger.error(`Thread not found: ${id}`);
|
|
100
|
-
process.exit(1);
|
|
101
|
-
}
|
|
102
|
-
resolvedId = found;
|
|
86
|
+
resolvedId = id;
|
|
103
87
|
} else {
|
|
104
88
|
const active = await getActiveThread(conn);
|
|
105
89
|
if (!active) {
|
|
@@ -130,7 +114,7 @@ export function registerThreadCommand(program: Command) {
|
|
|
130
114
|
|
|
131
115
|
const pollMs = opts.interval ?? 500;
|
|
132
116
|
logger.info(
|
|
133
|
-
`Following thread ${ansis.dim(resolvedId
|
|
117
|
+
`Following thread ${ansis.dim(resolvedId)}... (Ctrl+C to stop)`,
|
|
134
118
|
);
|
|
135
119
|
|
|
136
120
|
const interval = setInterval(async () => {
|
|
@@ -168,24 +152,6 @@ export function registerThreadCommand(program: Command) {
|
|
|
168
152
|
);
|
|
169
153
|
}
|
|
170
154
|
|
|
171
|
-
async function resolveThreadId(
|
|
172
|
-
conn: DbConnection,
|
|
173
|
-
idPrefix: string,
|
|
174
|
-
): Promise<string | null> {
|
|
175
|
-
if (idPrefix.length >= 36) return idPrefix;
|
|
176
|
-
const all = await listThreads(conn);
|
|
177
|
-
const matches = all.filter((t) => t.id.startsWith(idPrefix));
|
|
178
|
-
if (matches.length === 1) {
|
|
179
|
-
const match = matches[0] as Thread;
|
|
180
|
-
return match.id;
|
|
181
|
-
}
|
|
182
|
-
if (matches.length === 0) return null;
|
|
183
|
-
logger.error(
|
|
184
|
-
`Ambiguous thread prefix "${idPrefix}" matches ${matches.length} threads`,
|
|
185
|
-
);
|
|
186
|
-
process.exit(1);
|
|
187
|
-
}
|
|
188
|
-
|
|
189
155
|
function typeColor(type: Thread["type"]): string {
|
|
190
156
|
switch (type) {
|
|
191
157
|
case "daemon_tick":
|
|
@@ -213,7 +179,7 @@ function roleColor(role: Interaction["role"]): string {
|
|
|
213
179
|
}
|
|
214
180
|
|
|
215
181
|
function printThread(t: Thread) {
|
|
216
|
-
const id = ansis.dim(t.id
|
|
182
|
+
const id = ansis.dim(t.id);
|
|
217
183
|
const title = t.title || ansis.dim("(untitled)");
|
|
218
184
|
console.log(` ${id} ${typeColor(t.type)} ${statusLabel(t)} ${title}`);
|
|
219
185
|
}
|
package/src/commands/with-db.ts
CHANGED
|
@@ -13,8 +13,8 @@ export async function withDb<T>(
|
|
|
13
13
|
fn: (conn: DbConnection, dir: string) => Promise<T>,
|
|
14
14
|
): Promise<T> {
|
|
15
15
|
const dir = program.opts().dir;
|
|
16
|
-
const conn = getConnection(getDbPath(dir));
|
|
17
|
-
migrate(conn);
|
|
16
|
+
const conn = await getConnection(getDbPath(dir));
|
|
17
|
+
await migrate(conn);
|
|
18
18
|
try {
|
|
19
19
|
return await fn(conn, dir);
|
|
20
20
|
} finally {
|
package/src/constants.ts
CHANGED
|
@@ -12,7 +12,7 @@ export const DEFAULTS = {
|
|
|
12
12
|
UPDATE_CHECK_INTERVAL_MS: 24 * 60 * 60 * 1000, // 24 hours
|
|
13
13
|
UPDATE_CHECK_TIMEOUT_MS: 5_000,
|
|
14
14
|
} as const;
|
|
15
|
-
export const DB_FILENAME = "data.
|
|
15
|
+
export const DB_FILENAME = "data.duckdb";
|
|
16
16
|
export const PID_FILENAME = "daemon.pid";
|
|
17
17
|
export const LOG_FILENAME = "daemon.log";
|
|
18
18
|
export const CONFIG_FILENAME = "config.json";
|
package/src/context/chunker.ts
CHANGED
|
@@ -1,16 +1,14 @@
|
|
|
1
1
|
import Anthropic from "@anthropic-ai/sdk";
|
|
2
2
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
3
|
-
import { logger } from "../utils/logger.ts";
|
|
4
3
|
|
|
5
4
|
export interface Chunk {
|
|
6
5
|
index: number;
|
|
7
6
|
content: string;
|
|
8
7
|
}
|
|
9
8
|
|
|
10
|
-
const DEFAULT_WINDOW_CHARS = 2000;
|
|
11
|
-
const DEFAULT_OVERLAP_CHARS = 200;
|
|
12
9
|
const SHORT_CONTENT_THRESHOLD = 200;
|
|
13
10
|
const LLM_TIMEOUT_MS = 10_000;
|
|
11
|
+
const DEFAULT_OVERLAP_LINES = 2;
|
|
14
12
|
|
|
15
13
|
const CHUNKER_TOOL_NAME = "return_chunks";
|
|
16
14
|
const CHUNKER_TOOL = {
|
|
@@ -44,42 +42,23 @@ const CHUNKER_TOOL = {
|
|
|
44
42
|
};
|
|
45
43
|
|
|
46
44
|
/**
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
* breaking at newlines when possible.
|
|
45
|
+
* Add overlapping lines from the end of each chunk to the start of the next.
|
|
46
|
+
* Improves retrieval when concepts span chunk boundaries.
|
|
50
47
|
*/
|
|
51
|
-
export function
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
overlapChars = DEFAULT_OVERLAP_CHARS,
|
|
48
|
+
export function addOverlapToChunks(
|
|
49
|
+
chunks: Chunk[],
|
|
50
|
+
overlapLines = DEFAULT_OVERLAP_LINES,
|
|
55
51
|
): Chunk[] {
|
|
56
|
-
if (
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
// Try to break at a newline near the end of the window
|
|
68
|
-
if (end < content.length) {
|
|
69
|
-
const lastNewline = content.lastIndexOf("\n", end);
|
|
70
|
-
if (lastNewline > start + windowChars / 2) {
|
|
71
|
-
end = lastNewline + 1;
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
chunks.push({ index, content: content.slice(start, end) });
|
|
76
|
-
index++;
|
|
77
|
-
|
|
78
|
-
if (end >= content.length) break;
|
|
79
|
-
start = end - overlapChars;
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
return chunks;
|
|
52
|
+
if (chunks.length <= 1 || overlapLines <= 0) return chunks;
|
|
53
|
+
|
|
54
|
+
return chunks.map((c, i) => {
|
|
55
|
+
if (i === 0) return { ...c };
|
|
56
|
+
const prevChunk = chunks[i - 1];
|
|
57
|
+
if (!prevChunk) return { ...c };
|
|
58
|
+
const prevLines = prevChunk.content.split("\n");
|
|
59
|
+
const overlap = prevLines.slice(-overlapLines).join("\n");
|
|
60
|
+
return { ...c, content: `${overlap}\n${c.content}` };
|
|
61
|
+
});
|
|
83
62
|
}
|
|
84
63
|
|
|
85
64
|
/**
|
|
@@ -139,7 +118,7 @@ ${content}`,
|
|
|
139
118
|
}
|
|
140
119
|
|
|
141
120
|
/**
|
|
142
|
-
* Chunk content using LLM
|
|
121
|
+
* Chunk content using the LLM chunker.
|
|
143
122
|
* Short content (<200 chars) is returned as a single chunk.
|
|
144
123
|
*/
|
|
145
124
|
export async function chunk(
|
|
@@ -151,14 +130,12 @@ export async function chunk(
|
|
|
151
130
|
return [{ index: 0, content }];
|
|
152
131
|
}
|
|
153
132
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
} catch (err) {
|
|
159
|
-
logger.debug(`LLM chunking failed, using sliding window: ${err}`);
|
|
160
|
-
}
|
|
133
|
+
if (!config.anthropic_api_key) {
|
|
134
|
+
throw new Error(
|
|
135
|
+
"Anthropic API key is required for chunking. Set anthropic_api_key in config.",
|
|
136
|
+
);
|
|
161
137
|
}
|
|
162
138
|
|
|
163
|
-
|
|
139
|
+
const chunks = await chunkWithLLM(content, mimeType, config);
|
|
140
|
+
return addOverlapToChunks(chunks);
|
|
164
141
|
}
|
package/src/context/describer.ts
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
2
|
+
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
3
|
+
import { logger } from "../utils/logger.ts";
|
|
4
|
+
|
|
5
|
+
const DESCRIBE_TOOL_NAME = "return_description";
|
|
6
|
+
const DESCRIBE_TOOL = {
|
|
7
|
+
name: DESCRIBE_TOOL_NAME,
|
|
8
|
+
description: "Return a one-sentence description of this content.",
|
|
9
|
+
input_schema: {
|
|
10
|
+
type: "object" as const,
|
|
11
|
+
properties: {
|
|
12
|
+
description: {
|
|
13
|
+
type: "string",
|
|
14
|
+
description:
|
|
15
|
+
"A concise one-sentence summary of what this content is about.",
|
|
16
|
+
},
|
|
17
|
+
},
|
|
18
|
+
required: ["description"],
|
|
19
|
+
},
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
const TIMEOUT_MS = 10_000;
|
|
23
|
+
const MAX_CONTENT_CHARS = 8000;
|
|
24
|
+
const MAX_FILE_BYTES = 10 * 1024 * 1024; // 10 MB
|
|
25
|
+
|
|
26
|
+
const IMAGE_TYPES = new Set([
|
|
27
|
+
"image/jpeg",
|
|
28
|
+
"image/png",
|
|
29
|
+
"image/gif",
|
|
30
|
+
"image/webp",
|
|
31
|
+
]);
|
|
32
|
+
|
|
33
|
+
type ImageMediaType = "image/jpeg" | "image/png" | "image/gif" | "image/webp";
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Build the message content array for the LLM description request.
|
|
37
|
+
* Attaches the file as an image or document block when possible.
|
|
38
|
+
*/
|
|
39
|
+
async function buildMessageContent(
|
|
40
|
+
opts: DescriberOpts,
|
|
41
|
+
): Promise<Anthropic.Messages.ContentBlockParam[]> {
|
|
42
|
+
const textPrompt = `Describe this file in one sentence. Be specific about what it contains, not generic.\n\nFilename: ${opts.filename}\nMIME type: ${opts.mimeType}`;
|
|
43
|
+
|
|
44
|
+
// Text file — include content inline
|
|
45
|
+
if (opts.content) {
|
|
46
|
+
const truncated =
|
|
47
|
+
opts.content.length > MAX_CONTENT_CHARS
|
|
48
|
+
? `${opts.content.slice(0, MAX_CONTENT_CHARS)}\n... (truncated)`
|
|
49
|
+
: opts.content;
|
|
50
|
+
return [{ type: "text", text: `${textPrompt}\n\nContent:\n${truncated}` }];
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// Binary file — try to attach if we have a file path
|
|
54
|
+
if (opts.filePath) {
|
|
55
|
+
const file = Bun.file(opts.filePath);
|
|
56
|
+
const size = file.size;
|
|
57
|
+
|
|
58
|
+
if (size > 0 && size <= MAX_FILE_BYTES) {
|
|
59
|
+
const data = Buffer.from(await file.arrayBuffer()).toString("base64");
|
|
60
|
+
|
|
61
|
+
if (IMAGE_TYPES.has(opts.mimeType)) {
|
|
62
|
+
return [
|
|
63
|
+
{
|
|
64
|
+
type: "image",
|
|
65
|
+
source: {
|
|
66
|
+
type: "base64",
|
|
67
|
+
media_type: opts.mimeType as ImageMediaType,
|
|
68
|
+
data,
|
|
69
|
+
},
|
|
70
|
+
},
|
|
71
|
+
{ type: "text", text: textPrompt },
|
|
72
|
+
];
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
if (opts.mimeType === "application/pdf") {
|
|
76
|
+
return [
|
|
77
|
+
{
|
|
78
|
+
type: "document",
|
|
79
|
+
source: { type: "base64", media_type: "application/pdf", data },
|
|
80
|
+
},
|
|
81
|
+
{ type: "text", text: textPrompt },
|
|
82
|
+
];
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// Fallback — describe from filename and MIME type only
|
|
88
|
+
return [
|
|
89
|
+
{
|
|
90
|
+
type: "text",
|
|
91
|
+
text: `${textPrompt}\n\n(Binary file — no content preview available)`,
|
|
92
|
+
},
|
|
93
|
+
];
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
interface DescriberOpts {
|
|
97
|
+
filename: string;
|
|
98
|
+
mimeType: string;
|
|
99
|
+
content: string | null;
|
|
100
|
+
filePath?: string;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Generate a short description of a file using the LLM.
|
|
105
|
+
* For textual files, summarises the content.
|
|
106
|
+
* For binary files, attaches images/PDFs directly or describes from metadata.
|
|
107
|
+
*/
|
|
108
|
+
export async function generateDescription(
|
|
109
|
+
config: Required<BotholomewConfig>,
|
|
110
|
+
opts: DescriberOpts,
|
|
111
|
+
): Promise<string> {
|
|
112
|
+
if (!config.anthropic_api_key) {
|
|
113
|
+
return "";
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
const client = new Anthropic({ apiKey: config.anthropic_api_key });
|
|
117
|
+
|
|
118
|
+
try {
|
|
119
|
+
const content = await buildMessageContent(opts);
|
|
120
|
+
|
|
121
|
+
const response = await Promise.race([
|
|
122
|
+
client.messages.create({
|
|
123
|
+
model: config.chunker_model,
|
|
124
|
+
max_tokens: 256,
|
|
125
|
+
tools: [DESCRIBE_TOOL],
|
|
126
|
+
tool_choice: { type: "tool", name: DESCRIBE_TOOL_NAME },
|
|
127
|
+
messages: [{ role: "user", content }],
|
|
128
|
+
}),
|
|
129
|
+
new Promise<never>((_, reject) =>
|
|
130
|
+
setTimeout(
|
|
131
|
+
() => reject(new Error("Description generation timeout")),
|
|
132
|
+
TIMEOUT_MS,
|
|
133
|
+
),
|
|
134
|
+
),
|
|
135
|
+
]);
|
|
136
|
+
|
|
137
|
+
const toolBlock = response.content.find((b) => b.type === "tool_use");
|
|
138
|
+
if (!toolBlock || toolBlock.type !== "tool_use") return "";
|
|
139
|
+
|
|
140
|
+
const input = toolBlock.input as { description: string };
|
|
141
|
+
return input.description || "";
|
|
142
|
+
} catch (err) {
|
|
143
|
+
logger.debug(`Description generation failed: ${err}`);
|
|
144
|
+
return "";
|
|
145
|
+
}
|
|
146
|
+
}
|