botholomew 0.9.11 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/package.json +15 -4
- package/src/chat/agent.ts +1 -1
- package/src/commands/context.ts +16 -6
- package/src/commands/db.ts +22 -11
- package/src/commands/prepare.ts +3 -7
- package/src/config/loader.ts +0 -3
- package/src/config/schemas.ts +2 -4
- package/src/constants.ts +2 -2
- package/src/context/embedder-impl.ts +29 -31
- package/src/context/ingest.ts +1 -10
- package/src/context/refresh.ts +2 -3
- package/src/db/doctor.ts +37 -9
- package/src/db/reembed.ts +113 -0
- package/src/db/schema.ts +7 -0
- package/src/db/sql/18-reset_embeddings_for_local.sql +39 -0
- package/src/tui/components/StatusBar.tsx +23 -15
- package/src/worker/llm.ts +19 -0
- package/src/worker/prompt.ts +1 -1
- package/src/worker/tick.ts +3 -0
package/README.md
CHANGED
|
@@ -88,9 +88,9 @@ bun run dev -- --help
|
|
|
88
88
|
# 1. Initialize a project in the current directory
|
|
89
89
|
botholomew init
|
|
90
90
|
|
|
91
|
-
# 2. Add your
|
|
91
|
+
# 2. Add your Anthropic key to .botholomew/config.json, or export it
|
|
92
92
|
export ANTHROPIC_API_KEY=sk-ant-...
|
|
93
|
-
|
|
93
|
+
# Embeddings run locally — no API key required.
|
|
94
94
|
|
|
95
95
|
# 3. Queue some work
|
|
96
96
|
botholomew task add "Summarize every markdown file in ~/notes"
|
|
@@ -144,7 +144,7 @@ Everything the agent can touch is here. No surprises.
|
|
|
144
144
|
| `botholomew chat` | Interactive Ink/React TUI |
|
|
145
145
|
| `botholomew task list\|add\|view\|update\|reset\|delete` | Manage the task queue |
|
|
146
146
|
| `botholomew schedule list\|add\|view\|enable\|disable\|trigger\|delete` | Recurring work |
|
|
147
|
-
| `botholomew context add\|list\|search\|chunks\|refresh\|delete` | Ingest & browse knowledge (files, folders, URLs); also exposes the agent's `read`/`write`/`tree`/`edit`/… tools as subcommands |
|
|
147
|
+
| `botholomew context add\|list\|search\|chunks\|refresh\|reembed\|delete` | Ingest & browse knowledge (files, folders, URLs); `reembed` rebuilds every vector after upgrading the embedding model; also exposes the agent's `read`/`write`/`tree`/`edit`/… tools as subcommands |
|
|
148
148
|
| `botholomew capabilities` | Rescan built-in + MCPX tools and rewrite `.botholomew/capabilities.md` |
|
|
149
149
|
| `botholomew mcpx servers\|list\|add\|remove\|info\|search\|exec\|ping\|auth\|deauth\|import-global\|…` | Configure external MCP servers (passthrough to `mcpx`) |
|
|
150
150
|
| `botholomew skill list\|show\|create\|validate` | Manage slash-command skills |
|
|
@@ -193,6 +193,8 @@ See [docs/architecture.md](docs/architecture.md) for a deeper tour.
|
|
|
193
193
|
|
|
194
194
|
## Deep dives
|
|
195
195
|
|
|
196
|
+
> The full docs site is published at **[www.botholomew.com](https://www.botholomew.com)**.
|
|
197
|
+
|
|
196
198
|
Topics worth understanding in detail:
|
|
197
199
|
|
|
198
200
|
- **[Architecture](docs/architecture.md)** — workers, chat, and how
|
|
@@ -234,8 +236,9 @@ Topics worth understanding in detail:
|
|
|
234
236
|
built-in FTS extension for BM25 keyword search
|
|
235
237
|
- **[Anthropic SDK](https://docs.anthropic.com/en/api/client-sdks)** for
|
|
236
238
|
Claude — the reasoning model
|
|
237
|
-
- **
|
|
238
|
-
|
|
239
|
+
- **[`@huggingface/transformers`](https://huggingface.co/docs/transformers.js)**
|
|
240
|
+
for local embeddings (default `Xenova/bge-small-en-v1.5`, 384-dim) —
|
|
241
|
+
no API key, weights cached on first run
|
|
239
242
|
- **[MCPX](https://github.com/evantahler/mcpx)** for external tools
|
|
240
243
|
- **[Ink 6](https://github.com/vadimdemedes/ink)** + **React 19** for the
|
|
241
244
|
terminal UI
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "botholomew",
|
|
3
|
-
"version": "0.9.11",
|
|
3
|
+
"version": "0.10.0",
|
|
4
4
|
"description": "An autonomous AI agent for knowledge work — works your task queue while you sleep.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -20,12 +20,16 @@
|
|
|
20
20
|
"dev:demo": "bun run src/cli.ts chat -p 'learn everything you can about me from the connected MCP services and then save what you'\\''ve learned about me to context'",
|
|
21
21
|
"test": "bun test",
|
|
22
22
|
"lint": "tsc --noEmit && biome check .",
|
|
23
|
-
"capture": "bun run scripts/capture.ts"
|
|
23
|
+
"capture": "bun run scripts/capture.ts",
|
|
24
|
+
"docs:dev": "vitepress dev docs",
|
|
25
|
+
"docs:build": "vitepress build docs",
|
|
26
|
+
"docs:preview": "vitepress preview docs"
|
|
24
27
|
},
|
|
25
28
|
"dependencies": {
|
|
26
29
|
"@anthropic-ai/sdk": "^0.88.0",
|
|
27
30
|
"@duckdb/node-api": "^1.5.2-r.1",
|
|
28
31
|
"@evantahler/mcpx": "0.18.6",
|
|
32
|
+
"@huggingface/transformers": "^4.2.0",
|
|
29
33
|
"ansis": "^4.2.0",
|
|
30
34
|
"commander": "^14.0.0",
|
|
31
35
|
"gray-matter": "^4.0.3",
|
|
@@ -43,6 +47,13 @@
|
|
|
43
47
|
"@types/bun": "latest",
|
|
44
48
|
"@types/react": "^19.1.0",
|
|
45
49
|
"@types/uuid": "^11.0.0",
|
|
46
|
-
"typescript": "^6.0.2"
|
|
47
|
-
|
|
50
|
+
"typescript": "^6.0.2",
|
|
51
|
+
"vitepress": "^1.5.0",
|
|
52
|
+
"vitepress-plugin-llms": "^1.12.1",
|
|
53
|
+
"vue": "^3.5.0"
|
|
54
|
+
},
|
|
55
|
+
"trustedDependencies": [
|
|
56
|
+
"onnxruntime-node",
|
|
57
|
+
"protobufjs"
|
|
58
|
+
]
|
|
48
59
|
}
|
package/src/chat/agent.ts
CHANGED
|
@@ -86,7 +86,7 @@ export async function buildChatSystemPrompt(
|
|
|
86
86
|
|
|
87
87
|
const dbPath = options?.dbPath;
|
|
88
88
|
const config = options?.config;
|
|
89
|
-
if (dbPath && config
|
|
89
|
+
if (dbPath && config && keywordSource) {
|
|
90
90
|
try {
|
|
91
91
|
const queryVec = await embedSingle(keywordSource, config);
|
|
92
92
|
const results = await withDb(dbPath, (conn) =>
|
package/src/commands/context.ts
CHANGED
|
@@ -6,6 +6,7 @@ import { isText } from "istextorbinary";
|
|
|
6
6
|
import { createSpinner } from "nanospinner";
|
|
7
7
|
import { loadConfig } from "../config/loader.ts";
|
|
8
8
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
9
|
+
import { getDbPath } from "../constants.ts";
|
|
9
10
|
import { generateDescription } from "../context/describer.ts";
|
|
10
11
|
import {
|
|
11
12
|
type DriveTarget,
|
|
@@ -36,6 +37,7 @@ import {
|
|
|
36
37
|
upsertContextItem,
|
|
37
38
|
} from "../db/context.ts";
|
|
38
39
|
import { getEmbeddingsForItem, hybridSearch } from "../db/embeddings.ts";
|
|
40
|
+
import { reembedMissingVectors } from "../db/reembed.ts";
|
|
39
41
|
import { createMcpxClient } from "../mcpx/client.ts";
|
|
40
42
|
import { logger } from "../utils/logger.ts";
|
|
41
43
|
import {
|
|
@@ -425,10 +427,7 @@ export function registerContextCommand(program: Command) {
|
|
|
425
427
|
|
|
426
428
|
skipped.push(...dedupSkipped);
|
|
427
429
|
|
|
428
|
-
if (itemIds.length === 0
|
|
429
|
-
if (!config.openai_api_key) {
|
|
430
|
-
logger.dim("Skipping embeddings (no OpenAI API key configured).");
|
|
431
|
-
}
|
|
430
|
+
if (itemIds.length === 0) {
|
|
432
431
|
const msg = buildSummary({
|
|
433
432
|
added: itemIds.length,
|
|
434
433
|
refreshed: refreshedCount,
|
|
@@ -693,12 +692,23 @@ export function registerContextCommand(program: Command) {
|
|
|
693
692
|
logger.success(
|
|
694
693
|
`Refreshed ${result.updated} item(s), ${result.chunks} chunk(s) re-indexed.`,
|
|
695
694
|
);
|
|
696
|
-
} else if (result.embeddings_skipped) {
|
|
697
|
-
logger.dim("Skipping embeddings (no OpenAI API key configured).");
|
|
698
695
|
}
|
|
699
696
|
}),
|
|
700
697
|
);
|
|
701
698
|
|
|
699
|
+
ctx
|
|
700
|
+
.command("reembed")
|
|
701
|
+
.description(
|
|
702
|
+
"Recompute every embedding using the configured local model. Run this after upgrading or after changing embedding_model.",
|
|
703
|
+
)
|
|
704
|
+
.action(() =>
|
|
705
|
+
withDb(program, async (_conn, dir) => {
|
|
706
|
+
const config = await loadConfig(dir);
|
|
707
|
+
const dbPath = getDbPath(dir);
|
|
708
|
+
await reembedMissingVectors(dbPath, config, { mode: "all" });
|
|
709
|
+
}),
|
|
710
|
+
);
|
|
711
|
+
|
|
702
712
|
registerContextToolSubcommands(ctx);
|
|
703
713
|
}
|
|
704
714
|
|
package/src/commands/db.ts
CHANGED
|
@@ -3,11 +3,12 @@ import type { Command } from "commander";
|
|
|
3
3
|
import { getDbPath } from "../constants.ts";
|
|
4
4
|
import { withDb as coreWithDb } from "../db/connection.ts";
|
|
5
5
|
import {
|
|
6
|
+
isPidAlive,
|
|
6
7
|
type ProbeResult,
|
|
7
8
|
probeAllTables,
|
|
8
9
|
repairDatabase,
|
|
9
10
|
} from "../db/doctor.ts";
|
|
10
|
-
import { listWorkers } from "../db/workers.ts";
|
|
11
|
+
import { listWorkers, type Worker } from "../db/workers.ts";
|
|
11
12
|
import { logger } from "../utils/logger.ts";
|
|
12
13
|
|
|
13
14
|
function statusBadge(status: ProbeResult["status"]): string {
|
|
@@ -78,28 +79,38 @@ async function doctor(program: Command, repair: boolean): Promise<void> {
|
|
|
78
79
|
process.exit(1);
|
|
79
80
|
}
|
|
80
81
|
|
|
81
|
-
// Repair requires exclusive access — refuse if any worker is
|
|
82
|
-
//
|
|
82
|
+
// Repair requires exclusive access — refuse if any worker is actually
|
|
83
|
+
// running, otherwise the EXPORT would race with the worker's writes.
|
|
84
|
+
// Stale `status='running'` rows whose PID is dead (the exact case that
|
|
85
|
+
// tends to coexist with workers-table corruption) are reported but do
|
|
86
|
+
// not block repair: trying to flip them to `stopped` would just trip
|
|
87
|
+
// the same corruption we're about to fix.
|
|
83
88
|
const running = await coreWithDb(dbPath, async (conn) => {
|
|
84
89
|
try {
|
|
85
90
|
return await listWorkers(conn, { status: "running" });
|
|
86
91
|
} catch {
|
|
87
|
-
|
|
88
|
-
// fall through and let repair proceed; the user is on their own
|
|
89
|
-
// for confirming no live workers, which `worker reap` would also
|
|
90
|
-
// be unable to do anyway.
|
|
91
|
-
return [];
|
|
92
|
+
return [] as Worker[];
|
|
92
93
|
}
|
|
93
94
|
});
|
|
94
|
-
|
|
95
|
+
const live = running.filter((w) => isPidAlive(w.pid));
|
|
96
|
+
const stale = running.filter((w) => !isPidAlive(w.pid));
|
|
97
|
+
if (live.length > 0) {
|
|
95
98
|
logger.error(
|
|
96
|
-
`${
|
|
99
|
+
`${live.length} worker(s) actually running. Stop them first: botholomew worker stop <id>`,
|
|
97
100
|
);
|
|
98
|
-
for (const w of
|
|
101
|
+
for (const w of live) {
|
|
99
102
|
logger.dim(` ${w.id} (pid ${w.pid}, mode=${w.mode})`);
|
|
100
103
|
}
|
|
101
104
|
process.exit(1);
|
|
102
105
|
}
|
|
106
|
+
if (stale.length > 0) {
|
|
107
|
+
logger.warn(
|
|
108
|
+
`${stale.length} worker row(s) marked 'running' but PID is dead — proceeding (rows will be carried through repair, then reapable):`,
|
|
109
|
+
);
|
|
110
|
+
for (const w of stale) {
|
|
111
|
+
logger.dim(` ${w.id} (pid ${w.pid}, mode=${w.mode})`);
|
|
112
|
+
}
|
|
113
|
+
}
|
|
103
114
|
|
|
104
115
|
logger.phase("repair", "EXPORT DATABASE → swap files → IMPORT DATABASE");
|
|
105
116
|
const result = await repairDatabase(dbPath);
|
package/src/commands/prepare.ts
CHANGED
|
@@ -12,14 +12,10 @@ export function registerPrepareCommand(program: Command) {
|
|
|
12
12
|
withDb(program, async (_conn, dir) => {
|
|
13
13
|
logger.info("Preparing Botholomew...");
|
|
14
14
|
const config = await loadConfig(dir);
|
|
15
|
-
if (!config.openai_api_key) {
|
|
16
|
-
logger.error(
|
|
17
|
-
"OpenAI API key not set. Set openai_api_key in config or OPENAI_API_KEY env var.",
|
|
18
|
-
);
|
|
19
|
-
process.exit(1);
|
|
20
|
-
}
|
|
21
15
|
await embedSingle("test", config);
|
|
22
|
-
logger.success(
|
|
16
|
+
logger.success(
|
|
17
|
+
`Embedding model ${config.embedding_model} is loaded and ready.`,
|
|
18
|
+
);
|
|
23
19
|
}),
|
|
24
20
|
);
|
|
25
21
|
}
|
package/src/config/loader.ts
CHANGED
|
@@ -19,9 +19,6 @@ export async function loadConfig(
|
|
|
19
19
|
if (process.env.ANTHROPIC_API_KEY) {
|
|
20
20
|
config.anthropic_api_key = process.env.ANTHROPIC_API_KEY;
|
|
21
21
|
}
|
|
22
|
-
if (process.env.OPENAI_API_KEY) {
|
|
23
|
-
config.openai_api_key = process.env.OPENAI_API_KEY;
|
|
24
|
-
}
|
|
25
22
|
|
|
26
23
|
setLogLevel(config.log_level);
|
|
27
24
|
|
package/src/config/schemas.ts
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
export interface BotholomewConfig {
|
|
2
2
|
anthropic_api_key?: string;
|
|
3
|
-
openai_api_key?: string;
|
|
4
3
|
model?: string;
|
|
5
4
|
chunker_model?: string;
|
|
6
5
|
embedding_model?: string;
|
|
@@ -20,11 +19,10 @@ export interface BotholomewConfig {
|
|
|
20
19
|
|
|
21
20
|
export const DEFAULT_CONFIG: Required<BotholomewConfig> = {
|
|
22
21
|
anthropic_api_key: "",
|
|
23
|
-
openai_api_key: "",
|
|
24
22
|
model: "claude-opus-4-20250514",
|
|
25
23
|
chunker_model: "claude-haiku-4-5-20251001",
|
|
26
|
-
embedding_model: "
|
|
27
|
-
embedding_dimension: 1536,
|
|
24
|
+
embedding_model: "Xenova/bge-small-en-v1.5",
|
|
25
|
+
embedding_dimension: 384,
|
|
28
26
|
tick_interval_seconds: 300,
|
|
29
27
|
max_tick_duration_seconds: 120,
|
|
30
28
|
system_prompt_override: "",
|
package/src/constants.ts
CHANGED
|
@@ -18,8 +18,8 @@ export const CONFIG_FILENAME = "config.json";
|
|
|
18
18
|
export const MCPX_DIR = "mcpx";
|
|
19
19
|
export const SKILLS_DIR = "skills";
|
|
20
20
|
export const MCPX_SERVERS_FILENAME = "servers.json";
|
|
21
|
-
export const EMBEDDING_DIMENSION = 1536;
|
|
22
|
-
export const EMBEDDING_MODEL = "
|
|
21
|
+
export const EMBEDDING_DIMENSION = 384;
|
|
22
|
+
export const EMBEDDING_MODEL = "Xenova/bge-small-en-v1.5";
|
|
23
23
|
|
|
24
24
|
export function getBotholomewDir(projectDir: string): string {
|
|
25
25
|
return join(projectDir, BOTHOLOMEW_DIR);
|
|
package/src/context/embedder-impl.ts
CHANGED
|
@@ -1,18 +1,36 @@
|
|
|
1
|
+
import {
|
|
2
|
+
type FeatureExtractionPipeline,
|
|
3
|
+
pipeline,
|
|
4
|
+
} from "@huggingface/transformers";
|
|
1
5
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
6
|
+
import { logger } from "../utils/logger.ts";
|
|
2
7
|
|
|
3
8
|
type EmbedFn = (
|
|
4
9
|
texts: string[],
|
|
5
10
|
config: Required<BotholomewConfig>,
|
|
6
11
|
) => Promise<number[][]>;
|
|
7
12
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
13
|
+
// Singleton pipeline keyed by model name. Loading the model is expensive
|
|
14
|
+
// (downloads weights on first run, then ~hundreds of ms to instantiate the
|
|
15
|
+
// ONNX runtime), so we hold one per model for the life of the process.
|
|
16
|
+
const pipelinePromises = new Map<string, Promise<FeatureExtractionPipeline>>();
|
|
17
|
+
|
|
18
|
+
async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
|
|
19
|
+
let p = pipelinePromises.get(model);
|
|
20
|
+
if (!p) {
|
|
21
|
+
logger.info(
|
|
22
|
+
`Loading embedding model ${model} (first run downloads weights)`,
|
|
23
|
+
);
|
|
24
|
+
p = pipeline("feature-extraction", model);
|
|
25
|
+
pipelinePromises.set(model, p);
|
|
26
|
+
}
|
|
27
|
+
return p;
|
|
11
28
|
}
|
|
12
29
|
|
|
13
30
|
/**
|
|
14
|
-
* Embed multiple texts using
|
|
15
|
-
* Returns an array of float vectors with the
|
|
31
|
+
* Embed multiple texts using a local @huggingface/transformers feature-extraction
|
|
32
|
+
* pipeline. Returns an array of L2-normalized float vectors with the model's
|
|
33
|
+
* native dimension (must match `config.embedding_dimension`).
|
|
16
34
|
*/
|
|
17
35
|
export async function embed(
|
|
18
36
|
texts: string[],
|
|
@@ -20,37 +38,17 @@ export async function embed(
|
|
|
20
38
|
): Promise<number[][]> {
|
|
21
39
|
if (texts.length === 0) return [];
|
|
22
40
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
);
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
const response = await fetch("https://api.openai.com/v1/embeddings", {
|
|
30
|
-
method: "POST",
|
|
31
|
-
headers: {
|
|
32
|
-
Authorization: `Bearer ${config.openai_api_key}`,
|
|
33
|
-
"Content-Type": "application/json",
|
|
34
|
-
},
|
|
35
|
-
body: JSON.stringify({
|
|
36
|
-
input: texts,
|
|
37
|
-
model: config.embedding_model,
|
|
38
|
-
dimensions: config.embedding_dimension,
|
|
39
|
-
}),
|
|
40
|
-
});
|
|
41
|
+
const extractor = await getPipeline(config.embedding_model);
|
|
42
|
+
const output = await extractor(texts, { pooling: "mean", normalize: true });
|
|
43
|
+
const data = output.tolist() as number[][];
|
|
41
44
|
|
|
42
|
-
if (
|
|
43
|
-
const body = await response.text();
|
|
45
|
+
if (data[0] && data[0].length !== config.embedding_dimension) {
|
|
44
46
|
throw new Error(
|
|
45
|
-
`
|
|
47
|
+
`Embedding model ${config.embedding_model} returned ${data[0].length}-dim vectors, but embedding_dimension is set to ${config.embedding_dimension}. Update embedding_dimension in config and re-embed.`,
|
|
46
48
|
);
|
|
47
49
|
}
|
|
48
50
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
// Sort by index to ensure order matches input
|
|
52
|
-
const sorted = result.data.sort((a, b) => a.index - b.index);
|
|
53
|
-
return sorted.map((d) => d.embedding);
|
|
51
|
+
return data;
|
|
54
52
|
}
|
|
55
53
|
|
|
56
54
|
/**
|
package/src/context/ingest.ts
CHANGED
|
@@ -44,16 +44,7 @@ export async function prepareIngestion(
|
|
|
44
44
|
return null;
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
-
|
|
48
|
-
const doEmbed =
|
|
49
|
-
embedFn ??
|
|
50
|
-
(config.openai_api_key
|
|
51
|
-
? (texts: string[]) => defaultEmbed(texts, config)
|
|
52
|
-
: null);
|
|
53
|
-
if (!doEmbed) {
|
|
54
|
-
logger.debug("ingest: skipping embeddings (no OpenAI API key configured)");
|
|
55
|
-
return null;
|
|
56
|
-
}
|
|
47
|
+
const doEmbed = embedFn ?? ((texts: string[]) => defaultEmbed(texts, config));
|
|
57
48
|
|
|
58
49
|
const chunks = await chunk(item.content, item.mime_type, config);
|
|
59
50
|
if (chunks.length === 0) return null;
|
package/src/context/refresh.ts
CHANGED
|
@@ -132,8 +132,7 @@ export async function refreshContextItems(
|
|
|
132
132
|
const unchanged = results.filter((r) => r.status === "unchanged").length;
|
|
133
133
|
const missing = results.filter((r) => r.status === "missing").length;
|
|
134
134
|
|
|
135
|
-
|
|
136
|
-
if (toReembed.length === 0 || !hasEmbedder) {
|
|
135
|
+
if (toReembed.length === 0) {
|
|
137
136
|
return {
|
|
138
137
|
checked: refreshable.length,
|
|
139
138
|
updated,
|
|
@@ -141,7 +140,7 @@ export async function refreshContextItems(
|
|
|
141
140
|
missing,
|
|
142
141
|
reembedded: 0,
|
|
143
142
|
chunks: 0,
|
|
144
|
-
embeddings_skipped:
|
|
143
|
+
embeddings_skipped: false,
|
|
145
144
|
items: results,
|
|
146
145
|
};
|
|
147
146
|
}
|
package/src/db/doctor.ts
CHANGED
|
@@ -85,12 +85,15 @@ export async function probeTable(
|
|
|
85
85
|
}
|
|
86
86
|
`;
|
|
87
87
|
|
|
88
|
+
// Discard the child's stderr. When the probe panics, Bun writes a multi-
|
|
89
|
+
// line crash banner there which would otherwise spill into our table
|
|
90
|
+
// output via the fallback message. The exit code alone tells us what we
|
|
91
|
+
// need to know.
|
|
88
92
|
const proc = Bun.spawn(["bun", "-e", script], {
|
|
89
|
-
stdio: ["ignore", "pipe", "
|
|
93
|
+
stdio: ["ignore", "pipe", "ignore"],
|
|
90
94
|
});
|
|
91
|
-
const [stdout,
|
|
95
|
+
const [stdout, exitCode] = await Promise.all([
|
|
92
96
|
new Response(proc.stdout).text(),
|
|
93
|
-
new Response(proc.stderr).text(),
|
|
94
97
|
proc.exited,
|
|
95
98
|
]);
|
|
96
99
|
|
|
@@ -103,20 +106,21 @@ export async function probeTable(
|
|
|
103
106
|
return {
|
|
104
107
|
table,
|
|
105
108
|
status: "missing",
|
|
106
|
-
message: stdout.slice("MISSING:".length),
|
|
109
|
+
message: firstLine(stdout.slice("MISSING:".length)),
|
|
107
110
|
};
|
|
108
111
|
}
|
|
109
112
|
if (stdout.startsWith("CORRUPT:")) {
|
|
110
113
|
return {
|
|
111
114
|
table,
|
|
112
115
|
status: "corrupt",
|
|
113
|
-
message: stdout.slice("CORRUPT:".length),
|
|
116
|
+
message: firstLine(stdout.slice("CORRUPT:".length)),
|
|
114
117
|
};
|
|
115
118
|
}
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
119
|
+
return {
|
|
120
|
+
table,
|
|
121
|
+
status: "corrupt",
|
|
122
|
+
message: `child exited with code ${exitCode} (likely native panic)`,
|
|
123
|
+
};
|
|
120
124
|
}
|
|
121
125
|
|
|
122
126
|
/**
|
|
@@ -212,3 +216,27 @@ async function pathExists(p: string): Promise<boolean> {
|
|
|
212
216
|
return false;
|
|
213
217
|
}
|
|
214
218
|
}
|
|
219
|
+
|
|
220
|
+
function firstLine(s: string): string {
|
|
221
|
+
const trimmed = s.trim();
|
|
222
|
+
const nl = trimmed.indexOf("\n");
|
|
223
|
+
return nl === -1 ? trimmed : trimmed.slice(0, nl);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/**
|
|
227
|
+
* Send signal 0 to test whether `pid` corresponds to a live process. Returns
|
|
228
|
+
* false on ESRCH (no such process) and on any other error (including EPERM,
|
|
229
|
+
* which we conservatively treat as "not ours, not relevant"). Used by the
|
|
230
|
+
* doctor's safety gate to distinguish workers actually running from rows
|
|
231
|
+
* that say `status = 'running'` because the worker crashed before flipping
|
|
232
|
+
* its row to `stopped` or `dead`.
|
|
233
|
+
*/
|
|
234
|
+
export function isPidAlive(pid: number): boolean {
|
|
235
|
+
if (!pid || pid < 1) return false;
|
|
236
|
+
try {
|
|
237
|
+
process.kill(pid, 0);
|
|
238
|
+
return true;
|
|
239
|
+
} catch {
|
|
240
|
+
return false;
|
|
241
|
+
}
|
|
242
|
+
}
|
|
package/src/db/reembed.ts
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
2
|
+
import { embed } from "../context/embedder.ts";
|
|
3
|
+
import { logger } from "../utils/logger.ts";
|
|
4
|
+
import { withDb } from "./connection.ts";
|
|
5
|
+
import { rebuildSearchIndex } from "./embeddings.ts";
|
|
6
|
+
|
|
7
|
+
interface PendingRow {
|
|
8
|
+
id: string;
|
|
9
|
+
chunk_content: string | null;
|
|
10
|
+
title: string;
|
|
11
|
+
description: string;
|
|
12
|
+
drive: string | null;
|
|
13
|
+
path: string | null;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
const BATCH_SIZE = 32;
|
|
17
|
+
|
|
18
|
+
function buildEmbeddingInput(row: PendingRow): string {
|
|
19
|
+
const parts: string[] = [];
|
|
20
|
+
if (row.title) parts.push(`Title: ${row.title}`);
|
|
21
|
+
if (row.description) parts.push(`Description: ${row.description}`);
|
|
22
|
+
if (row.drive && row.path) parts.push(`Source: ${row.drive}:${row.path}`);
|
|
23
|
+
if (row.chunk_content) parts.push(row.chunk_content);
|
|
24
|
+
return parts.join("\n");
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
interface ReembedOptions {
|
|
28
|
+
/**
|
|
29
|
+
* `"missing"` (default) — only re-embed rows where `embedding IS NULL`.
|
|
30
|
+
* `"all"` — re-embed every row, including ones that already have a vector.
|
|
31
|
+
* Use this after changing `embedding_model` so old vectors don't
|
|
32
|
+
* sit alongside new ones in a different space.
|
|
33
|
+
*/
|
|
34
|
+
mode?: "missing" | "all";
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Recompute embeddings for rows in the embeddings table.
|
|
39
|
+
*
|
|
40
|
+
* Default mode (`"missing"`) only touches NULL rows — the case after migration
|
|
41
|
+
* 18 leaves existing rows with no vector. The `context reembed` CLI command
|
|
42
|
+
* passes `mode: "all"` to force a full rebuild after the user changes
|
|
43
|
+
* `embedding_model`.
|
|
44
|
+
*
|
|
45
|
+
* Each batch is its own withDb so the file lock releases between embedding
|
|
46
|
+
* calls — long sweeps don't block other workers from acquiring the DB.
|
|
47
|
+
*/
|
|
48
|
+
export async function reembedMissingVectors(
|
|
49
|
+
dbPath: string,
|
|
50
|
+
config: Required<BotholomewConfig>,
|
|
51
|
+
options: ReembedOptions = {},
|
|
52
|
+
): Promise<void> {
|
|
53
|
+
const mode = options.mode ?? "missing";
|
|
54
|
+
const filter = mode === "all" ? "" : "WHERE embedding IS NULL";
|
|
55
|
+
|
|
56
|
+
const total = await withDb(dbPath, async (conn) => {
|
|
57
|
+
const row = await conn.queryGet<{ count: number }>(
|
|
58
|
+
`SELECT count(*)::INTEGER AS count FROM embeddings ${filter}`,
|
|
59
|
+
);
|
|
60
|
+
return row?.count ?? 0;
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
if (total === 0) {
|
|
64
|
+
logger.info("No embeddings to recompute.");
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
logger.info(
|
|
69
|
+
`re-embedding ${total} row${total === 1 ? "" : "s"} with model ${config.embedding_model}`,
|
|
70
|
+
);
|
|
71
|
+
|
|
72
|
+
let processed = 0;
|
|
73
|
+
while (processed < total) {
|
|
74
|
+
const batch = await withDb(dbPath, async (conn) => {
|
|
75
|
+
const offsetClause = mode === "all" ? `LIMIT ?1 OFFSET ?2` : `LIMIT ?1`;
|
|
76
|
+
const sql = `SELECT e.id, e.chunk_content, e.title, e.description, ci.drive, ci.path
|
|
77
|
+
FROM embeddings e
|
|
78
|
+
LEFT JOIN context_items ci ON ci.id = e.context_item_id
|
|
79
|
+
${filter}
|
|
80
|
+
ORDER BY e.id
|
|
81
|
+
${offsetClause}`;
|
|
82
|
+
return mode === "all"
|
|
83
|
+
? conn.queryAll<PendingRow>(sql, BATCH_SIZE, processed)
|
|
84
|
+
: conn.queryAll<PendingRow>(sql, BATCH_SIZE);
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
if (batch.length === 0) break;
|
|
88
|
+
|
|
89
|
+
const inputs = batch.map(buildEmbeddingInput);
|
|
90
|
+
const vectors = await embed(inputs, config);
|
|
91
|
+
|
|
92
|
+
await withDb(dbPath, async (conn) => {
|
|
93
|
+
for (let i = 0; i < batch.length; i++) {
|
|
94
|
+
const row = batch[i];
|
|
95
|
+
const vec = vectors[i];
|
|
96
|
+
if (!row || !vec) continue;
|
|
97
|
+
await conn.queryRun(
|
|
98
|
+
`UPDATE embeddings
|
|
99
|
+
SET embedding = ?1::FLOAT[${config.embedding_dimension}]
|
|
100
|
+
WHERE id = ?2`,
|
|
101
|
+
vec,
|
|
102
|
+
row.id,
|
|
103
|
+
);
|
|
104
|
+
}
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
processed += batch.length;
|
|
108
|
+
logger.info(` re-embedded ${processed}/${total}`);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
await withDb(dbPath, (conn) => rebuildSearchIndex(conn));
|
|
112
|
+
logger.success(`re-embed complete (${processed} rows)`);
|
|
113
|
+
}
|
package/src/db/schema.ts
CHANGED
|
@@ -2,6 +2,7 @@ import { readdirSync, readFileSync } from "node:fs";
|
|
|
2
2
|
import { join } from "node:path";
|
|
3
3
|
import { logger } from "../utils/logger.ts";
|
|
4
4
|
import type { DbConnection } from "./connection.ts";
|
|
5
|
+
import { rebuildSearchIndex } from "./embeddings.ts";
|
|
5
6
|
|
|
6
7
|
interface Migration {
|
|
7
8
|
id: number;
|
|
@@ -83,4 +84,10 @@ export async function migrate(db: DbConnection): Promise<void> {
|
|
|
83
84
|
if (appliedAny) {
|
|
84
85
|
await db.exec("CHECKPOINT");
|
|
85
86
|
}
|
|
87
|
+
|
|
88
|
+
// Ensure the FTS index exists. Migration 18 drops it (it can't recreate it
|
|
89
|
+
// in the same SQL run without DuckDB rejecting the dependency commit), and
|
|
90
|
+
// fresh DBs need it created at least once. `overwrite = 1` makes this
|
|
91
|
+
// idempotent for DBs that already have a healthy FTS index.
|
|
92
|
+
await rebuildSearchIndex(db);
|
|
86
93
|
}
|
|
package/src/db/sql/18-reset_embeddings_for_local.sql
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
-- Switch from OpenAI 1536-dim embeddings to local 384-dim embeddings.
|
|
2
|
+
--
|
|
3
|
+
-- DuckDB encodes array dimension in the column type, so we rebuild the
|
|
4
|
+
-- embeddings table preserving every row's metadata (chunk_content, title,
|
|
5
|
+
-- description, context_item_id, chunk_index, created_at). The vectors
|
|
6
|
+
-- themselves are NULLed and repopulated by `botholomew context reembed`
|
|
7
|
+
-- using the locally-loaded embedding model.
|
|
8
|
+
--
|
|
9
|
+
-- Idempotency: every destructive step uses IF EXISTS so a partial prior
|
|
10
|
+
-- run can be re-attempted cleanly. The FTS index is dropped here but NOT
|
|
11
|
+
-- recreated — `migrate()` calls rebuildSearchIndex once after all SQL
|
|
12
|
+
-- migrations apply, which avoids a same-migration drop-then-create that
|
|
13
|
+
-- DuckDB rejects with "Could not commit creation of dependency, subject
|
|
14
|
+
-- 'stopwords' has been deleted".
|
|
15
|
+
|
|
16
|
+
DROP SCHEMA IF EXISTS fts_main_embeddings CASCADE;
|
|
17
|
+
|
|
18
|
+
DROP TABLE IF EXISTS embeddings_new;
|
|
19
|
+
|
|
20
|
+
CREATE TABLE embeddings_new (
|
|
21
|
+
id TEXT PRIMARY KEY,
|
|
22
|
+
context_item_id TEXT NOT NULL,
|
|
23
|
+
chunk_index INTEGER NOT NULL,
|
|
24
|
+
chunk_content TEXT,
|
|
25
|
+
title TEXT NOT NULL,
|
|
26
|
+
description TEXT NOT NULL DEFAULT '',
|
|
27
|
+
embedding FLOAT[384],
|
|
28
|
+
created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
|
|
29
|
+
UNIQUE(context_item_id, chunk_index)
|
|
30
|
+
);
|
|
31
|
+
|
|
32
|
+
INSERT INTO embeddings_new (id, context_item_id, chunk_index, chunk_content, title, description, embedding, created_at)
|
|
33
|
+
SELECT id, context_item_id, chunk_index, chunk_content, title, description, NULL, created_at
|
|
34
|
+
FROM embeddings;
|
|
35
|
+
|
|
36
|
+
DROP TABLE embeddings;
|
|
37
|
+
ALTER TABLE embeddings_new RENAME TO embeddings;
|
|
38
|
+
|
|
39
|
+
CHECKPOINT;
|
|
package/src/tui/components/StatusBar.tsx
CHANGED
|
@@ -33,22 +33,30 @@ export function StatusBar({
|
|
|
33
33
|
useEffect(() => {
|
|
34
34
|
let mounted = true;
|
|
35
35
|
|
|
36
|
+
// Errors here (e.g. transient DuckDB lock conflicts while a freshly
|
|
37
|
+
// spawned worker is migrating) must not freeze the count — the next
|
|
38
|
+
// interval tick will retry. Swallow silently rather than logging
|
|
39
|
+
// because logger writes to stdout and would corrupt the Ink render.
|
|
36
40
|
const refresh = async () => {
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
41
|
+
try {
|
|
42
|
+
const [pending, inProgress, workers] = await withDb(
|
|
43
|
+
dbPath,
|
|
44
|
+
async (conn) => [
|
|
45
|
+
await listTasks(conn, { status: "pending" }),
|
|
46
|
+
await listTasks(conn, { status: "in_progress" }),
|
|
47
|
+
await listWorkers(conn, { status: "running" }),
|
|
48
|
+
],
|
|
49
|
+
);
|
|
50
|
+
if (mounted) {
|
|
51
|
+
setStatus({
|
|
52
|
+
workerCount: workers.length,
|
|
53
|
+
pendingCount: pending.length,
|
|
54
|
+
inProgressCount: inProgress.length,
|
|
55
|
+
});
|
|
56
|
+
onWorkerStatusChange?.(workers.length > 0);
|
|
57
|
+
}
|
|
58
|
+
} catch {
|
|
59
|
+
// Keep prior state; next tick will retry.
|
|
52
60
|
}
|
|
53
61
|
};
|
|
54
62
|
|
package/src/worker/llm.ts
CHANGED
|
@@ -11,12 +11,17 @@ import { getTask, type Task } from "../db/tasks.ts";
|
|
|
11
11
|
import { logInteraction } from "../db/threads.ts";
|
|
12
12
|
import { registerAllTools } from "../tools/registry.ts";
|
|
13
13
|
import { getTool, type ToolContext, toAnthropicTools } from "../tools/tool.ts";
|
|
14
|
+
import { logger } from "../utils/logger.ts";
|
|
14
15
|
import { fitToContextWindow, getMaxInputTokens } from "./context.ts";
|
|
15
16
|
import { clearLargeResults, maybeStoreResult } from "./large-results.ts";
|
|
16
17
|
import { createLlmClient } from "./llm-client.ts";
|
|
17
18
|
|
|
18
19
|
registerAllTools();
|
|
19
20
|
|
|
21
|
+
function truncate(s: string, max: number): string {
|
|
22
|
+
return s.length > max ? `${s.slice(0, max)}…` : s;
|
|
23
|
+
}
|
|
24
|
+
|
|
20
25
|
export interface WorkerStreamCallbacks {
|
|
21
26
|
onToken: (text: string) => void;
|
|
22
27
|
onToolStart: (name: string, input: string) => void;
|
|
@@ -153,6 +158,9 @@ export async function runAgentLoop(input: {
|
|
|
153
158
|
tokenCount,
|
|
154
159
|
}),
|
|
155
160
|
);
|
|
161
|
+
if (!callbacks) {
|
|
162
|
+
logger.phase("assistant", block.text);
|
|
163
|
+
}
|
|
156
164
|
}
|
|
157
165
|
}
|
|
158
166
|
|
|
@@ -175,6 +183,12 @@ export async function runAgentLoop(input: {
|
|
|
175
183
|
for (const toolUse of toolUseBlocks) {
|
|
176
184
|
const toolInput = JSON.stringify(toolUse.input);
|
|
177
185
|
callbacks?.onToolStart(toolUse.name, toolInput);
|
|
186
|
+
if (!callbacks) {
|
|
187
|
+
logger.phase(
|
|
188
|
+
"tool-call",
|
|
189
|
+
`${toolUse.name} ${truncate(toolInput, 200)}`,
|
|
190
|
+
);
|
|
191
|
+
}
|
|
178
192
|
await withDb(dbPath, (conn) =>
|
|
179
193
|
logInteraction(conn, threadId, {
|
|
180
194
|
role: "assistant",
|
|
@@ -222,6 +236,11 @@ export async function runAgentLoop(input: {
|
|
|
222
236
|
durationMs,
|
|
223
237
|
}),
|
|
224
238
|
);
|
|
239
|
+
if (!callbacks) {
|
|
240
|
+
const seconds = (durationMs / 1000).toFixed(1);
|
|
241
|
+
const status = result.isError ? "err" : "ok";
|
|
242
|
+
logger.phase("tool-result", `${toolUse.name} ${status} in ${seconds}s`);
|
|
243
|
+
}
|
|
225
244
|
|
|
226
245
|
if (result.terminal && result.agentResult) {
|
|
227
246
|
return result.agentResult;
|
package/src/worker/prompt.ts
CHANGED
|
@@ -104,7 +104,7 @@ export async function buildSystemPrompt(
|
|
|
104
104
|
|
|
105
105
|
prompt += await loadPersistentContext(projectDir, taskKeywords);
|
|
106
106
|
|
|
107
|
-
if (task && dbPath && _config
|
|
107
|
+
if (task && dbPath && _config) {
|
|
108
108
|
try {
|
|
109
109
|
const query = `${task.name} ${task.description}`;
|
|
110
110
|
const queryVec = await embedSingle(query, _config);
|
package/src/worker/tick.ts
CHANGED
|
@@ -133,6 +133,9 @@ async function runClaimedTask(opts: {
|
|
|
133
133
|
const { projectDir, dbPath, config, mcpxClient, callbacks, task } = opts;
|
|
134
134
|
|
|
135
135
|
logger.info(`Claimed task: ${task.name} (${task.id})`);
|
|
136
|
+
if (!callbacks && task.description) {
|
|
137
|
+
logger.dim(task.description);
|
|
138
|
+
}
|
|
136
139
|
callbacks?.onTaskStart(task);
|
|
137
140
|
|
|
138
141
|
const threadId = await withDb(dbPath, (conn) =>
|