botholomew 0.9.12 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/package.json +15 -4
- package/src/chat/agent.ts +1 -1
- package/src/commands/context.ts +16 -6
- package/src/commands/prepare.ts +3 -7
- package/src/config/loader.ts +8 -4
- package/src/config/schemas.ts +3 -5
- package/src/constants.ts +10 -2
- package/src/context/embedder-impl.ts +44 -31
- package/src/context/ingest.ts +1 -10
- package/src/context/refresh.ts +2 -3
- package/src/db/reembed.ts +113 -0
- package/src/db/schema.ts +7 -0
- package/src/db/sql/18-reset_embeddings_for_local.sql +39 -0
- package/src/init/templates.ts +3 -4
- package/src/tui/components/StatusBar.tsx +23 -15
- package/src/worker/prompt.ts +1 -1
package/README.md
CHANGED
|
@@ -88,9 +88,9 @@ bun run dev -- --help
|
|
|
88
88
|
# 1. Initialize a project in the current directory
|
|
89
89
|
botholomew init
|
|
90
90
|
|
|
91
|
-
# 2. Add your
|
|
91
|
+
# 2. Add your Anthropic key to .botholomew/config.json, or export it
|
|
92
92
|
export ANTHROPIC_API_KEY=sk-ant-...
|
|
93
|
-
|
|
93
|
+
# Embeddings run locally — no API key required.
|
|
94
94
|
|
|
95
95
|
# 3. Queue some work
|
|
96
96
|
botholomew task add "Summarize every markdown file in ~/notes"
|
|
@@ -144,7 +144,7 @@ Everything the agent can touch is here. No surprises.
|
|
|
144
144
|
| `botholomew chat` | Interactive Ink/React TUI |
|
|
145
145
|
| `botholomew task list\|add\|view\|update\|reset\|delete` | Manage the task queue |
|
|
146
146
|
| `botholomew schedule list\|add\|view\|enable\|disable\|trigger\|delete` | Recurring work |
|
|
147
|
-
| `botholomew context add\|list\|search\|chunks\|refresh\|delete` | Ingest & browse knowledge (files, folders, URLs); also exposes the agent's `read`/`write`/`tree`/`edit`/… tools as subcommands |
|
|
147
|
+
| `botholomew context add\|list\|search\|chunks\|refresh\|reembed\|delete` | Ingest & browse knowledge (files, folders, URLs); `reembed` rebuilds every vector after upgrading the embedding model; also exposes the agent's `read`/`write`/`tree`/`edit`/… tools as subcommands |
|
|
148
148
|
| `botholomew capabilities` | Rescan built-in + MCPX tools and rewrite `.botholomew/capabilities.md` |
|
|
149
149
|
| `botholomew mcpx servers\|list\|add\|remove\|info\|search\|exec\|ping\|auth\|deauth\|import-global\|…` | Configure external MCP servers (passthrough to `mcpx`) |
|
|
150
150
|
| `botholomew skill list\|show\|create\|validate` | Manage slash-command skills |
|
|
@@ -193,6 +193,8 @@ See [docs/architecture.md](docs/architecture.md) for a deeper tour.
|
|
|
193
193
|
|
|
194
194
|
## Deep dives
|
|
195
195
|
|
|
196
|
+
> The full docs site is published at **[www.botholomew.com](https://www.botholomew.com)**.
|
|
197
|
+
|
|
196
198
|
Topics worth understanding in detail:
|
|
197
199
|
|
|
198
200
|
- **[Architecture](docs/architecture.md)** — workers, chat, and how
|
|
@@ -234,8 +236,9 @@ Topics worth understanding in detail:
|
|
|
234
236
|
built-in FTS extension for BM25 keyword search
|
|
235
237
|
- **[Anthropic SDK](https://docs.anthropic.com/en/api/client-sdks)** for
|
|
236
238
|
Claude — the reasoning model
|
|
237
|
-
- **
|
|
238
|
-
|
|
239
|
+
- **[`@huggingface/transformers`](https://huggingface.co/docs/transformers.js)**
|
|
240
|
+
for local embeddings (default `Xenova/bge-small-en-v1.5`, 384-dim) —
|
|
241
|
+
no API key, weights cached on first run
|
|
239
242
|
- **[MCPX](https://github.com/evantahler/mcpx)** for external tools
|
|
240
243
|
- **[Ink 6](https://github.com/vadimdemedes/ink)** + **React 19** for the
|
|
241
244
|
terminal UI
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "botholomew",
|
|
3
|
-
"version": "0.9.12",
|
|
3
|
+
"version": "0.10.1",
|
|
4
4
|
"description": "An autonomous AI agent for knowledge work — works your task queue while you sleep.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -20,12 +20,16 @@
|
|
|
20
20
|
"dev:demo": "bun run src/cli.ts chat -p 'learn everything you can about me from the connected MCP services and then save what you'\\''ve learned about me to context'",
|
|
21
21
|
"test": "bun test",
|
|
22
22
|
"lint": "tsc --noEmit && biome check .",
|
|
23
|
-
"capture": "bun run scripts/capture.ts"
|
|
23
|
+
"capture": "bun run scripts/capture.ts",
|
|
24
|
+
"docs:dev": "vitepress dev docs",
|
|
25
|
+
"docs:build": "vitepress build docs",
|
|
26
|
+
"docs:preview": "vitepress preview docs"
|
|
24
27
|
},
|
|
25
28
|
"dependencies": {
|
|
26
29
|
"@anthropic-ai/sdk": "^0.88.0",
|
|
27
30
|
"@duckdb/node-api": "^1.5.2-r.1",
|
|
28
31
|
"@evantahler/mcpx": "0.18.6",
|
|
32
|
+
"@huggingface/transformers": "^4.2.0",
|
|
29
33
|
"ansis": "^4.2.0",
|
|
30
34
|
"commander": "^14.0.0",
|
|
31
35
|
"gray-matter": "^4.0.3",
|
|
@@ -43,6 +47,13 @@
|
|
|
43
47
|
"@types/bun": "latest",
|
|
44
48
|
"@types/react": "^19.1.0",
|
|
45
49
|
"@types/uuid": "^11.0.0",
|
|
46
|
-
"typescript": "^6.0.2"
|
|
47
|
-
|
|
50
|
+
"typescript": "^6.0.2",
|
|
51
|
+
"vitepress": "^1.5.0",
|
|
52
|
+
"vitepress-plugin-llms": "^1.12.1",
|
|
53
|
+
"vue": "^3.5.0"
|
|
54
|
+
},
|
|
55
|
+
"trustedDependencies": [
|
|
56
|
+
"onnxruntime-node",
|
|
57
|
+
"protobufjs"
|
|
58
|
+
]
|
|
48
59
|
}
|
package/src/chat/agent.ts
CHANGED
|
@@ -86,7 +86,7 @@ export async function buildChatSystemPrompt(
|
|
|
86
86
|
|
|
87
87
|
const dbPath = options?.dbPath;
|
|
88
88
|
const config = options?.config;
|
|
89
|
-
if (dbPath && config
|
|
89
|
+
if (dbPath && config && keywordSource) {
|
|
90
90
|
try {
|
|
91
91
|
const queryVec = await embedSingle(keywordSource, config);
|
|
92
92
|
const results = await withDb(dbPath, (conn) =>
|
package/src/commands/context.ts
CHANGED
|
@@ -6,6 +6,7 @@ import { isText } from "istextorbinary";
|
|
|
6
6
|
import { createSpinner } from "nanospinner";
|
|
7
7
|
import { loadConfig } from "../config/loader.ts";
|
|
8
8
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
9
|
+
import { getDbPath } from "../constants.ts";
|
|
9
10
|
import { generateDescription } from "../context/describer.ts";
|
|
10
11
|
import {
|
|
11
12
|
type DriveTarget,
|
|
@@ -36,6 +37,7 @@ import {
|
|
|
36
37
|
upsertContextItem,
|
|
37
38
|
} from "../db/context.ts";
|
|
38
39
|
import { getEmbeddingsForItem, hybridSearch } from "../db/embeddings.ts";
|
|
40
|
+
import { reembedMissingVectors } from "../db/reembed.ts";
|
|
39
41
|
import { createMcpxClient } from "../mcpx/client.ts";
|
|
40
42
|
import { logger } from "../utils/logger.ts";
|
|
41
43
|
import {
|
|
@@ -425,10 +427,7 @@ export function registerContextCommand(program: Command) {
|
|
|
425
427
|
|
|
426
428
|
skipped.push(...dedupSkipped);
|
|
427
429
|
|
|
428
|
-
if (itemIds.length === 0
|
|
429
|
-
if (!config.openai_api_key) {
|
|
430
|
-
logger.dim("Skipping embeddings (no OpenAI API key configured).");
|
|
431
|
-
}
|
|
430
|
+
if (itemIds.length === 0) {
|
|
432
431
|
const msg = buildSummary({
|
|
433
432
|
added: itemIds.length,
|
|
434
433
|
refreshed: refreshedCount,
|
|
@@ -693,12 +692,23 @@ export function registerContextCommand(program: Command) {
|
|
|
693
692
|
logger.success(
|
|
694
693
|
`Refreshed ${result.updated} item(s), ${result.chunks} chunk(s) re-indexed.`,
|
|
695
694
|
);
|
|
696
|
-
} else if (result.embeddings_skipped) {
|
|
697
|
-
logger.dim("Skipping embeddings (no OpenAI API key configured).");
|
|
698
695
|
}
|
|
699
696
|
}),
|
|
700
697
|
);
|
|
701
698
|
|
|
699
|
+
ctx
|
|
700
|
+
.command("reembed")
|
|
701
|
+
.description(
|
|
702
|
+
"Recompute every embedding using the configured local model. Run this after upgrading or after changing embedding_model.",
|
|
703
|
+
)
|
|
704
|
+
.action(() =>
|
|
705
|
+
withDb(program, async (_conn, dir) => {
|
|
706
|
+
const config = await loadConfig(dir);
|
|
707
|
+
const dbPath = getDbPath(dir);
|
|
708
|
+
await reembedMissingVectors(dbPath, config, { mode: "all" });
|
|
709
|
+
}),
|
|
710
|
+
);
|
|
711
|
+
|
|
702
712
|
registerContextToolSubcommands(ctx);
|
|
703
713
|
}
|
|
704
714
|
|
package/src/commands/prepare.ts
CHANGED
|
@@ -12,14 +12,10 @@ export function registerPrepareCommand(program: Command) {
|
|
|
12
12
|
withDb(program, async (_conn, dir) => {
|
|
13
13
|
logger.info("Preparing Botholomew...");
|
|
14
14
|
const config = await loadConfig(dir);
|
|
15
|
-
if (!config.openai_api_key) {
|
|
16
|
-
logger.error(
|
|
17
|
-
"OpenAI API key not set. Set openai_api_key in config or OPENAI_API_KEY env var.",
|
|
18
|
-
);
|
|
19
|
-
process.exit(1);
|
|
20
|
-
}
|
|
21
15
|
await embedSingle("test", config);
|
|
22
|
-
logger.success(
|
|
16
|
+
logger.success(
|
|
17
|
+
`Embedding model ${config.embedding_model} is loaded and ready.`,
|
|
18
|
+
);
|
|
23
19
|
}),
|
|
24
20
|
);
|
|
25
21
|
}
|
package/src/config/loader.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { mkdirSync } from "node:fs";
|
|
2
|
+
import { getConfigPath, getModelsDir } from "../constants.ts";
|
|
2
3
|
import { setLogLevel } from "../utils/logger.ts";
|
|
3
4
|
import { type BotholomewConfig, DEFAULT_CONFIG } from "./schemas.ts";
|
|
4
5
|
|
|
@@ -19,12 +20,15 @@ export async function loadConfig(
|
|
|
19
20
|
if (process.env.ANTHROPIC_API_KEY) {
|
|
20
21
|
config.anthropic_api_key = process.env.ANTHROPIC_API_KEY;
|
|
21
22
|
}
|
|
22
|
-
if (process.env.OPENAI_API_KEY) {
|
|
23
|
-
config.openai_api_key = process.env.OPENAI_API_KEY;
|
|
24
|
-
}
|
|
25
23
|
|
|
26
24
|
setLogLevel(config.log_level);
|
|
27
25
|
|
|
26
|
+
const modelsDir = getModelsDir(projectDir);
|
|
27
|
+
mkdirSync(modelsDir, { recursive: true });
|
|
28
|
+
// Dynamic import keeps @huggingface/transformers (heavy, pulls ONNX runtime) out of commands that never embed.
|
|
29
|
+
const { setEmbeddingCacheDir } = await import("../context/embedder-impl.ts");
|
|
30
|
+
setEmbeddingCacheDir(modelsDir);
|
|
31
|
+
|
|
28
32
|
return config;
|
|
29
33
|
}
|
|
30
34
|
|
package/src/config/schemas.ts
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
export interface BotholomewConfig {
|
|
2
2
|
anthropic_api_key?: string;
|
|
3
|
-
openai_api_key?: string;
|
|
4
3
|
model?: string;
|
|
5
4
|
chunker_model?: string;
|
|
6
5
|
embedding_model?: string;
|
|
@@ -20,11 +19,10 @@ export interface BotholomewConfig {
|
|
|
20
19
|
|
|
21
20
|
export const DEFAULT_CONFIG: Required<BotholomewConfig> = {
|
|
22
21
|
anthropic_api_key: "",
|
|
23
|
-
|
|
24
|
-
model: "claude-opus-4-20250514",
|
|
22
|
+
model: "claude-opus-4-6",
|
|
25
23
|
chunker_model: "claude-haiku-4-5-20251001",
|
|
26
|
-
embedding_model: "
|
|
27
|
-
embedding_dimension: 1536,
|
|
24
|
+
embedding_model: "Xenova/bge-small-en-v1.5",
|
|
25
|
+
embedding_dimension: 384,
|
|
28
26
|
tick_interval_seconds: 300,
|
|
29
27
|
max_tick_duration_seconds: 120,
|
|
30
28
|
system_prompt_override: "",
|
package/src/constants.ts
CHANGED
|
@@ -16,10 +16,11 @@ export const DB_FILENAME = "data.duckdb";
|
|
|
16
16
|
export const LOGS_DIR = "logs";
|
|
17
17
|
export const CONFIG_FILENAME = "config.json";
|
|
18
18
|
export const MCPX_DIR = "mcpx";
|
|
19
|
+
export const MODELS_DIR = "models";
|
|
19
20
|
export const SKILLS_DIR = "skills";
|
|
20
21
|
export const MCPX_SERVERS_FILENAME = "servers.json";
|
|
21
|
-
export const EMBEDDING_DIMENSION = 1536;
|
|
22
|
-
export const EMBEDDING_MODEL = "
|
|
22
|
+
export const EMBEDDING_DIMENSION = 384;
|
|
23
|
+
export const EMBEDDING_MODEL = "Xenova/bge-small-en-v1.5";
|
|
23
24
|
|
|
24
25
|
export function getBotholomewDir(projectDir: string): string {
|
|
25
26
|
return join(projectDir, BOTHOLOMEW_DIR);
|
|
@@ -45,6 +46,13 @@ export function getMcpxDir(projectDir: string): string {
|
|
|
45
46
|
return join(projectDir, BOTHOLOMEW_DIR, MCPX_DIR);
|
|
46
47
|
}
|
|
47
48
|
|
|
49
|
+
export function getModelsDir(projectDir: string): string {
|
|
50
|
+
return (
|
|
51
|
+
process.env.BOTHOLOMEW_MODELS_DIR_OVERRIDE ??
|
|
52
|
+
join(projectDir, BOTHOLOMEW_DIR, MODELS_DIR)
|
|
53
|
+
);
|
|
54
|
+
}
|
|
55
|
+
|
|
48
56
|
export function getSkillsDir(projectDir: string): string {
|
|
49
57
|
return join(projectDir, BOTHOLOMEW_DIR, SKILLS_DIR);
|
|
50
58
|
}
|
package/src/context/embedder-impl.ts
CHANGED
|
@@ -1,18 +1,51 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import {
|
|
4
|
+
env,
|
|
5
|
+
type FeatureExtractionPipeline,
|
|
6
|
+
pipeline,
|
|
7
|
+
} from "@huggingface/transformers";
|
|
1
8
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
9
|
+
import { logger } from "../utils/logger.ts";
|
|
2
10
|
|
|
3
11
|
type EmbedFn = (
|
|
4
12
|
texts: string[],
|
|
5
13
|
config: Required<BotholomewConfig>,
|
|
6
14
|
) => Promise<number[][]>;
|
|
7
15
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
16
|
+
// Singleton pipeline keyed by model name. Loading the model is expensive
|
|
17
|
+
// (downloads weights on first run, then ~hundreds of ms to instantiate the
|
|
18
|
+
// ONNX runtime), so we hold one per model for the life of the process.
|
|
19
|
+
const pipelinePromises = new Map<string, Promise<FeatureExtractionPipeline>>();
|
|
20
|
+
|
|
21
|
+
export function setEmbeddingCacheDir(dir: string): void {
|
|
22
|
+
// Trailing separator matters: transformers.js builds paths as `${cacheDir}${rel}` (no separator).
|
|
23
|
+
env.cacheDir = dir.endsWith("/") ? dir : `${dir}/`;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
function isModelCached(model: string): boolean {
|
|
27
|
+
if (!env.cacheDir) return false;
|
|
28
|
+
return existsSync(join(env.cacheDir, model));
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
|
|
32
|
+
let p = pipelinePromises.get(model);
|
|
33
|
+
if (!p) {
|
|
34
|
+
logger.info(
|
|
35
|
+
isModelCached(model)
|
|
36
|
+
? `Loading embedding model ${model}`
|
|
37
|
+
: `Loading embedding model ${model} (first run, downloading weights)`,
|
|
38
|
+
);
|
|
39
|
+
p = pipeline("feature-extraction", model);
|
|
40
|
+
pipelinePromises.set(model, p);
|
|
41
|
+
}
|
|
42
|
+
return p;
|
|
11
43
|
}
|
|
12
44
|
|
|
13
45
|
/**
|
|
14
|
-
* Embed multiple texts using
|
|
15
|
-
* Returns an array of float vectors with the
|
|
46
|
+
* Embed multiple texts using a local @huggingface/transformers feature-extraction
|
|
47
|
+
* pipeline. Returns an array of L2-normalized float vectors with the model's
|
|
48
|
+
* native dimension (must match `config.embedding_dimension`).
|
|
16
49
|
*/
|
|
17
50
|
export async function embed(
|
|
18
51
|
texts: string[],
|
|
@@ -20,37 +53,17 @@ export async function embed(
|
|
|
20
53
|
): Promise<number[][]> {
|
|
21
54
|
if (texts.length === 0) return [];
|
|
22
55
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
);
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
const response = await fetch("https://api.openai.com/v1/embeddings", {
|
|
30
|
-
method: "POST",
|
|
31
|
-
headers: {
|
|
32
|
-
Authorization: `Bearer ${config.openai_api_key}`,
|
|
33
|
-
"Content-Type": "application/json",
|
|
34
|
-
},
|
|
35
|
-
body: JSON.stringify({
|
|
36
|
-
input: texts,
|
|
37
|
-
model: config.embedding_model,
|
|
38
|
-
dimensions: config.embedding_dimension,
|
|
39
|
-
}),
|
|
40
|
-
});
|
|
56
|
+
const extractor = await getPipeline(config.embedding_model);
|
|
57
|
+
const output = await extractor(texts, { pooling: "mean", normalize: true });
|
|
58
|
+
const data = output.tolist() as number[][];
|
|
41
59
|
|
|
42
|
-
if (
|
|
43
|
-
const body = await response.text();
|
|
60
|
+
if (data[0] && data[0].length !== config.embedding_dimension) {
|
|
44
61
|
throw new Error(
|
|
45
|
-
`
|
|
62
|
+
`Embedding model ${config.embedding_model} returned ${data[0].length}-dim vectors, but embedding_dimension is set to ${config.embedding_dimension}. Update embedding_dimension in config and re-embed.`,
|
|
46
63
|
);
|
|
47
64
|
}
|
|
48
65
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
// Sort by index to ensure order matches input
|
|
52
|
-
const sorted = result.data.sort((a, b) => a.index - b.index);
|
|
53
|
-
return sorted.map((d) => d.embedding);
|
|
66
|
+
return data;
|
|
54
67
|
}
|
|
55
68
|
|
|
56
69
|
/**
|
package/src/context/ingest.ts
CHANGED
|
@@ -44,16 +44,7 @@ export async function prepareIngestion(
|
|
|
44
44
|
return null;
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
-
|
|
48
|
-
const doEmbed =
|
|
49
|
-
embedFn ??
|
|
50
|
-
(config.openai_api_key
|
|
51
|
-
? (texts: string[]) => defaultEmbed(texts, config)
|
|
52
|
-
: null);
|
|
53
|
-
if (!doEmbed) {
|
|
54
|
-
logger.debug("ingest: skipping embeddings (no OpenAI API key configured)");
|
|
55
|
-
return null;
|
|
56
|
-
}
|
|
47
|
+
const doEmbed = embedFn ?? ((texts: string[]) => defaultEmbed(texts, config));
|
|
57
48
|
|
|
58
49
|
const chunks = await chunk(item.content, item.mime_type, config);
|
|
59
50
|
if (chunks.length === 0) return null;
|
package/src/context/refresh.ts
CHANGED
|
@@ -132,8 +132,7 @@ export async function refreshContextItems(
|
|
|
132
132
|
const unchanged = results.filter((r) => r.status === "unchanged").length;
|
|
133
133
|
const missing = results.filter((r) => r.status === "missing").length;
|
|
134
134
|
|
|
135
|
-
|
|
136
|
-
if (toReembed.length === 0 || !hasEmbedder) {
|
|
135
|
+
if (toReembed.length === 0) {
|
|
137
136
|
return {
|
|
138
137
|
checked: refreshable.length,
|
|
139
138
|
updated,
|
|
@@ -141,7 +140,7 @@ export async function refreshContextItems(
|
|
|
141
140
|
missing,
|
|
142
141
|
reembedded: 0,
|
|
143
142
|
chunks: 0,
|
|
144
|
-
embeddings_skipped:
|
|
143
|
+
embeddings_skipped: false,
|
|
145
144
|
items: results,
|
|
146
145
|
};
|
|
147
146
|
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
2
|
+
import { embed } from "../context/embedder.ts";
|
|
3
|
+
import { logger } from "../utils/logger.ts";
|
|
4
|
+
import { withDb } from "./connection.ts";
|
|
5
|
+
import { rebuildSearchIndex } from "./embeddings.ts";
|
|
6
|
+
|
|
7
|
+
interface PendingRow {
|
|
8
|
+
id: string;
|
|
9
|
+
chunk_content: string | null;
|
|
10
|
+
title: string;
|
|
11
|
+
description: string;
|
|
12
|
+
drive: string | null;
|
|
13
|
+
path: string | null;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
const BATCH_SIZE = 32;
|
|
17
|
+
|
|
18
|
+
function buildEmbeddingInput(row: PendingRow): string {
|
|
19
|
+
const parts: string[] = [];
|
|
20
|
+
if (row.title) parts.push(`Title: ${row.title}`);
|
|
21
|
+
if (row.description) parts.push(`Description: ${row.description}`);
|
|
22
|
+
if (row.drive && row.path) parts.push(`Source: ${row.drive}:${row.path}`);
|
|
23
|
+
if (row.chunk_content) parts.push(row.chunk_content);
|
|
24
|
+
return parts.join("\n");
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
interface ReembedOptions {
|
|
28
|
+
/**
|
|
29
|
+
* `"missing"` (default) — only re-embed rows where `embedding IS NULL`.
|
|
30
|
+
* `"all"` — re-embed every row, including ones that already have a vector.
|
|
31
|
+
* Use this after changing `embedding_model` so old vectors don't
|
|
32
|
+
* sit alongside new ones in a different space.
|
|
33
|
+
*/
|
|
34
|
+
mode?: "missing" | "all";
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Recompute embeddings for rows in the embeddings table.
|
|
39
|
+
*
|
|
40
|
+
* Default mode (`"missing"`) only touches NULL rows — the case after migration
|
|
41
|
+
* 18 leaves existing rows with no vector. The `context reembed` CLI command
|
|
42
|
+
* passes `mode: "all"` to force a full rebuild after the user changes
|
|
43
|
+
* `embedding_model`.
|
|
44
|
+
*
|
|
45
|
+
* Each batch is its own withDb so the file lock releases between embedding
|
|
46
|
+
* calls — long sweeps don't block other workers from acquiring the DB.
|
|
47
|
+
*/
|
|
48
|
+
export async function reembedMissingVectors(
|
|
49
|
+
dbPath: string,
|
|
50
|
+
config: Required<BotholomewConfig>,
|
|
51
|
+
options: ReembedOptions = {},
|
|
52
|
+
): Promise<void> {
|
|
53
|
+
const mode = options.mode ?? "missing";
|
|
54
|
+
const filter = mode === "all" ? "" : "WHERE embedding IS NULL";
|
|
55
|
+
|
|
56
|
+
const total = await withDb(dbPath, async (conn) => {
|
|
57
|
+
const row = await conn.queryGet<{ count: number }>(
|
|
58
|
+
`SELECT count(*)::INTEGER AS count FROM embeddings ${filter}`,
|
|
59
|
+
);
|
|
60
|
+
return row?.count ?? 0;
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
if (total === 0) {
|
|
64
|
+
logger.info("No embeddings to recompute.");
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
logger.info(
|
|
69
|
+
`re-embedding ${total} row${total === 1 ? "" : "s"} with model ${config.embedding_model}`,
|
|
70
|
+
);
|
|
71
|
+
|
|
72
|
+
let processed = 0;
|
|
73
|
+
while (processed < total) {
|
|
74
|
+
const batch = await withDb(dbPath, async (conn) => {
|
|
75
|
+
const offsetClause = mode === "all" ? `LIMIT ?1 OFFSET ?2` : `LIMIT ?1`;
|
|
76
|
+
const sql = `SELECT e.id, e.chunk_content, e.title, e.description, ci.drive, ci.path
|
|
77
|
+
FROM embeddings e
|
|
78
|
+
LEFT JOIN context_items ci ON ci.id = e.context_item_id
|
|
79
|
+
${filter}
|
|
80
|
+
ORDER BY e.id
|
|
81
|
+
${offsetClause}`;
|
|
82
|
+
return mode === "all"
|
|
83
|
+
? conn.queryAll<PendingRow>(sql, BATCH_SIZE, processed)
|
|
84
|
+
: conn.queryAll<PendingRow>(sql, BATCH_SIZE);
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
if (batch.length === 0) break;
|
|
88
|
+
|
|
89
|
+
const inputs = batch.map(buildEmbeddingInput);
|
|
90
|
+
const vectors = await embed(inputs, config);
|
|
91
|
+
|
|
92
|
+
await withDb(dbPath, async (conn) => {
|
|
93
|
+
for (let i = 0; i < batch.length; i++) {
|
|
94
|
+
const row = batch[i];
|
|
95
|
+
const vec = vectors[i];
|
|
96
|
+
if (!row || !vec) continue;
|
|
97
|
+
await conn.queryRun(
|
|
98
|
+
`UPDATE embeddings
|
|
99
|
+
SET embedding = ?1::FLOAT[${config.embedding_dimension}]
|
|
100
|
+
WHERE id = ?2`,
|
|
101
|
+
vec,
|
|
102
|
+
row.id,
|
|
103
|
+
);
|
|
104
|
+
}
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
processed += batch.length;
|
|
108
|
+
logger.info(` re-embedded ${processed}/${total}`);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
await withDb(dbPath, (conn) => rebuildSearchIndex(conn));
|
|
112
|
+
logger.success(`re-embed complete (${processed} rows)`);
|
|
113
|
+
}
|
package/src/db/schema.ts
CHANGED
|
@@ -2,6 +2,7 @@ import { readdirSync, readFileSync } from "node:fs";
|
|
|
2
2
|
import { join } from "node:path";
|
|
3
3
|
import { logger } from "../utils/logger.ts";
|
|
4
4
|
import type { DbConnection } from "./connection.ts";
|
|
5
|
+
import { rebuildSearchIndex } from "./embeddings.ts";
|
|
5
6
|
|
|
6
7
|
interface Migration {
|
|
7
8
|
id: number;
|
|
@@ -83,4 +84,10 @@ export async function migrate(db: DbConnection): Promise<void> {
|
|
|
83
84
|
if (appliedAny) {
|
|
84
85
|
await db.exec("CHECKPOINT");
|
|
85
86
|
}
|
|
87
|
+
|
|
88
|
+
// Ensure the FTS index exists. Migration 18 drops it (it can't recreate it
|
|
89
|
+
// in the same SQL run without DuckDB rejecting the dependency commit), and
|
|
90
|
+
// fresh DBs need it created at least once. `overwrite = 1` makes this
|
|
91
|
+
// idempotent for DBs that already have a healthy FTS index.
|
|
92
|
+
await rebuildSearchIndex(db);
|
|
86
93
|
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
-- Switch from OpenAI 1536-dim embeddings to local 384-dim embeddings.
|
|
2
|
+
--
|
|
3
|
+
-- DuckDB encodes array dimension in the column type, so we rebuild the
|
|
4
|
+
-- embeddings table preserving every row's metadata (chunk_content, title,
|
|
5
|
+
-- description, context_item_id, chunk_index, created_at). The vectors
|
|
6
|
+
-- themselves are NULLed and repopulated by `botholomew context reembed`
|
|
7
|
+
-- using the locally-loaded embedding model.
|
|
8
|
+
--
|
|
9
|
+
-- Idempotency: every destructive step uses IF EXISTS so a partial prior
|
|
10
|
+
-- run can be re-attempted cleanly. The FTS index is dropped here but NOT
|
|
11
|
+
-- recreated — `migrate()` calls rebuildSearchIndex once after all SQL
|
|
12
|
+
-- migrations apply, which avoids a same-migration drop-then-create that
|
|
13
|
+
-- DuckDB rejects with "Could not commit creation of dependency, subject
|
|
14
|
+
-- 'stopwords' has been deleted".
|
|
15
|
+
|
|
16
|
+
DROP SCHEMA IF EXISTS fts_main_embeddings CASCADE;
|
|
17
|
+
|
|
18
|
+
DROP TABLE IF EXISTS embeddings_new;
|
|
19
|
+
|
|
20
|
+
CREATE TABLE embeddings_new (
|
|
21
|
+
id TEXT PRIMARY KEY,
|
|
22
|
+
context_item_id TEXT NOT NULL,
|
|
23
|
+
chunk_index INTEGER NOT NULL,
|
|
24
|
+
chunk_content TEXT,
|
|
25
|
+
title TEXT NOT NULL,
|
|
26
|
+
description TEXT NOT NULL DEFAULT '',
|
|
27
|
+
embedding FLOAT[384],
|
|
28
|
+
created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
|
|
29
|
+
UNIQUE(context_item_id, chunk_index)
|
|
30
|
+
);
|
|
31
|
+
|
|
32
|
+
INSERT INTO embeddings_new (id, context_item_id, chunk_index, chunk_content, title, description, embedding, created_at)
|
|
33
|
+
SELECT id, context_item_id, chunk_index, chunk_content, title, description, NULL, created_at
|
|
34
|
+
FROM embeddings;
|
|
35
|
+
|
|
36
|
+
DROP TABLE embeddings;
|
|
37
|
+
ALTER TABLE embeddings_new RENAME TO embeddings;
|
|
38
|
+
|
|
39
|
+
CHECKPOINT;
|
package/src/init/templates.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { DEFAULT_CONFIG as SCHEMA_DEFAULT_CONFIG } from "../config/schemas.ts";
|
|
2
|
+
|
|
1
3
|
export const SOUL_MD = `---
|
|
2
4
|
loading: always
|
|
3
5
|
agent-modification: false
|
|
@@ -85,11 +87,8 @@ and currently in progress) and format a brief standup-style update with:
|
|
|
85
87
|
`;
|
|
86
88
|
|
|
87
89
|
export const DEFAULT_CONFIG = {
|
|
90
|
+
...SCHEMA_DEFAULT_CONFIG,
|
|
88
91
|
anthropic_api_key: "your-api-key-here",
|
|
89
|
-
model: "claude-opus-4-20250514",
|
|
90
|
-
tick_interval_seconds: 300,
|
|
91
|
-
max_tick_duration_seconds: 120,
|
|
92
|
-
max_turns: 0,
|
|
93
92
|
};
|
|
94
93
|
|
|
95
94
|
export const DEFAULT_MCPX_SERVERS = {
|
package/src/tui/components/StatusBar.tsx
CHANGED
|
@@ -33,22 +33,30 @@ export function StatusBar({
|
|
|
33
33
|
useEffect(() => {
|
|
34
34
|
let mounted = true;
|
|
35
35
|
|
|
36
|
+
// Errors here (e.g. transient DuckDB lock conflicts while a freshly
|
|
37
|
+
// spawned worker is migrating) must not freeze the count — the next
|
|
38
|
+
// interval tick will retry. Swallow silently rather than logging
|
|
39
|
+
// because logger writes to stdout and would corrupt the Ink render.
|
|
36
40
|
const refresh = async () => {
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
41
|
+
try {
|
|
42
|
+
const [pending, inProgress, workers] = await withDb(
|
|
43
|
+
dbPath,
|
|
44
|
+
async (conn) => [
|
|
45
|
+
await listTasks(conn, { status: "pending" }),
|
|
46
|
+
await listTasks(conn, { status: "in_progress" }),
|
|
47
|
+
await listWorkers(conn, { status: "running" }),
|
|
48
|
+
],
|
|
49
|
+
);
|
|
50
|
+
if (mounted) {
|
|
51
|
+
setStatus({
|
|
52
|
+
workerCount: workers.length,
|
|
53
|
+
pendingCount: pending.length,
|
|
54
|
+
inProgressCount: inProgress.length,
|
|
55
|
+
});
|
|
56
|
+
onWorkerStatusChange?.(workers.length > 0);
|
|
57
|
+
}
|
|
58
|
+
} catch {
|
|
59
|
+
// Keep prior state; next tick will retry.
|
|
52
60
|
}
|
|
53
61
|
};
|
|
54
62
|
|
package/src/worker/prompt.ts
CHANGED
|
@@ -104,7 +104,7 @@ export async function buildSystemPrompt(
|
|
|
104
104
|
|
|
105
105
|
prompt += await loadPersistentContext(projectDir, taskKeywords);
|
|
106
106
|
|
|
107
|
-
if (task && dbPath && _config
|
|
107
|
+
if (task && dbPath && _config) {
|
|
108
108
|
try {
|
|
109
109
|
const query = `${task.name} ${task.description}`;
|
|
110
110
|
const queryVec = await embedSingle(query, _config);
|