botholomew 0.12.5 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -68
- package/package.json +2 -2
- package/src/chat/agent.ts +42 -82
- package/src/chat/session.ts +29 -25
- package/src/commands/capabilities.ts +1 -1
- package/src/commands/context.ts +177 -926
- package/src/commands/db.ts +9 -13
- package/src/commands/init.ts +4 -1
- package/src/commands/nuke.ts +57 -90
- package/src/commands/schedule.ts +103 -124
- package/src/commands/skill.ts +2 -2
- package/src/commands/task.ts +86 -95
- package/src/commands/thread.ts +107 -112
- package/src/commands/worker.ts +88 -88
- package/src/constants.ts +93 -16
- package/src/context/capabilities.ts +10 -10
- package/src/context/fetcher.ts +9 -10
- package/src/context/reindex.ts +189 -0
- package/src/context/store.ts +630 -0
- package/src/db/doctor.ts +1 -8
- package/src/db/embeddings.ts +227 -175
- package/src/db/sql/19-disk_backed_index.sql +36 -0
- package/src/db/sql/20-drop_db_tables_for_files.sql +19 -0
- package/src/fs/atomic.ts +217 -0
- package/src/fs/compat.ts +86 -0
- package/src/fs/sandbox.ts +279 -0
- package/src/init/index.ts +69 -52
- package/src/init/templates.ts +1 -1
- package/src/mcpx/client.ts +1 -1
- package/src/schedules/schema.ts +19 -0
- package/src/schedules/store.ts +296 -0
- package/src/skills/commands.ts +1 -3
- package/src/tasks/schema.ts +47 -0
- package/src/tasks/store.ts +486 -0
- package/src/threads/store.ts +559 -0
- package/src/tools/capabilities/refresh.ts +42 -21
- package/src/tools/context/pipe.ts +15 -71
- package/src/tools/context/update-beliefs.ts +3 -3
- package/src/tools/context/update-goals.ts +3 -3
- package/src/tools/dir/create.ts +26 -23
- package/src/tools/dir/size.ts +46 -17
- package/src/tools/dir/tree.ts +73 -279
- package/src/tools/file/copy.ts +50 -24
- package/src/tools/file/count-lines.ts +34 -10
- package/src/tools/file/delete.ts +44 -23
- package/src/tools/file/edit.ts +39 -14
- package/src/tools/file/exists.ts +12 -26
- package/src/tools/file/info.ts +25 -85
- package/src/tools/file/move.ts +39 -24
- package/src/tools/file/read.ts +32 -80
- package/src/tools/file/write.ts +14 -91
- package/src/tools/registry.ts +3 -7
- package/src/tools/schedule/create.ts +2 -2
- package/src/tools/schedule/list.ts +7 -3
- package/src/tools/search/fuse.ts +12 -33
- package/src/tools/search/index.ts +36 -43
- package/src/tools/search/regexp.ts +29 -17
- package/src/tools/search/semantic.ts +137 -51
- package/src/tools/skill/delete.ts +1 -1
- package/src/tools/skill/list.ts +1 -1
- package/src/tools/skill/write.ts +1 -1
- package/src/tools/task/create.ts +41 -16
- package/src/tools/task/delete.ts +3 -3
- package/src/tools/task/list.ts +6 -3
- package/src/tools/task/update.ts +31 -9
- package/src/tools/task/view.ts +6 -6
- package/src/tools/thread/list.ts +2 -2
- package/src/tools/thread/search.ts +208 -0
- package/src/tools/thread/view.ts +50 -5
- package/src/tools/worker/spawn.ts +28 -14
- package/src/tui/App.tsx +12 -19
- package/src/tui/components/ContextPanel.tsx +83 -316
- package/src/tui/components/SchedulePanel.tsx +34 -48
- package/src/tui/components/StatusBar.tsx +15 -15
- package/src/tui/components/TaskPanel.tsx +34 -38
- package/src/tui/components/ThreadPanel.tsx +29 -38
- package/src/tui/components/WorkerPanel.tsx +21 -19
- package/src/tui/markdown.ts +2 -8
- package/src/utils/title.ts +5 -7
- package/src/utils/v7-date.ts +47 -0
- package/src/worker/heartbeat.ts +46 -24
- package/src/worker/index.ts +13 -15
- package/src/worker/llm.ts +30 -37
- package/src/worker/prompt.ts +19 -41
- package/src/worker/schedules.ts +48 -69
- package/src/worker/spawn.ts +11 -11
- package/src/worker/tick.ts +39 -43
- package/src/workers/store.ts +247 -0
- package/src/commands/tools.ts +0 -367
- package/src/context/describer.ts +0 -140
- package/src/context/drives.ts +0 -110
- package/src/context/ingest.ts +0 -162
- package/src/context/refresh.ts +0 -183
- package/src/db/context.ts +0 -637
- package/src/db/daemon-state.ts +0 -6
- package/src/db/reembed.ts +0 -113
- package/src/db/schedules.ts +0 -213
- package/src/db/tasks.ts +0 -347
- package/src/db/threads.ts +0 -276
- package/src/db/workers.ts +0 -212
- package/src/tools/context/list-drives.ts +0 -36
- package/src/tools/context/refresh.ts +0 -165
- package/src/tools/context/search.ts +0 -54
package/src/db/embeddings.ts
CHANGED
|
@@ -1,248 +1,300 @@
|
|
|
1
1
|
import { EMBEDDING_DIMENSION } from "../constants.ts";
|
|
2
2
|
import type { DbConnection } from "./connection.ts";
|
|
3
|
-
import { uuidv7 } from "./uuid.ts";
|
|
4
3
|
|
|
5
4
|
// Fail fast at module load: the dimension is interpolated into `FLOAT[n]`
// casts inside the SQL in this module, so it must be a positive integer.
if (!(Number.isInteger(EMBEDDING_DIMENSION) && EMBEDDING_DIMENSION > 0)) {
  throw new Error(`Invalid EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION}`);
}
|
|
8
7
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
8
|
+
/**
 * One chunk of a file in the disk-backed search index over
 * `<projectDir>/context/`. The index holds one row per `(path, chunk_index)`.
 * `content_hash` is the file-level sha256, which lets the reindex algorithm
 * detect adds, updates, and removals in one pass.
 */
export interface IndexedChunk {
  /** Path of the indexed file; together with `chunk_index` it identifies the row. */
  path: string;
  /** Index of this chunk within `path`. */
  chunk_index: number;
  /** File-level sha256, repeated on every chunk row of the same file. */
  content_hash: string;
  /** Text of this chunk. */
  chunk_content: string;
  /** Embedding vector; an empty array when the stored embedding is NULL. */
  embedding: number[];
  /** File-level mtime in milliseconds, stored on every row. */
  mtime_ms: number;
  /** File-level size in bytes, stored on every row. */
  size_bytes: number;
  /** When the row was indexed, parsed from the stored timestamp string. */
  indexed_at: Date;
}
|
|
23
23
|
|
|
24
|
-
interface
|
|
25
|
-
|
|
26
|
-
context_item_id: string;
|
|
24
|
+
/**
 * Raw shape of a `context_index` row as typed for the query helpers in this
 * module, before it is normalised by `rowToChunk`.
 */
interface IndexRow {
  path: string;
  chunk_index: number;
  content_hash: string;
  chunk_content: string;
  /** NULL in the database when no embedding was stored for the chunk. */
  embedding: number[] | null;
  mtime_ms: number;
  size_bytes: number;
  /** Timestamp stored as text; converted to a `Date` by `rowToChunk`. */
  indexed_at: string;
}
|
|
34
34
|
|
|
35
|
-
function
|
|
35
|
+
/**
 * Normalise a raw `context_index` row into an `IndexedChunk`.
 *
 * A NULL embedding becomes an empty array, `mtime_ms` / `size_bytes` are
 * coerced with `Number()`, and `indexed_at` is parsed into a `Date`. Only the
 * `IndexedChunk` fields are copied; any extra columns on `row` (such as a
 * computed score) are dropped.
 */
function rowToChunk(row: IndexRow): IndexedChunk {
  const { path, chunk_index, content_hash, chunk_content } = row;
  const embedding = row.embedding ?? [];
  const mtime_ms = Number(row.mtime_ms);
  const size_bytes = Number(row.size_bytes);
  const indexed_at = new Date(row.indexed_at);
  return {
    path,
    chunk_index,
    content_hash,
    chunk_content,
    embedding,
    mtime_ms,
    size_bytes,
    indexed_at,
  };
}
|
|
47
47
|
|
|
48
|
+
/**
 * Per-chunk payload accepted by `upsertChunksForPath`. The file-level fields
 * (hash, mtime, size) are supplied once alongside the chunk list rather than
 * on each chunk.
 */
export interface ChunkInput {
  /** Index of the chunk within its file. */
  chunk_index: number;
  /** Text of the chunk. */
  chunk_content: string;
  /** Embedding vector for the chunk. */
  embedding: number[];
}
|
|
53
|
+
|
|
48
54
|
/**
 * Replace every indexed row for `params.path` with `params.chunks`.
 *
 * Existing rows for the path are deleted first, then one row is inserted per
 * chunk, one statement at a time. The file-level `contentHash` / `mtimeMs` /
 * `sizeBytes` are written onto every row so a subsequent reindex can
 * short-circuit by comparing just those columns. An empty `chunks` array
 * leaves the path with no rows.
 *
 * NOTE(review): this function does not open a transaction itself. If callers
 * do not wrap it in one, a failed insert would leave the path partially
 * indexed — confirm against callers.
 */
export async function upsertChunksForPath(
  conn: DbConnection,
  params: {
    path: string;
    contentHash: string;
    mtimeMs: number;
    sizeBytes: number;
    chunks: ChunkInput[];
  },
): Promise<void> {
  const { path, contentHash, mtimeMs, sizeBytes, chunks } = params;

  await conn.queryRun("DELETE FROM context_index WHERE path = ?1", path);

  const insertSql = `INSERT INTO context_index
       (path, chunk_index, content_hash, chunk_content, embedding, mtime_ms, size_bytes, indexed_at)
       VALUES (?1, ?2, ?3, ?4, ?5::FLOAT[${EMBEDDING_DIMENSION}], ?6, ?7, current_timestamp::VARCHAR)`;

  for (const { chunk_index, chunk_content, embedding } of chunks) {
    await conn.queryRun(
      insertSql,
      path,
      chunk_index,
      contentHash,
      chunk_content,
      embedding,
      mtimeMs,
      sizeBytes,
    );
  }
}
|
|
85
|
+
|
|
86
|
+
/**
 * Delete every `context_index` row for `path`.
 *
 * @returns The `changes` count reported by the DELETE statement.
 */
export async function deleteIndexedPath(
  conn: DbConnection,
  path: string,
): Promise<number> {
  const { changes } = await conn.queryRun(
    "DELETE FROM context_index WHERE path = ?1",
    path,
  );
  return changes;
}
|
|
76
96
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
description: params.description ?? "",
|
|
84
|
-
embedding: params.embedding,
|
|
85
|
-
created_at: new Date(),
|
|
86
|
-
};
|
|
97
|
+
/**
 * Per-file summary of the index, as produced by grouping `context_index`
 * rows by `path`.
 */
export interface IndexedPathSummary {
  path: string;
  /** File-level content hash taken from the path's rows. */
  content_hash: string;
  /** File-level mtime in milliseconds taken from the path's rows. */
  mtime_ms: number;
  /** File-level size in bytes taken from the path's rows. */
  size_bytes: number;
  /** Number of chunk rows stored for the path. */
  chunk_count: number;
}
|
|
88
104
|
|
|
89
|
-
export async function
|
|
105
|
+
/**
 * List every indexed path with its file-level metadata and chunk count,
 * ordered by path ascending.
 *
 * The file-level columns are read with `ANY_VALUE` because they are stored
 * identically on every chunk row of a path. Numeric columns are coerced with
 * `Number()` before being returned.
 */
export async function listIndexedPaths(
  conn: DbConnection,
): Promise<IndexedPathSummary[]> {
  type SummaryRow = {
    path: string;
    content_hash: string;
    mtime_ms: number;
    size_bytes: number;
    chunk_count: number;
  };

  const grouped = await conn.queryAll<SummaryRow>(
    `SELECT path,
            ANY_VALUE(content_hash) AS content_hash,
            ANY_VALUE(mtime_ms) AS mtime_ms,
            ANY_VALUE(size_bytes) AS size_bytes,
            COUNT(*) AS chunk_count
     FROM context_index
     GROUP BY path
     ORDER BY path ASC`,
  );

  const summaries: IndexedPathSummary[] = [];
  for (const entry of grouped) {
    summaries.push({
      path: entry.path,
      content_hash: entry.content_hash,
      mtime_ms: Number(entry.mtime_ms),
      size_bytes: Number(entry.size_bytes),
      chunk_count: Number(entry.chunk_count),
    });
  }
  return summaries;
}
|
|
99
132
|
|
|
100
|
-
|
|
101
|
-
* Delete all embeddings for a context item. Callers are responsible for
|
|
102
|
-
* calling `rebuildSearchIndex()` afterward — the FTS index is a snapshot and
|
|
103
|
-
* will still reference the deleted rows until rebuilt.
|
|
104
|
-
*/
|
|
105
|
-
export async function deleteEmbeddingsForItem(
|
|
133
|
+
/**
 * Look up the index summary for a single `path`.
 *
 * @returns The file-level metadata and chunk count for the path, or `null`
 *   when the query yields no row for it.
 */
export async function getIndexedPath(
  conn: DbConnection,
  path: string,
): Promise<IndexedPathSummary | null> {
  type SummaryRow = {
    path: string;
    content_hash: string;
    mtime_ms: number;
    size_bytes: number;
    chunk_count: number;
  };

  const found = await conn.queryGet<SummaryRow>(
    `SELECT path,
            ANY_VALUE(content_hash) AS content_hash,
            ANY_VALUE(mtime_ms) AS mtime_ms,
            ANY_VALUE(size_bytes) AS size_bytes,
            COUNT(*) AS chunk_count
     FROM context_index
     WHERE path = ?1
     GROUP BY path`,
    path,
  );

  return found
    ? {
        path: found.path,
        content_hash: found.content_hash,
        mtime_ms: Number(found.mtime_ms),
        size_bytes: Number(found.size_bytes),
        chunk_count: Number(found.chunk_count),
      }
    : null;
}
|
|
115
163
|
|
|
116
|
-
interface
|
|
117
|
-
|
|
164
|
+
/**
 * An indexed chunk plus the relevance score assigned by the search function
 * that produced it. The meaning of `score` depends on that function (cosine
 * similarity, BM25, or a fused rank score).
 */
export interface SearchResult extends IndexedChunk {
  score: number;
}
|
|
119
167
|
|
|
120
168
|
/**
 * Vector similarity search over `context_index.embedding`.
 *
 * Rows are ordered by ascending cosine distance to `queryEmbedding`, and at
 * most `limit` rows are returned. Rows whose embedding is NULL are skipped.
 * Each result's `score` is `1 - distance`, so higher means closer.
 */
export async function searchSemantic(
  conn: DbConnection,
  queryEmbedding: number[],
  limit = 10,
): Promise<SearchResult[]> {
  const nearest = await conn.queryAll<IndexRow & { distance: number }>(
    `SELECT *, array_cosine_distance(embedding, ?1::FLOAT[${EMBEDDING_DIMENSION}]) AS distance
     FROM context_index
     WHERE embedding IS NOT NULL
     ORDER BY distance ASC
     LIMIT ?2`,
    queryEmbedding,
    limit,
  );

  const results: SearchResult[] = [];
  for (const hit of nearest) {
    const similarity = 1 - hit.distance;
    results.push({ ...rowToChunk(hit), score: similarity });
  }
  return results;
}
|
|
145
191
|
|
|
146
|
-
export interface HybridSearchResult extends EmbeddingSearchResult {
|
|
147
|
-
drive: string | null;
|
|
148
|
-
path: string | null;
|
|
149
|
-
}
|
|
150
|
-
|
|
151
192
|
/**
 * BM25 keyword search using the `fts_main_context_index` full-text index,
 * which is built over (chunk_content, path) by `rebuildSearchIndex`.
 *
 * Rows for which `match_bm25` yields NULL are filtered out; the remaining
 * rows are returned best score first, capped at `limit`.
 */
export async function searchKeyword(
  conn: DbConnection,
  query: string,
  limit = 10,
): Promise<SearchResult[]> {
  // The FTS index is created with `path` as its input_id (see
  // rebuildSearchIndex), so the first argument to match_bm25 has to be the
  // path value rather than rowid. Passing rowid silently returns no hits, and
  // searchHybrid would then degrade to semantic-only results.
  // NOTE(review): `path` is shared by every chunk of a file, so it is not a
  // unique document id — confirm how the FTS extension scores rows that share
  // an input_id.
  const matches = await conn.queryAll<IndexRow & { score: number }>(
    `SELECT context_index.*,
            fts_main_context_index.match_bm25(context_index.path, ?1) AS score
     FROM context_index
     WHERE fts_main_context_index.match_bm25(context_index.path, ?1) IS NOT NULL
     ORDER BY score DESC
     LIMIT ?2`,
    query,
    limit,
  );

  const results: SearchResult[] = [];
  for (const match of matches) {
    results.push({ ...rowToChunk(match), score: Number(match.score) });
  }
  return results;
}
|
|
169
217
|
|
|
170
|
-
|
|
218
|
+
/**
 * Hybrid search: reciprocal-rank fusion (RRF) of the semantic and keyword
 * rankings, deduplicated by `(path, chunk_index)`.
 *
 * Up to 100 candidates are fetched from each ranking. A chunk at zero-based
 * rank `i` in a ranking contributes `1 / (k + i + 1)` with `k = 60`, and
 * contributions from both rankings are summed. The returned `score` is that
 * fused value, not a cosine or BM25 score.
 *
 * If the keyword search rejects, it is treated as an empty ranking, so the
 * result falls back to semantic-only ordering.
 */
export async function searchHybrid(
  conn: DbConnection,
  query: string,
  queryEmbedding: number[],
  limit = 10,
): Promise<SearchResult[]> {
  const k = 60;

  const semanticRanking = searchSemantic(conn, queryEmbedding, 100);
  const keywordRanking = searchKeyword(conn, query, 100).catch(
    () => [] as SearchResult[],
  );
  const rankings = await Promise.all([semanticRanking, keywordRanking]);

  const fused = new Map<string, { chunk: IndexedChunk; score: number }>();

  // Semantic results are folded in before keyword results, which fixes the
  // insertion order used to break ties in the stable sort below.
  for (const ranking of rankings) {
    ranking.forEach((chunk, rank) => {
      if (!chunk) return;
      const id = `${chunk.path}::${chunk.chunk_index}`;
      const contribution = 1 / (k + rank + 1);
      const entry = fused.get(id);
      if (entry) {
        entry.score += contribution;
      } else {
        fused.set(id, { chunk, score: contribution });
      }
    });
  }

  return [...fused.values()]
    .sort((left, right) => right.score - left.score)
    .slice(0, limit)
    .map((entry) => ({ ...entry.chunk, score: entry.score }));
}
|
|
217
256
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
257
|
+
/**
 * Rebuild the full-text index over (chunk_content, path), keyed by `path`.
 *
 * DuckDB's FTS index is a snapshot: it is not maintained incrementally on
 * INSERT/UPDATE/DELETE, so any batch writer must call this once its
 * transaction has committed.
 *
 * The trailing CHECKPOINT is load-bearing. `overwrite = 1` writes a
 * `DROP SCHEMA fts_main_context_index` record into the WAL; without the
 * checkpoint, replaying the WAL on the next open can fail with "Cannot drop
 * entry 'fts_main_context_index' because there are entries that depend on it".
 *
 * Does nothing when the `context_index` table does not exist.
 */
export async function rebuildSearchIndex(conn: DbConnection): Promise<void> {
  // The FTS extension errors out on a missing table (e.g. fresh tests with an
  // empty schema), so probe for it first.
  const table = await conn.queryGet<{ name: string }>(
    "SELECT table_name AS name FROM information_schema.tables WHERE table_name = 'context_index'",
  );

  if (table) {
    await conn.exec(
      "PRAGMA create_fts_index('context_index', 'path', 'chunk_content', 'path', overwrite = 1)",
    );
    await conn.exec("CHECKPOINT");
  }
}
|
|
279
|
+
|
|
280
|
+
/**
 * Aggregate counts for the whole index.
 *
 * @returns `paths` — number of distinct paths; `chunks` — total rows;
 *   `embedded` — rows with a non-NULL embedding. Each value falls back to 0
 *   when the query yields no row or a nullish count.
 */
export async function indexStats(conn: DbConnection): Promise<{
  paths: number;
  chunks: number;
  embedded: number;
}> {
  type Counts = { paths: number; chunks: number; embedded: number };

  const counts = await conn.queryGet<Counts>(
    `SELECT COUNT(DISTINCT path) AS paths,
            COUNT(*) AS chunks,
            COUNT(embedding) AS embedded
     FROM context_index`,
  );

  const orZero = (value: number | null | undefined): number =>
    Number(value ?? 0);

  return {
    paths: orZero(counts?.paths),
    chunks: orZero(counts?.chunks),
    embedded: orZero(counts?.embedded),
  };
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
-- Switch the search index from "tracks DuckDB-backed virtual files" to
-- "tracks real files on disk under context/", and drop every table whose
-- contents now live on the filesystem (tasks, schedules) or that nothing
-- writes to anymore (daemon_state). The remaining DuckDB tables are:
--   workers, threads, interactions, context_index, _migrations
--
-- A new `context_index` table holds one row per (path, chunk_index), with a
-- file-level content hash + mtime so `botholomew context reindex` can detect
-- adds, updates, and removals in one pass.
--
-- Idempotent: every DROP uses IF EXISTS and every CREATE uses IF NOT EXISTS,
-- so a partial prior run is safe to re-attempt. The FTS index over the new
-- chunk_content column is created by migrate() via rebuildSearchIndex() after
-- all migrations apply.

-- Remove the old FTS schema first, then the tables it and this migration
-- replace.
DROP SCHEMA IF EXISTS fts_main_embeddings CASCADE;
DROP TABLE IF EXISTS embeddings;
DROP TABLE IF EXISTS context_items;
DROP TABLE IF EXISTS tasks;
DROP TABLE IF EXISTS schedules;
DROP TABLE IF EXISTS daemon_state;

-- content_hash, mtime_ms and size_bytes are file-level values repeated on
-- every chunk row of the same path.
-- NOTE(review): the embedding width 384 is hard-coded here; presumably it has
-- to equal the EMBEDDING_DIMENSION used in the application's FLOAT[n] casts —
-- verify.
CREATE TABLE IF NOT EXISTS context_index (
  path TEXT NOT NULL,
  chunk_index INTEGER NOT NULL,
  content_hash TEXT NOT NULL,
  chunk_content TEXT NOT NULL,
  embedding FLOAT[384],
  mtime_ms BIGINT NOT NULL,
  size_bytes BIGINT NOT NULL,
  indexed_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  PRIMARY KEY (path, chunk_index)
);

-- NOTE(review): the primary key already leads with `path`; confirm this
-- secondary index is still needed.
CREATE INDEX IF NOT EXISTS idx_context_index_path ON context_index(path);

-- Flush the WAL so the drops above are not replayed on the next open.
CHECKPOINT;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
-- Tasks, schedules, threads, interactions, and workers all moved out of
-- DuckDB onto disk:
--   tasks/        markdown files with frontmatter (one per task)
--   schedules/    markdown files with frontmatter (one per schedule)
--   threads/      CSV per conversation (searchable via the index)
--   workers/      JSON pidfile per worker, mtime-checked heartbeats
--
-- The only remaining DuckDB objects after this migration are _migrations,
-- context_index, and the FTS index built over context_index by
-- rebuildSearchIndex(). Idempotent via IF EXISTS.

-- Tables whose contents moved to disk in this migration.
DROP TABLE IF EXISTS interactions;
DROP TABLE IF EXISTS threads;
DROP TABLE IF EXISTS workers;

-- tasks and schedules are also dropped by migration 19; repeating the drop
-- here is harmless because of IF EXISTS.
DROP TABLE IF EXISTS tasks;
DROP TABLE IF EXISTS schedules;

-- Flush the WAL after the drops.
CHECKPOINT;
|