botholomew 0.16.4 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -41
- package/package.json +4 -9
- package/src/chat/agent.ts +37 -40
- package/src/chat/session.ts +10 -10
- package/src/cli.ts +0 -2
- package/src/commands/capabilities.ts +35 -33
- package/src/commands/context.ts +133 -221
- package/src/commands/init.ts +22 -1
- package/src/commands/mcpx.ts +21 -8
- package/src/commands/nuke.ts +52 -15
- package/src/commands/prepare.ts +16 -13
- package/src/config/loader.ts +1 -8
- package/src/config/schemas.ts +6 -0
- package/src/constants.ts +16 -32
- package/src/init/index.ts +52 -27
- package/src/mcpx/client.ts +21 -5
- package/src/mem/client.ts +33 -0
- package/src/{context → prompts}/capabilities.ts +11 -7
- package/src/schedules/store.ts +1 -1
- package/src/tasks/store.ts +1 -1
- package/src/threads/store.ts +1 -1
- package/src/tools/capabilities/refresh.ts +1 -1
- package/src/tools/membot/adapter.ts +111 -0
- package/src/tools/membot/copy.ts +59 -0
- package/src/tools/membot/count_lines.ts +53 -0
- package/src/tools/membot/edit.ts +72 -0
- package/src/tools/membot/exists.ts +54 -0
- package/src/tools/membot/index.ts +26 -0
- package/src/tools/{context → membot}/pipe.ts +34 -32
- package/src/tools/registry.ts +6 -37
- package/src/tools/tool.ts +6 -8
- package/src/tui/App.tsx +3 -4
- package/src/tui/components/ContextPanel.tsx +109 -226
- package/src/tui/components/HelpPanel.tsx +2 -2
- package/src/tui/components/StatusBar.tsx +0 -6
- package/src/tui/components/ThreadPanel.tsx +8 -7
- package/src/tui/wrapDetail.ts +11 -0
- package/src/worker/heartbeat.ts +0 -20
- package/src/worker/index.ts +13 -13
- package/src/worker/llm.ts +7 -9
- package/src/worker/prompt.ts +25 -13
- package/src/worker/spawn.ts +1 -1
- package/src/worker/tick.ts +10 -9
- package/src/commands/db.ts +0 -119
- package/src/commands/with-db.ts +0 -22
- package/src/context/chunker.ts +0 -275
- package/src/context/embedder-impl.ts +0 -100
- package/src/context/embedder.ts +0 -9
- package/src/context/fetcher-errors.ts +0 -8
- package/src/context/fetcher.ts +0 -515
- package/src/context/locks.ts +0 -146
- package/src/context/markdown-converter.ts +0 -186
- package/src/context/reindex.ts +0 -198
- package/src/context/store.ts +0 -841
- package/src/context/url-utils.ts +0 -25
- package/src/db/connection.ts +0 -255
- package/src/db/doctor.ts +0 -235
- package/src/db/embeddings.ts +0 -317
- package/src/db/query.ts +0 -56
- package/src/db/schema.ts +0 -93
- package/src/db/sql/1-core_tables.sql +0 -53
- package/src/db/sql/10-dedupe_context_items.sql +0 -26
- package/src/db/sql/11-rebuild_hnsw.sql +0 -8
- package/src/db/sql/12-workers.sql +0 -66
- package/src/db/sql/13-drive-paths.sql +0 -47
- package/src/db/sql/14-drop_hnsw_index.sql +0 -8
- package/src/db/sql/15-fts_index.sql +0 -8
- package/src/db/sql/16-source_url.sql +0 -7
- package/src/db/sql/17-worker_log_path.sql +0 -3
- package/src/db/sql/18-reset_embeddings_for_local.sql +0 -39
- package/src/db/sql/19-disk_backed_index.sql +0 -36
- package/src/db/sql/2-logging_tables.sql +0 -24
- package/src/db/sql/20-drop_db_tables_for_files.sql +0 -19
- package/src/db/sql/3-daemon_state.sql +0 -5
- package/src/db/sql/4-unique_context_path.sql +0 -1
- package/src/db/sql/5-reset_embeddings_for_openai.sql +0 -1
- package/src/db/sql/6-vss_index.sql +0 -7
- package/src/db/sql/7-drop_embeddings_fk.sql +0 -23
- package/src/db/sql/8-task_output.sql +0 -1
- package/src/db/sql/9-source-type.sql +0 -1
- package/src/tools/context/read-large-result.ts +0 -33
- package/src/tools/dir/create.ts +0 -47
- package/src/tools/dir/size.ts +0 -77
- package/src/tools/dir/tree.ts +0 -124
- package/src/tools/file/copy.ts +0 -73
- package/src/tools/file/count-lines.ts +0 -54
- package/src/tools/file/delete.ts +0 -83
- package/src/tools/file/edit.ts +0 -76
- package/src/tools/file/exists.ts +0 -33
- package/src/tools/file/info.ts +0 -66
- package/src/tools/file/move.ts +0 -66
- package/src/tools/file/read.ts +0 -67
- package/src/tools/file/write.ts +0 -58
- package/src/tools/search/fuse.ts +0 -96
- package/src/tools/search/index.ts +0 -127
- package/src/tools/search/regexp.ts +0 -82
- package/src/tools/search/semantic.ts +0 -167
- /package/src/{db → utils}/uuid.ts +0 -0
package/src/db/embeddings.ts
DELETED
|
@@ -1,317 +0,0 @@
|
|
|
1
|
-
import { EMBEDDING_DIMENSION } from "../constants.ts";
|
|
2
|
-
import type { DbConnection } from "./connection.ts";
|
|
3
|
-
|
|
4
|
-
// Fail fast at module load: EMBEDDING_DIMENSION is interpolated into the
// `FLOAT[n]` casts of the SQL in this module, so it must be a positive integer.
const embeddingDimensionIsValid =
  Number.isInteger(EMBEDDING_DIMENSION) && EMBEDDING_DIMENSION > 0;
if (!embeddingDimensionIsValid) {
  throw new Error(`Invalid EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION}`);
}
|
|
7
|
-
|
|
8
|
-
/**
 * Disk-backed search index over `<projectDir>/context/`. One row per
 * `(path, chunk_index)`; `content_hash` is the file-level sha256 so the
 * reindex algorithm can detect adds, updates, and removals in one pass.
 */
export interface IndexedChunk {
  /** Path of the indexed file; rows are grouped and deleted by this value. */
  path: string;
  /** Index of this chunk among the chunks stored for `path`. */
  chunk_index: number;
  /** File-level hash, repeated on every chunk row of the same path. */
  content_hash: string;
  /** Text of this chunk. */
  chunk_content: string;
  /** Embedding vector; an empty array when the stored row has a NULL embedding. */
  embedding: number[];
  /** File-level mtime in milliseconds, repeated on every chunk row. */
  mtime_ms: number;
  /** File-level size in bytes, repeated on every chunk row. */
  size_bytes: number;
  /** When the row was written, parsed from the stored string. */
  indexed_at: Date;
}
|
|
23
|
-
|
|
24
|
-
/**
 * Raw shape of a `context_index` row as returned by the query helpers,
 * before `rowToChunk` normalises it into an `IndexedChunk`.
 */
interface IndexRow {
  path: string;
  chunk_index: number;
  content_hash: string;
  chunk_content: string;
  // May be NULL in the database; `rowToChunk` maps that to an empty array.
  embedding: number[] | null;
  // Typed as number, but `rowToChunk` still coerces with Number() —
  // presumably the driver can return other numeric types; TODO confirm.
  mtime_ms: number;
  size_bytes: number;
  // Stored as text (`current_timestamp::VARCHAR` at insert time).
  indexed_at: string;
}
|
|
34
|
-
|
|
35
|
-
/**
 * Normalise a raw index row into the public `IndexedChunk` shape: a NULL
 * embedding becomes `[]`, numeric columns are coerced with `Number`, and
 * the `indexed_at` string is parsed into a `Date`.
 */
function rowToChunk(row: IndexRow): IndexedChunk {
  const { path, chunk_index, content_hash, chunk_content } = row;
  const embedding = row.embedding ?? [];
  const mtime_ms = Number(row.mtime_ms);
  const size_bytes = Number(row.size_bytes);
  const indexed_at = new Date(row.indexed_at);
  return {
    path,
    chunk_index,
    content_hash,
    chunk_content,
    embedding,
    mtime_ms,
    size_bytes,
    indexed_at,
  };
}
|
|
47
|
-
|
|
48
|
-
/** One chunk of a file as handed to `upsertChunksForPath`. */
export interface ChunkInput {
  /** Index of the chunk within its file; stored as `context_index.chunk_index`. */
  chunk_index: number;
  /** Text of the chunk. */
  chunk_content: string;
  /** Embedding vector; cast to `FLOAT[EMBEDDING_DIMENSION]` on insert. */
  embedding: number[];
}
|
|
53
|
-
|
|
54
|
-
/**
 * Swap the indexed rows of one file for a fresh set of chunks.
 *
 * Every existing row for `params.path` is deleted first, then one row is
 * inserted per chunk. The file-level `contentHash`, `mtimeMs` and `sizeBytes`
 * are written onto each row so a later reindex can decide whether the file
 * changed by looking at those columns alone.
 *
 * @param conn - Connection used for the DELETE and the INSERTs.
 * @param params - Target path, file-level metadata, and the chunks to store.
 */
export async function upsertChunksForPath(
  conn: DbConnection,
  params: {
    path: string;
    contentHash: string;
    mtimeMs: number;
    sizeBytes: number;
    chunks: ChunkInput[];
  },
): Promise<void> {
  const { path, contentHash, mtimeMs, sizeBytes, chunks } = params;
  const insertSql = `INSERT INTO context_index
       (path, chunk_index, content_hash, chunk_content, embedding, mtime_ms, size_bytes, indexed_at)
     VALUES (?1, ?2, ?3, ?4, ?5::FLOAT[${EMBEDDING_DIMENSION}], ?6, ?7, current_timestamp::VARCHAR)`;

  await conn.queryRun("DELETE FROM context_index WHERE path = ?1", path);

  // Inserts run one at a time, in chunk order.
  for (const { chunk_index, chunk_content, embedding } of chunks) {
    await conn.queryRun(
      insertSql,
      path,
      chunk_index,
      contentHash,
      chunk_content,
      embedding,
      mtimeMs,
      sizeBytes,
    );
  }
}
|
|
85
|
-
|
|
86
|
-
/**
 * Delete every index row stored for exactly `path`.
 *
 * @returns The `changes` count reported by the DELETE.
 */
export async function deleteIndexedPath(
  conn: DbConnection,
  path: string,
): Promise<number> {
  const { changes } = await conn.queryRun(
    "DELETE FROM context_index WHERE path = ?1",
    path,
  );
  return changes;
}
|
|
96
|
-
|
|
97
|
-
/**
 * Remove every indexed entry whose path equals `prefix` or lives beneath
 * `prefix/`. Used when a folder is deleted from `context/` and we need to
 * drop all child entries in one shot.
 *
 * The "beneath" test is a literal prefix comparison (`starts_with`) rather
 * than `LIKE`: with `LIKE`, any `_` or `%` inside `prefix` acts as a
 * wildcard, so deleting `my_dir` would also delete entries under `myXdir/`.
 *
 * @param conn - Connection used for the DELETE.
 * @param prefix - Folder (or file) path, without a trailing slash.
 * @returns The `changes` count reported by the DELETE.
 */
export async function deleteIndexedPathsUnder(
  conn: DbConnection,
  prefix: string,
): Promise<number> {
  const result = await conn.queryRun(
    "DELETE FROM context_index WHERE path = ?1 OR starts_with(path, ?2)",
    prefix,
    `${prefix}/`,
  );
  return result.changes;
}
|
|
113
|
-
|
|
114
|
-
/**
 * Per-file rollup of the index, as produced by `listIndexedPaths` and
 * `getIndexedPath` (one entry per distinct `path`).
 */
export interface IndexedPathSummary {
  path: string;
  /** File-level hash, taken from any one of the path's rows. */
  content_hash: string;
  /** File-level mtime in milliseconds, taken from any one of the path's rows. */
  mtime_ms: number;
  /** File-level size in bytes, taken from any one of the path's rows. */
  size_bytes: number;
  /** Number of rows (chunks) stored for the path. */
  chunk_count: number;
}
|
|
121
|
-
|
|
122
|
-
/**
 * Summarise the index per file: one entry per distinct path, ordered by
 * path ascending, with the file-level metadata and the number of chunks.
 *
 * The metadata columns are read with `ANY_VALUE` because every row of a
 * path is written with the same file-level values.
 */
export async function listIndexedPaths(
  conn: DbConnection,
): Promise<IndexedPathSummary[]> {
  type SummaryRow = {
    path: string;
    content_hash: string;
    mtime_ms: number;
    size_bytes: number;
    chunk_count: number;
  };
  const sql = `SELECT path,
            ANY_VALUE(content_hash) AS content_hash,
            ANY_VALUE(mtime_ms) AS mtime_ms,
            ANY_VALUE(size_bytes) AS size_bytes,
            COUNT(*) AS chunk_count
     FROM context_index
     GROUP BY path
     ORDER BY path ASC`;

  const summaries: IndexedPathSummary[] = [];
  for (const row of await conn.queryAll<SummaryRow>(sql)) {
    // Numeric columns are coerced explicitly, as elsewhere in this module.
    summaries.push({
      path: row.path,
      content_hash: row.content_hash,
      mtime_ms: Number(row.mtime_ms),
      size_bytes: Number(row.size_bytes),
      chunk_count: Number(row.chunk_count),
    });
  }
  return summaries;
}
|
|
149
|
-
|
|
150
|
-
/**
 * Look up the index summary for a single path.
 *
 * @returns The file-level metadata and chunk count, or `null` when no rows
 *   are stored for `path`.
 */
export async function getIndexedPath(
  conn: DbConnection,
  path: string,
): Promise<IndexedPathSummary | null> {
  type SummaryRow = {
    path: string;
    content_hash: string;
    mtime_ms: number;
    size_bytes: number;
    chunk_count: number;
  };
  const sql = `SELECT path,
            ANY_VALUE(content_hash) AS content_hash,
            ANY_VALUE(mtime_ms) AS mtime_ms,
            ANY_VALUE(size_bytes) AS size_bytes,
            COUNT(*) AS chunk_count
     FROM context_index
     WHERE path = ?1
     GROUP BY path`;

  const found = await conn.queryGet<SummaryRow>(sql, path);
  return found
    ? {
        path: found.path,
        content_hash: found.content_hash,
        mtime_ms: Number(found.mtime_ms),
        size_bytes: Number(found.size_bytes),
        chunk_count: Number(found.chunk_count),
      }
    : null;
}
|
|
180
|
-
|
|
181
|
-
/** An indexed chunk together with the relevance score a search assigned to it. */
export interface SearchResult extends IndexedChunk {
  /**
   * Higher is better. The scale depends on the search that produced it:
   * `1 - cosine distance` for semantic, BM25 for keyword, summed
   * reciprocal-rank terms for hybrid.
   */
  score: number;
}
|
|
184
|
-
|
|
185
|
-
/**
 * Nearest-neighbour search over `context_index.embedding` using cosine
 * distance. Rows with a NULL embedding are excluded.
 *
 * @param conn - Connection to query.
 * @param queryEmbedding - Query vector; cast to `FLOAT[EMBEDDING_DIMENSION]`.
 * @param limit - Maximum number of chunks to return (default 10).
 * @returns Chunks ordered closest-first, with `score = 1 - distance`.
 */
export async function searchSemantic(
  conn: DbConnection,
  queryEmbedding: number[],
  limit = 10,
): Promise<SearchResult[]> {
  type RowWithDistance = IndexRow & { distance: number };
  const sql = `SELECT *, array_cosine_distance(embedding, ?1::FLOAT[${EMBEDDING_DIMENSION}]) AS distance
     FROM context_index
     WHERE embedding IS NOT NULL
     ORDER BY distance ASC
     LIMIT ?2`;

  const nearest = await conn.queryAll<RowWithDistance>(
    sql,
    queryEmbedding,
    limit,
  );

  const results: SearchResult[] = [];
  for (const row of nearest) {
    // Turn distance into a similarity so that larger means closer.
    const similarity = 1 - row.distance;
    results.push({ ...rowToChunk(row), score: similarity });
  }
  return results;
}
|
|
208
|
-
|
|
209
|
-
/**
 * BM25 keyword search over the FTS index built on (chunk_content, path).
 * The FTS index is rebuilt by `rebuildSearchIndex`. Rows for which BM25
 * yields NULL (no match) are filtered out.
 *
 * @param conn - Connection to query.
 * @param query - Free-text query passed to `match_bm25`.
 * @param limit - Maximum number of chunks to return (default 10).
 * @returns Matching chunks ordered by BM25 score, best first.
 */
export async function searchKeyword(
  conn: DbConnection,
  query: string,
  limit = 10,
): Promise<SearchResult[]> {
  type RowWithScore = IndexRow & { score: number };
  // The FTS index is created with `path` as input_id (see
  // rebuildSearchIndex), so match_bm25's first argument must be the path
  // value, not rowid. Passing rowid silently returns no hits — searchHybrid
  // would then degrade to semantic-only.
  const sql = `SELECT context_index.*,
            fts_main_context_index.match_bm25(context_index.path, ?1) AS score
     FROM context_index
     WHERE fts_main_context_index.match_bm25(context_index.path, ?1) IS NOT NULL
     ORDER BY score DESC
     LIMIT ?2`;

  const matches = await conn.queryAll<RowWithScore>(sql, query, limit);

  const results: SearchResult[] = [];
  for (const row of matches) {
    results.push({ ...rowToChunk(row), score: Number(row.score) });
  }
  return results;
}
|
|
234
|
-
|
|
235
|
-
/**
 * Reciprocal-rank fusion of semantic + keyword results, deduped by
 * (path, chunk_index).
 *
 * Each ranker contributes `1 / (k + rank)` (rank starting at 1) for every
 * chunk it returns; contributions for the same chunk are summed and the
 * merged list is sorted by that sum, best first.
 *
 * @param conn - Connection to query.
 * @param query - Free-text query for the keyword ranker.
 * @param queryEmbedding - Query vector for the semantic ranker.
 * @param limit - Maximum number of fused results to return (default 10).
 * @returns Up to `limit` chunks whose `score` is the fused RRF score.
 */
export async function searchHybrid(
  conn: DbConnection,
  query: string,
  queryEmbedding: number[],
  limit = 10,
): Promise<SearchResult[]> {
  // RRF damping constant.
  const k = 60;
  // Each ranker is asked for at least 100 candidates. For larger integer
  // limits the pool grows with the limit, so the fused list is not
  // starved of candidates before it is cut to `limit`.
  const candidates = Number.isInteger(limit) && limit > 100 ? limit : 100;

  const [semantic, keyword] = await Promise.all([
    searchSemantic(conn, queryEmbedding, candidates),
    // Best-effort: a keyword-search failure degrades to semantic-only
    // results instead of failing the whole search.
    searchKeyword(conn, query, candidates).catch(() => [] as SearchResult[]),
  ]);

  const fused = new Map<string, { chunk: IndexedChunk; score: number }>();
  for (const ranked of [semantic, keyword]) {
    ranked.forEach((chunk, rank) => {
      const id = `${chunk.path}::${chunk.chunk_index}`;
      const rrf = 1 / (k + rank + 1);
      const entry = fused.get(id);
      if (entry) entry.score += rrf;
      else fused.set(id, { chunk, score: rrf });
    });
  }

  const merged = [...fused.values()].sort((a, b) => b.score - a.score);
  // Spread first so the fused score overrides the ranker's own `score`.
  return merged.slice(0, limit).map((m) => ({ ...m.chunk, score: m.score }));
}
|
|
273
|
-
|
|
274
|
-
/**
 * Recreate the FTS index over (chunk_content, path), keyed by `path`.
 *
 * DuckDB's FTS index is a snapshot — it is not maintained on
 * INSERT/UPDATE/DELETE — so a batch writer has to call this after its
 * transaction commits.
 *
 * The CHECKPOINT after the rebuild is load-bearing (see history):
 * `overwrite = 1` writes a `DROP SCHEMA fts_main_context_index` record into
 * the WAL, and without the checkpoint the replay on the next open can fail
 * with "Cannot drop entry 'fts_main_context_index' because there are entries
 * that depend on it".
 */
export async function rebuildSearchIndex(conn: DbConnection): Promise<void> {
  // A missing table (e.g. fresh tests with an empty schema) makes the FTS
  // extension error out, so only rebuild when `context_index` exists.
  const tableRow = await conn.queryGet<{ name: string }>(
    "SELECT table_name AS name FROM information_schema.tables WHERE table_name = 'context_index'",
  );
  if (tableRow) {
    await conn.exec(
      "PRAGMA create_fts_index('context_index', 'path', 'chunk_content', 'path', overwrite = 1)",
    );
    await conn.exec("CHECKPOINT");
  }
}
|
|
296
|
-
|
|
297
|
-
/**
 * Aggregate counts for the whole index.
 *
 * @returns `paths` — distinct indexed paths; `chunks` — total rows;
 *   `embedded` — rows whose embedding is not NULL. All are 0 when the
 *   query returns no row.
 */
export async function indexStats(conn: DbConnection): Promise<{
  paths: number;
  chunks: number;
  embedded: number;
}> {
  type Counts = { paths: number; chunks: number; embedded: number };
  const counts = await conn.queryGet<Counts>(
    `SELECT COUNT(DISTINCT path) AS paths,
            COUNT(*) AS chunks,
            COUNT(embedding) AS embedded
     FROM context_index`,
  );
  // Missing row or missing value → 0; everything else is coerced to a number.
  const toCount = (value: number | undefined): number => Number(value ?? 0);
  return {
    paths: toCount(counts?.paths),
    chunks: toCount(counts?.chunks),
    embedded: toCount(counts?.embedded),
  };
}
|
package/src/db/query.ts
DELETED
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
/** A value the clause builders in this module accept as a bound SQL parameter. */
type SqlParam = string | number | null;
|
|
2
|
-
|
|
3
|
-
/**
 * Validate that a value is a positive integer, suitable for use in
 * LIMIT / OFFSET clauses that must be interpolated into SQL strings.
 *
 * Only safe integers are accepted: larger integral values such as `1e21`
 * pass `Number.isInteger` but stringify in exponent form (`1e+21`), which
 * is not a valid integer literal once interpolated.
 *
 * @param val - Candidate value.
 * @returns `val` unchanged when it is a positive safe integer.
 * @throws Error when `val` is not a positive safe integer (including 0,
 *   negatives, fractions, NaN and ±Infinity).
 */
export function sanitizeInt(val: number): number {
  if (!Number.isSafeInteger(val) || val <= 0) {
    throw new Error(`Expected a positive integer, got: ${val}`);
  }
  return val;
}
|
|
13
|
-
|
|
14
|
-
/**
 * Build a WHERE clause from column-value pairs.
 *
 * - Entries with `undefined` values are skipped.
 * - Entries with `null` values become `col IS NULL` and bind no parameter,
 *   because `col = NULL` never matches in SQL.
 * - All other values become `col = ?N`, where N is the 1-based position of
 *   the value in the returned `params`.
 *
 * Column names are interpolated verbatim and must come from trusted code,
 * never from user input.
 *
 * @param filters - `[column, value]` pairs, AND-ed together in order.
 * @returns `where` — the clause including the `WHERE` keyword, or `""` when
 *   no filter applies; `params` — the values to bind, in placeholder order.
 */
export function buildWhereClause(filters: [string, SqlParam | undefined][]): {
  where: string;
  params: SqlParam[];
} {
  const conditions: string[] = [];
  const params: SqlParam[] = [];

  for (const [col, val] of filters) {
    if (val === undefined) continue;
    if (val === null) {
      conditions.push(`${col} IS NULL`);
      continue;
    }
    params.push(val);
    conditions.push(`${col} = ?${params.length}`);
  }

  const where =
    conditions.length > 0 ? `WHERE ${conditions.join(" AND ")}` : "";
  return { where, params };
}
|
|
36
|
-
|
|
37
|
-
/**
 * Turn column-value pairs into the assignment list of an UPDATE.
 *
 * Pairs whose value is `undefined` are left out. Every other pair — `null`
 * included — yields `col = ?N`, where N is the 1-based position of its value
 * in the returned `params`.
 *
 * @param fields - `[column, value]` pairs in the order they should appear.
 * @returns `setClauses` — one `col = ?N` string per kept pair (not joined);
 *   `params` — the values to bind, in placeholder order.
 */
export function buildSetClauses(fields: [string, SqlParam | undefined][]): {
  setClauses: string[];
  params: SqlParam[];
} {
  const assignments: string[] = [];
  const bound: SqlParam[] = [];

  fields.forEach(([column, value]) => {
    if (value === undefined) return;
    const position = bound.push(value); // push returns the new length
    assignments.push(`${column} = ?${position}`);
  });

  return { setClauses: assignments, params: bound };
}
|
package/src/db/schema.ts
DELETED
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import { readdirSync, readFileSync } from "node:fs";
|
|
2
|
-
import { join } from "node:path";
|
|
3
|
-
import { logger } from "../utils/logger.ts";
|
|
4
|
-
import type { DbConnection } from "./connection.ts";
|
|
5
|
-
import { rebuildSearchIndex } from "./embeddings.ts";
|
|
6
|
-
|
|
7
|
-
/** One SQL migration file, parsed from a `<id>-<name>.sql` filename. */
interface Migration {
  /** Numeric prefix of the filename; determines execution order. */
  id: number;
  /** Filename part between the `<id>-` prefix and the `.sql` suffix. */
  name: string;
  /** Full text of the file. */
  sql: string;
}
|
|
12
|
-
|
|
13
|
-
// Directory holding the `<id>-<name>.sql` migration files, resolved next to
// this module. NOTE(review): `import.meta.dir` looks runtime-specific — confirm.
const sqlDir = join(import.meta.dir, "sql");
|
|
14
|
-
|
|
15
|
-
/**
 * Read every `*.sql` file in `sqlDir` and return the migrations in
 * ascending numeric-id order.
 *
 * @throws Error when a `.sql` file is not named `<digits>-<name>.sql`.
 */
function loadMigrations(): Migration[] {
  const loaded: Migration[] = [];

  for (const file of readdirSync(sqlDir)) {
    if (!file.endsWith(".sql")) continue;

    const [, rawId, name] = /^(\d+)-(.+)\.sql$/.exec(file) ?? [];
    if (!rawId || !name) throw new Error(`Invalid migration filename: ${file}`);

    loaded.push({
      id: parseInt(rawId, 10),
      name,
      sql: readFileSync(join(sqlDir, file), "utf-8"),
    });
  }

  // Compare ids numerically so `12-` sorts after `2-`, not before it.
  return loaded.sort((left, right) => left.id - right.id);
}
|
|
34
|
-
|
|
35
|
-
/**
 * Bring the database schema up to date.
 *
 * Creates the `_migrations` bookkeeping table if needed, runs every
 * migration whose id is not recorded there (ascending id order, one
 * statement at a time), records each one after it ran, checkpoints when
 * anything was applied, and finally (re)builds the FTS index.
 *
 * @param db - Connection to migrate.
 */
export async function migrate(db: DbConnection): Promise<void> {
  // Bookkeeping table: one row per applied migration id.
  await db.exec(`
    CREATE TABLE IF NOT EXISTS _migrations (
      id INTEGER PRIMARY KEY,
      name TEXT NOT NULL,
      applied_at TEXT DEFAULT (current_timestamp::VARCHAR)
    )
  `);

  // Ids that have already been applied.
  const appliedRows = await db.queryAll<{ id: number }>(
    "SELECT id FROM _migrations",
  );
  const appliedIds = new Set<number>();
  for (const { id } of appliedRows) appliedIds.add(id);

  // Everything not yet applied, in the order loadMigrations() returns.
  const pending = loadMigrations().filter(({ id }) => !appliedIds.has(id));
  const pendingCount = pending.length;
  if (pendingCount > 0) {
    const plural = pendingCount === 1 ? "" : "s";
    logger.info(`applying ${pendingCount} migration${plural}`);
  }

  for (const { id, name, sql } of pending) {
    logger.info(` ${id}. ${name}`);

    // The file is split on `;` and each non-empty piece is executed on its own.
    for (const piece of sql.split(";")) {
      const statement = piece.trim();
      if (statement.length === 0) continue;
      await db.exec(statement);
    }

    await db.queryRun(
      "INSERT INTO _migrations (id, name) VALUES (?1, ?2)",
      id,
      name,
    );
  }

  // Flush the WAL so the next open has no schema entries to replay. DuckDB's
  // WAL replay of ALTER TABLE re-binds all column defaults on the target
  // table, and our CREATE TABLE defaults use `current_timestamp::VARCHAR` —
  // which cannot be resolved during replay (no default database attached yet),
  // crashing the process on reopen.
  // Reaching this point with a non-empty `pending` means every pending
  // migration ran, i.e. at least one was applied.
  if (pendingCount > 0) {
    await db.exec("CHECKPOINT");
  }

  // Ensure the FTS index exists. Migration 18 drops it (it can't recreate it
  // in the same SQL run without DuckDB rejecting the dependency commit), and
  // fresh DBs need it created at least once. `overwrite = 1` makes this
  // idempotent for DBs that already have a healthy FTS index.
  await rebuildSearchIndex(db);
}
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
-- Task queue. `priority` and `status` are restricted by CHECK constraints.
-- `blocked_by` and `context_ids` default to the text '[]' (presumably
-- JSON-encoded id lists — confirm against the task store). Timestamps are
-- stored as text.
CREATE TABLE tasks (
  id TEXT PRIMARY KEY,
  name TEXT NOT NULL,
  description TEXT NOT NULL DEFAULT '',
  priority TEXT NOT NULL DEFAULT 'medium' CHECK(priority IN ('low', 'medium', 'high')),
  status TEXT NOT NULL DEFAULT 'pending' CHECK(status IN ('pending', 'in_progress', 'failed', 'complete', 'waiting')),
  waiting_reason TEXT,
  claimed_by TEXT,
  claimed_at TEXT,
  blocked_by TEXT NOT NULL DEFAULT '[]',
  context_ids TEXT NOT NULL DEFAULT '[]',
  created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  updated_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR)
);
|
|
15
|
-
|
|
16
|
-
-- Recurring schedules. `frequency` is free text with no constraint here;
-- `last_run_at` stays NULL until it is set; new rows default to enabled.
CREATE TABLE schedules (
  id TEXT PRIMARY KEY,
  name TEXT NOT NULL,
  description TEXT NOT NULL DEFAULT '',
  frequency TEXT NOT NULL,
  last_run_at TEXT,
  enabled BOOLEAN NOT NULL DEFAULT true,
  created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  updated_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR)
);
|
|
26
|
-
|
|
27
|
-
-- Stored context documents. A row has both a text `content` column and a
-- binary `content_blob` column, with `is_textual` as a flag (presumably
-- selecting which of the two is populated — confirm against the store).
CREATE TABLE context_items (
  id TEXT PRIMARY KEY,
  title TEXT NOT NULL,
  description TEXT NOT NULL DEFAULT '',
  content TEXT,
  content_blob BLOB,
  mime_type TEXT NOT NULL DEFAULT 'text/plain',
  is_textual BOOLEAN NOT NULL DEFAULT true,
  source_path TEXT,
  context_path TEXT NOT NULL,
  indexed_at TEXT,
  created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  updated_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR)
);
|
|
41
|
-
|
|
42
|
-
-- One row per chunk of a context item: `(context_item_id, chunk_index)` is
-- unique, and `embedding` is a fixed-size 1536-float array.
CREATE TABLE embeddings (
  id TEXT PRIMARY KEY,
  context_item_id TEXT NOT NULL REFERENCES context_items(id),
  chunk_index INTEGER NOT NULL,
  chunk_content TEXT,
  title TEXT NOT NULL,
  description TEXT NOT NULL DEFAULT '',
  source_path TEXT,
  embedding FLOAT[1536],
  created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  UNIQUE(context_item_id, chunk_index)
);
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
-- Older DBs could accumulate duplicate rows in context_items with the same
-- context_path: migration 4's CREATE UNIQUE INDEX IF NOT EXISTS silently left
-- the index metadata in place without enforcing it when duplicates predated
-- the migration. The resulting "corrupt" unique index triggers a native
-- crash in @duckdb/node-api on UPDATE ... RETURNING. Rebuild cleanly.
DROP INDEX IF EXISTS idx_context_items_context_path;

-- Within each context_path, rows are ranked newest-first (updated_at, then
-- id as tie-breaker); `rn > 1` therefore selects every row except the one
-- being kept. Remove the embeddings of those rows first...
DELETE FROM embeddings WHERE context_item_id IN (
  SELECT id FROM (
    SELECT id, ROW_NUMBER() OVER (
      PARTITION BY context_path
      ORDER BY updated_at DESC, id DESC
    ) AS rn FROM context_items
  ) WHERE rn > 1
);

-- ...then the duplicate context_items themselves (same ranking as above).
DELETE FROM context_items WHERE id IN (
  SELECT id FROM (
    SELECT id, ROW_NUMBER() OVER (
      PARTITION BY context_path
      ORDER BY updated_at DESC, id DESC
    ) AS rn FROM context_items
  ) WHERE rn > 1
);

-- With one row left per context_path, recreate the unique index.
CREATE UNIQUE INDEX idx_context_items_context_path ON context_items(context_path);
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
-- Historical: this migration used to drop and recreate the HNSW index
-- to clean up an internally-inconsistent state after native-side crashes
-- during embedding writes. HNSW is now gone (see migration 14) and the
-- VSS extension is no longer loaded at connection time, so the original
-- DDL would fail on fresh DBs. Kept as a no-op to preserve migration
-- numbering for existing databases that have already recorded id 11 in
-- _migrations.
-- `SELECT 1` changes nothing; it is the file's only statement.
SELECT 1;
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
-- Worker agents: replaces the PID-file + OS-watchdog single-daemon model
-- with multiple in-DB registered workers that heartbeat and can be reaped.

-- One row per registered worker. `mode` and `status` are restricted by
-- CHECK constraints; `stopped_at` stays NULL until it is set.
CREATE TABLE workers (
  id TEXT PRIMARY KEY,
  pid INTEGER NOT NULL,
  hostname TEXT NOT NULL,
  mode TEXT NOT NULL CHECK(mode IN ('persist', 'once')),
  task_id TEXT,
  status TEXT NOT NULL CHECK(status IN ('running', 'stopped', 'dead')),
  started_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  last_heartbeat_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  stopped_at TEXT
);

-- Index on (status, last_heartbeat_at); presumably serves the lookup for
-- running workers with a stale heartbeat — confirm against the reaper query.
CREATE INDEX idx_workers_status_heartbeat ON workers(status, last_heartbeat_at);
|
|
17
|
-
|
|
18
|
-
-- Schedule claim columns: only one worker evaluates a schedule per window.
-- Both columns are nullable with no default.
ALTER TABLE schedules ADD COLUMN claimed_by TEXT;
ALTER TABLE schedules ADD COLUMN claimed_at TEXT;
|
|
21
|
-
|
|
22
|
-
-- Rewrite threads.type values: daemon_tick → worker_tick. The existing
-- CHECK constraint forbids the new value, so we rebuild both threads and
-- interactions (whose FK to threads would block a DROP). Dropping the FK
-- follows the 7-drop_embeddings_fk.sql precedent.

-- Step 1: copy the existing rows aside.
CREATE TABLE threads_backup AS SELECT * FROM threads;
CREATE TABLE interactions_backup AS SELECT * FROM interactions;

-- Step 2: drop the originals, interactions first because it references threads.
DROP TABLE interactions;
DROP TABLE threads;

-- Step 3: recreate threads with the new allowed `type` values.
CREATE TABLE threads (
  id TEXT PRIMARY KEY,
  type TEXT NOT NULL CHECK(type IN ('worker_tick', 'chat_session')),
  task_id TEXT,
  title TEXT NOT NULL DEFAULT '',
  started_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  ended_at TEXT,
  metadata TEXT
);

-- Step 4: recreate interactions; `thread_id` is a plain column here, with
-- no foreign key to threads.
CREATE TABLE interactions (
  id TEXT PRIMARY KEY,
  thread_id TEXT NOT NULL,
  sequence INTEGER NOT NULL,
  role TEXT NOT NULL CHECK(role IN ('user', 'assistant', 'system', 'tool')),
  kind TEXT NOT NULL CHECK(kind IN ('message', 'thinking', 'tool_use', 'tool_result', 'context_update', 'status_change')),
  content TEXT NOT NULL,
  tool_name TEXT,
  tool_input TEXT,
  duration_ms INTEGER,
  token_count INTEGER,
  created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  UNIQUE(thread_id, sequence)
);

-- Step 5: copy the rows back, renaming 'daemon_tick' on the way. The SELECT
-- lists its columns in the same order as the new table definition.
INSERT INTO threads
SELECT id,
  CASE WHEN type = 'daemon_tick' THEN 'worker_tick' ELSE type END,
  task_id, title, started_at, ended_at, metadata
FROM threads_backup;

-- NOTE(review): `SELECT *` relies on interactions_backup having the same
-- column order as the new interactions table — confirm against the earlier
-- migration that created interactions.
INSERT INTO interactions SELECT * FROM interactions_backup;

-- Step 6: remove the temporary copies.
DROP TABLE threads_backup;
DROP TABLE interactions_backup;
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
-- Milestone 10: collapse `source_path` + `context_path` + `source_type` into a
-- single `(drive, path)` identity pair. Pre-1.0, no backwards-compat promise —
-- we wipe context_items + embeddings and have the user re-add their content.
--
-- DuckDB's ALTER TABLE support is thin (no SET NOT NULL, flaky DROP COLUMN with
-- existing indexes), so this is a table rebuild. Order matters: drop indexes
-- first, then the old tables, then recreate with the new shape.

-- Empty both tables (the data is intentionally discarded, see above).
DELETE FROM embeddings;
DELETE FROM context_items;

DROP INDEX IF EXISTS idx_embeddings_cosine;
DROP INDEX IF EXISTS idx_context_items_context_path;

DROP TABLE embeddings;
DROP TABLE context_items;

-- New shape: `drive` + `path` take the place of the old path columns.
CREATE TABLE context_items (
  id TEXT PRIMARY KEY,
  title TEXT NOT NULL,
  description TEXT NOT NULL DEFAULT '',
  content TEXT,
  content_blob BLOB,
  mime_type TEXT NOT NULL DEFAULT 'text/plain',
  is_textual BOOLEAN NOT NULL DEFAULT true,
  drive TEXT NOT NULL,
  path TEXT NOT NULL,
  indexed_at TEXT,
  created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  updated_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR)
);

-- The (drive, path) pair is the unique identity of a context item.
CREATE UNIQUE INDEX idx_context_items_drive_path ON context_items(drive, path);

-- Recreated with no `source_path` column and no foreign key on
-- `context_item_id`.
CREATE TABLE embeddings (
  id TEXT PRIMARY KEY,
  context_item_id TEXT NOT NULL,
  chunk_index INTEGER NOT NULL,
  chunk_content TEXT,
  title TEXT NOT NULL,
  description TEXT NOT NULL DEFAULT '',
  embedding FLOAT[1536],
  created_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
  UNIQUE(context_item_id, chunk_index)
);

-- Checkpoint at the end of the rebuild.
CHECKPOINT;
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
-- HNSW has caused two separate corruption modes in this project: the
-- "Duplicate keys not allowed in high-level wrappers" failure addressed by
-- migration 11, and a second mode where the index silently returns zero rows
-- for cosine top-K queries (its stored SQL loses the `WITH (metric = 'cosine')`
-- clause). At our scale a linear scan of array_cosine_distance is plenty fast
-- and array_cosine_distance is a core DuckDB function — no VSS extension
-- required. Drop the index and move on.
-- `IF EXISTS` keeps this a no-op on databases where the index is absent.
DROP INDEX IF EXISTS idx_embeddings_cosine;
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
-- Keyword search uses DuckDB's FTS extension for BM25 ranking over
-- chunk_content and title. The index is a snapshot and must be rebuilt
-- after any write to the embeddings table. rebuildSearchIndex() in
-- src/db/embeddings.ts is the single entry point and is called from the
-- ingest transaction. overwrite = 1 makes this PRAGMA idempotent, which
-- also gives us a first-run rebuild for users upgrading from a DB that
-- never had FTS.
-- Arguments: table 'embeddings', document-id column 'id', then the indexed
-- text columns 'chunk_content' and 'title'.
PRAGMA create_fts_index('embeddings', 'id', 'chunk_content', 'title', overwrite = 1);
|
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
-- Issue #145: preserve the original URL that produced each context item so
-- `context refresh` can re-fetch loss-lessly for service-specific drives
-- (google-docs, github, ...). Nullable — local-origin drives (disk, agent,
-- tool writes) leave it NULL and use their own refresh path. Legacy rows
-- ingested before this column existed also leave it NULL and surface a
-- "re-add from URL" error on refresh.
-- Added without a default or NOT NULL constraint.
ALTER TABLE context_items ADD COLUMN source_url TEXT;
|