botholomew 0.12.3 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. package/README.md +91 -68
  2. package/package.json +3 -3
  3. package/src/chat/agent.ts +42 -82
  4. package/src/chat/session.ts +29 -25
  5. package/src/commands/capabilities.ts +1 -1
  6. package/src/commands/context.ts +177 -926
  7. package/src/commands/db.ts +9 -13
  8. package/src/commands/init.ts +4 -1
  9. package/src/commands/nuke.ts +57 -90
  10. package/src/commands/schedule.ts +103 -124
  11. package/src/commands/skill.ts +2 -2
  12. package/src/commands/task.ts +86 -95
  13. package/src/commands/thread.ts +107 -112
  14. package/src/commands/worker.ts +88 -88
  15. package/src/constants.ts +93 -16
  16. package/src/context/capabilities.ts +10 -10
  17. package/src/context/fetcher.ts +9 -10
  18. package/src/context/reindex.ts +189 -0
  19. package/src/context/store.ts +630 -0
  20. package/src/db/doctor.ts +1 -8
  21. package/src/db/embeddings.ts +227 -175
  22. package/src/db/sql/19-disk_backed_index.sql +36 -0
  23. package/src/db/sql/20-drop_db_tables_for_files.sql +19 -0
  24. package/src/fs/atomic.ts +217 -0
  25. package/src/fs/compat.ts +86 -0
  26. package/src/fs/sandbox.ts +279 -0
  27. package/src/init/index.ts +69 -52
  28. package/src/init/templates.ts +1 -1
  29. package/src/mcpx/client.ts +1 -1
  30. package/src/schedules/schema.ts +19 -0
  31. package/src/schedules/store.ts +296 -0
  32. package/src/skills/commands.ts +1 -3
  33. package/src/tasks/schema.ts +47 -0
  34. package/src/tasks/store.ts +486 -0
  35. package/src/threads/store.ts +559 -0
  36. package/src/tools/capabilities/refresh.ts +42 -21
  37. package/src/tools/context/pipe.ts +15 -71
  38. package/src/tools/context/update-beliefs.ts +3 -3
  39. package/src/tools/context/update-goals.ts +3 -3
  40. package/src/tools/dir/create.ts +26 -23
  41. package/src/tools/dir/size.ts +46 -17
  42. package/src/tools/dir/tree.ts +73 -279
  43. package/src/tools/file/copy.ts +50 -24
  44. package/src/tools/file/count-lines.ts +34 -10
  45. package/src/tools/file/delete.ts +44 -23
  46. package/src/tools/file/edit.ts +39 -14
  47. package/src/tools/file/exists.ts +12 -26
  48. package/src/tools/file/info.ts +25 -85
  49. package/src/tools/file/move.ts +39 -24
  50. package/src/tools/file/read.ts +32 -80
  51. package/src/tools/file/write.ts +14 -91
  52. package/src/tools/registry.ts +3 -7
  53. package/src/tools/schedule/create.ts +2 -2
  54. package/src/tools/schedule/list.ts +7 -3
  55. package/src/tools/search/fuse.ts +12 -33
  56. package/src/tools/search/index.ts +36 -43
  57. package/src/tools/search/regexp.ts +29 -17
  58. package/src/tools/search/semantic.ts +137 -51
  59. package/src/tools/skill/delete.ts +1 -1
  60. package/src/tools/skill/list.ts +1 -1
  61. package/src/tools/skill/write.ts +1 -1
  62. package/src/tools/task/create.ts +41 -16
  63. package/src/tools/task/delete.ts +3 -3
  64. package/src/tools/task/list.ts +6 -3
  65. package/src/tools/task/update.ts +31 -9
  66. package/src/tools/task/view.ts +6 -6
  67. package/src/tools/thread/list.ts +2 -2
  68. package/src/tools/thread/search.ts +208 -0
  69. package/src/tools/thread/view.ts +50 -5
  70. package/src/tools/worker/spawn.ts +28 -14
  71. package/src/tui/App.tsx +12 -19
  72. package/src/tui/components/ContextPanel.tsx +83 -316
  73. package/src/tui/components/SchedulePanel.tsx +34 -48
  74. package/src/tui/components/StatusBar.tsx +15 -15
  75. package/src/tui/components/TaskPanel.tsx +34 -38
  76. package/src/tui/components/ThreadPanel.tsx +29 -38
  77. package/src/tui/components/WorkerPanel.tsx +21 -19
  78. package/src/tui/markdown.ts +2 -8
  79. package/src/types/file-imports.d.ts +9 -0
  80. package/src/utils/title.ts +5 -7
  81. package/src/utils/v7-date.ts +47 -0
  82. package/src/worker/heartbeat.ts +46 -24
  83. package/src/worker/index.ts +13 -15
  84. package/src/worker/llm.ts +30 -37
  85. package/src/worker/prompt.ts +19 -41
  86. package/src/worker/schedules.ts +48 -69
  87. package/src/worker/spawn.ts +11 -11
  88. package/src/worker/tick.ts +39 -43
  89. package/src/workers/store.ts +247 -0
  90. package/src/commands/tools.ts +0 -367
  91. package/src/context/describer.ts +0 -140
  92. package/src/context/drives.ts +0 -110
  93. package/src/context/ingest.ts +0 -162
  94. package/src/context/refresh.ts +0 -183
  95. package/src/db/context.ts +0 -637
  96. package/src/db/daemon-state.ts +0 -6
  97. package/src/db/reembed.ts +0 -113
  98. package/src/db/schedules.ts +0 -213
  99. package/src/db/tasks.ts +0 -347
  100. package/src/db/threads.ts +0 -276
  101. package/src/db/workers.ts +0 -212
  102. package/src/tools/context/list-drives.ts +0 -36
  103. package/src/tools/context/refresh.ts +0 -165
  104. package/src/tools/context/search.ts +0 -54
@@ -1,248 +1,300 @@
1
1
  import { EMBEDDING_DIMENSION } from "../constants.ts";
2
2
  import type { DbConnection } from "./connection.ts";
3
- import { uuidv7 } from "./uuid.ts";
4
3
 
5
4
  if (!Number.isInteger(EMBEDDING_DIMENSION) || EMBEDDING_DIMENSION <= 0) {
6
5
  throw new Error(`Invalid EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION}`);
7
6
  }
8
7
 
9
- export interface Embedding {
10
- id: string;
11
- context_item_id: string;
8
+ /**
9
+ * Disk-backed search index over `<projectDir>/context/`. One row per
10
+ * `(path, chunk_index)`; `content_hash` is the file-level sha256 so the
11
+ * reindex algorithm can detect adds, updates, and removals in one pass.
12
+ */
13
+ export interface IndexedChunk {
14
+ path: string;
12
15
  chunk_index: number;
13
- chunk_content: string | null;
14
- title: string;
15
- description: string;
16
+ content_hash: string;
17
+ chunk_content: string;
16
18
  embedding: number[];
17
- created_at: Date;
18
- }
19
-
20
- export interface EmbeddingSearchResult extends Embedding {
21
- score: number;
19
+ mtime_ms: number;
20
+ size_bytes: number;
21
+ indexed_at: Date;
22
22
  }
23
23
 
24
- interface EmbeddingRow {
25
- id: string;
26
- context_item_id: string;
24
+ interface IndexRow {
25
+ path: string;
27
26
  chunk_index: number;
28
- chunk_content: string | null;
29
- title: string;
30
- description: string;
27
+ content_hash: string;
28
+ chunk_content: string;
31
29
  embedding: number[] | null;
32
- created_at: string;
30
+ mtime_ms: number;
31
+ size_bytes: number;
32
+ indexed_at: string;
33
33
  }
34
34
 
35
- function rowToEmbedding(row: EmbeddingRow): Embedding {
35
+ function rowToChunk(row: IndexRow): IndexedChunk {
36
36
  return {
37
- id: row.id,
38
- context_item_id: row.context_item_id,
37
+ path: row.path,
39
38
  chunk_index: row.chunk_index,
39
+ content_hash: row.content_hash,
40
40
  chunk_content: row.chunk_content,
41
- title: row.title,
42
- description: row.description,
43
41
  embedding: row.embedding ?? [],
44
- created_at: new Date(row.created_at),
42
+ mtime_ms: Number(row.mtime_ms),
43
+ size_bytes: Number(row.size_bytes),
44
+ indexed_at: new Date(row.indexed_at),
45
45
  };
46
46
  }
47
47
 
48
+ export interface ChunkInput {
49
+ chunk_index: number;
50
+ chunk_content: string;
51
+ embedding: number[];
52
+ }
53
+
48
54
  /**
49
- * Insert a single embedding row. Callers that bulk-write embeddings are
50
- * responsible for calling `rebuildSearchIndex()` afterward — the FTS index is
51
- * a snapshot and will not reflect new rows until rebuilt.
55
+ * Replace all rows for `path` with the supplied chunks. The file-level
56
+ * `content_hash` / `mtime_ms` / `size_bytes` are stored on every row so a
57
+ * subsequent reindex can short-circuit by comparing just those columns.
52
58
  */
53
- export async function createEmbedding(
59
+ export async function upsertChunksForPath(
54
60
  conn: DbConnection,
55
61
  params: {
56
- contextItemId: string;
57
- chunkIndex: number;
58
- chunkContent: string | null;
59
- title: string;
60
- description?: string;
61
- embedding: number[];
62
+ path: string;
63
+ contentHash: string;
64
+ mtimeMs: number;
65
+ sizeBytes: number;
66
+ chunks: ChunkInput[];
62
67
  },
63
- ): Promise<Embedding> {
64
- const id = uuidv7();
65
- await conn.queryRun(
66
- `INSERT INTO embeddings (id, context_item_id, chunk_index, chunk_content, title, description, embedding)
67
- VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7::FLOAT[${EMBEDDING_DIMENSION}])`,
68
- id,
69
- params.contextItemId,
70
- params.chunkIndex,
71
- params.chunkContent,
72
- params.title,
73
- params.description ?? "",
74
- params.embedding,
68
+ ): Promise<void> {
69
+ await conn.queryRun("DELETE FROM context_index WHERE path = ?1", params.path);
70
+ for (const c of params.chunks) {
71
+ await conn.queryRun(
72
+ `INSERT INTO context_index
73
+ (path, chunk_index, content_hash, chunk_content, embedding, mtime_ms, size_bytes, indexed_at)
74
+ VALUES (?1, ?2, ?3, ?4, ?5::FLOAT[${EMBEDDING_DIMENSION}], ?6, ?7, current_timestamp::VARCHAR)`,
75
+ params.path,
76
+ c.chunk_index,
77
+ params.contentHash,
78
+ c.chunk_content,
79
+ c.embedding,
80
+ params.mtimeMs,
81
+ params.sizeBytes,
82
+ );
83
+ }
84
+ }
85
+
86
+ export async function deleteIndexedPath(
87
+ conn: DbConnection,
88
+ path: string,
89
+ ): Promise<number> {
90
+ const result = await conn.queryRun(
91
+ "DELETE FROM context_index WHERE path = ?1",
92
+ path,
75
93
  );
94
+ return result.changes;
95
+ }
76
96
 
77
- return {
78
- id,
79
- context_item_id: params.contextItemId,
80
- chunk_index: params.chunkIndex,
81
- chunk_content: params.chunkContent,
82
- title: params.title,
83
- description: params.description ?? "",
84
- embedding: params.embedding,
85
- created_at: new Date(),
86
- };
97
+ export interface IndexedPathSummary {
98
+ path: string;
99
+ content_hash: string;
100
+ mtime_ms: number;
101
+ size_bytes: number;
102
+ chunk_count: number;
87
103
  }
88
104
 
89
- export async function getEmbeddingsForItem(
105
+ export async function listIndexedPaths(
90
106
  conn: DbConnection,
91
- contextItemId: string,
92
- ): Promise<Embedding[]> {
93
- const rows = await conn.queryAll<EmbeddingRow>(
94
- "SELECT * FROM embeddings WHERE context_item_id = ?1 ORDER BY chunk_index ASC",
95
- contextItemId,
107
+ ): Promise<IndexedPathSummary[]> {
108
+ const rows = await conn.queryAll<{
109
+ path: string;
110
+ content_hash: string;
111
+ mtime_ms: number;
112
+ size_bytes: number;
113
+ chunk_count: number;
114
+ }>(
115
+ `SELECT path,
116
+ ANY_VALUE(content_hash) AS content_hash,
117
+ ANY_VALUE(mtime_ms) AS mtime_ms,
118
+ ANY_VALUE(size_bytes) AS size_bytes,
119
+ COUNT(*) AS chunk_count
120
+ FROM context_index
121
+ GROUP BY path
122
+ ORDER BY path ASC`,
96
123
  );
97
- return rows.map(rowToEmbedding);
124
+ return rows.map((r) => ({
125
+ path: r.path,
126
+ content_hash: r.content_hash,
127
+ mtime_ms: Number(r.mtime_ms),
128
+ size_bytes: Number(r.size_bytes),
129
+ chunk_count: Number(r.chunk_count),
130
+ }));
98
131
  }
99
132
 
100
- /**
101
- * Delete all embeddings for a context item. Callers are responsible for
102
- * calling `rebuildSearchIndex()` afterward — the FTS index is a snapshot and
103
- * will still reference the deleted rows until rebuilt.
104
- */
105
- export async function deleteEmbeddingsForItem(
133
+ export async function getIndexedPath(
106
134
  conn: DbConnection,
107
- contextItemId: string,
108
- ): Promise<number> {
109
- const result = await conn.queryRun(
110
- "DELETE FROM embeddings WHERE context_item_id = ?1",
111
- contextItemId,
135
+ path: string,
136
+ ): Promise<IndexedPathSummary | null> {
137
+ const row = await conn.queryGet<{
138
+ path: string;
139
+ content_hash: string;
140
+ mtime_ms: number;
141
+ size_bytes: number;
142
+ chunk_count: number;
143
+ }>(
144
+ `SELECT path,
145
+ ANY_VALUE(content_hash) AS content_hash,
146
+ ANY_VALUE(mtime_ms) AS mtime_ms,
147
+ ANY_VALUE(size_bytes) AS size_bytes,
148
+ COUNT(*) AS chunk_count
149
+ FROM context_index
150
+ WHERE path = ?1
151
+ GROUP BY path`,
152
+ path,
112
153
  );
113
- return result.changes;
154
+ if (!row) return null;
155
+ return {
156
+ path: row.path,
157
+ content_hash: row.content_hash,
158
+ mtime_ms: Number(row.mtime_ms),
159
+ size_bytes: Number(row.size_bytes),
160
+ chunk_count: Number(row.chunk_count),
161
+ };
114
162
  }
115
163
 
116
- interface VectorSearchRow extends EmbeddingRow {
117
- distance: number;
164
+ export interface SearchResult extends IndexedChunk {
165
+ score: number;
118
166
  }
119
167
 
120
168
  /**
121
- * Vector similarity search using DuckDB's array_cosine_distance().
122
- * With an HNSW index on the embedding column, DuckDB automatically
123
- * uses the index for top-k queries. Returns results sorted by
124
- * similarity (closest first), with score = 1 - distance.
169
+ * Vector similarity over `context_index.embedding`. Returns chunks sorted by
170
+ * cosine similarity (higher = closer). Skips rows whose embedding is NULL.
125
171
  */
126
- export async function searchEmbeddings(
172
+ export async function searchSemantic(
127
173
  conn: DbConnection,
128
174
  queryEmbedding: number[],
129
175
  limit = 10,
130
- ): Promise<EmbeddingSearchResult[]> {
131
- const rows = await conn.queryAll<VectorSearchRow>(
176
+ ): Promise<SearchResult[]> {
177
+ const rows = await conn.queryAll<IndexRow & { distance: number }>(
132
178
  `SELECT *, array_cosine_distance(embedding, ?1::FLOAT[${EMBEDDING_DIMENSION}]) AS distance
133
- FROM embeddings
134
- ORDER BY distance ASC
135
- LIMIT ?2`,
179
+ FROM context_index
180
+ WHERE embedding IS NOT NULL
181
+ ORDER BY distance ASC
182
+ LIMIT ?2`,
136
183
  queryEmbedding,
137
184
  limit,
138
185
  );
139
-
140
186
  return rows.map((row) => ({
141
- ...rowToEmbedding(row),
187
+ ...rowToChunk(row),
142
188
  score: 1 - row.distance,
143
189
  }));
144
190
  }
145
191
 
146
- export interface HybridSearchResult extends EmbeddingSearchResult {
147
- drive: string | null;
148
- path: string | null;
149
- }
150
-
151
192
  /**
152
- * Rebuild the FTS index over (chunk_content, title). DuckDB's FTS index is a
153
- * snapshot — it does not update incrementally on INSERT/UPDATE/DELETE, so any
154
- * batch writer must call this once its transaction commits. Cheap at our
155
- * scale (hundreds to low thousands of rows).
156
- *
157
- * The trailing CHECKPOINT is load-bearing: `overwrite = 1` writes a
158
- * `DROP SCHEMA fts_main_embeddings` record into the WAL. If the WAL still
159
- * contains that drop on the next open, replay fails with "Cannot drop entry
160
- * 'fts_main_embeddings' because there are entries that depend on it". Forcing
161
- * a checkpoint flushes the WAL so the next open has nothing to replay.
193
+ * BM25 keyword search over (chunk_content, path). The FTS index is rebuilt
194
+ * lazily by `rebuildSearchIndex`. Rows with a NULL BM25 score are filtered out.
162
195
  */
163
- export async function rebuildSearchIndex(conn: DbConnection): Promise<void> {
164
- await conn.exec(
165
- "PRAGMA create_fts_index('embeddings', 'id', 'chunk_content', 'title', overwrite = 1)",
196
+ export async function searchKeyword(
197
+ conn: DbConnection,
198
+ query: string,
199
+ limit = 10,
200
+ ): Promise<SearchResult[]> {
201
+ // The FTS index is created with `path` as input_id (see
202
+ // rebuildSearchIndex), so match_bm25's first argument must be the path
203
+ // value, not rowid. Passing rowid silently returns no hits — searchHybrid
204
+ // would then degrade to semantic-only.
205
+ const rows = await conn.queryAll<IndexRow & { score: number }>(
206
+ `SELECT context_index.*,
207
+ fts_main_context_index.match_bm25(context_index.path, ?1) AS score
208
+ FROM context_index
209
+ WHERE fts_main_context_index.match_bm25(context_index.path, ?1) IS NOT NULL
210
+ ORDER BY score DESC
211
+ LIMIT ?2`,
212
+ query,
213
+ limit,
166
214
  );
167
- await conn.exec("CHECKPOINT");
215
+ return rows.map((row) => ({ ...rowToChunk(row), score: Number(row.score) }));
168
216
  }
169
217
 
170
- export async function hybridSearch(
218
+ /**
219
+ * Reciprocal-rank fusion of semantic + keyword results, deduped by
220
+ * (path, chunk_index).
221
+ */
222
+ export async function searchHybrid(
171
223
  conn: DbConnection,
172
224
  query: string,
173
225
  queryEmbedding: number[],
174
226
  limit = 10,
175
- ): Promise<HybridSearchResult[]> {
176
- const k = 60; // RRF constant
177
-
178
- // Keyword side: BM25 over chunk_content + title via the FTS extension.
179
- // `match_bm25` returns NULL for rows with no token overlap; we keep only
180
- // scored rows and order by descending score so RRF sees the best matches
181
- // at the lowest ranks. Stemming, stopwords, and tokenization are handled
182
- // by FTS — more query terms produce higher scores, which is exactly the
183
- // behaviour a naive per-token ILIKE loop fails to provide.
184
- const keywordRows = await conn.queryAll<EmbeddingRow>(
185
- `SELECT * FROM embeddings
186
- WHERE fts_main_embeddings.match_bm25(id, ?1) IS NOT NULL
187
- ORDER BY fts_main_embeddings.match_bm25(id, ?1) DESC
188
- LIMIT 100`,
189
- query,
190
- );
191
-
192
- const keywordRanked = keywordRows.map(rowToEmbedding);
193
-
194
- const vectorResults = await searchEmbeddings(conn, queryEmbedding, 100);
227
+ ): Promise<SearchResult[]> {
228
+ const k = 60;
229
+ const [semantic, keyword] = await Promise.all([
230
+ searchSemantic(conn, queryEmbedding, 100),
231
+ searchKeyword(conn, query, 100).catch(() => [] as SearchResult[]),
232
+ ]);
195
233
 
196
- const scores = new Map<string, { embedding: Embedding; score: number }>();
234
+ const scores = new Map<string, { chunk: IndexedChunk; score: number }>();
235
+ const key = (c: IndexedChunk) => `${c.path}::${c.chunk_index}`;
197
236
 
198
- for (const [i, emb] of keywordRanked.entries()) {
199
- const rrfScore = 1 / (k + i + 1);
200
- const existing = scores.get(emb.id);
201
- if (existing) {
202
- existing.score += rrfScore;
203
- } else {
204
- scores.set(emb.id, { embedding: emb, score: rrfScore });
205
- }
237
+ for (let i = 0; i < semantic.length; i++) {
238
+ const c = semantic[i];
239
+ if (!c) continue;
240
+ const existing = scores.get(key(c));
241
+ const rrf = 1 / (k + i + 1);
242
+ if (existing) existing.score += rrf;
243
+ else scores.set(key(c), { chunk: c, score: rrf });
206
244
  }
207
-
208
- for (const [i, emb] of vectorResults.entries()) {
209
- const rrfScore = 1 / (k + i + 1);
210
- const existing = scores.get(emb.id);
211
- if (existing) {
212
- existing.score += rrfScore;
213
- } else {
214
- scores.set(emb.id, { embedding: emb, score: rrfScore });
215
- }
245
+ for (let i = 0; i < keyword.length; i++) {
246
+ const c = keyword[i];
247
+ if (!c) continue;
248
+ const existing = scores.get(key(c));
249
+ const rrf = 1 / (k + i + 1);
250
+ if (existing) existing.score += rrf;
251
+ else scores.set(key(c), { chunk: c, score: rrf });
216
252
  }
253
+ const merged = [...scores.values()].sort((a, b) => b.score - a.score);
254
+ return merged.slice(0, limit).map((m) => ({ ...m.chunk, score: m.score }));
255
+ }
217
256
 
218
- const merged = Array.from(scores.values());
219
- merged.sort((a, b) => b.score - a.score);
220
-
221
- const top = merged.slice(0, limit);
222
- if (top.length === 0) return [];
223
-
224
- // Look up drive + path from context_items for each surviving embedding
225
- const itemIds = Array.from(
226
- new Set(top.map((t) => t.embedding.context_item_id)),
257
+ /**
258
+ * Rebuild the FTS index over (chunk_content, path). DuckDB's FTS index is a
259
+ * snapshot — it does not update incrementally on INSERT/UPDATE/DELETE, so any
260
+ * batch writer must call this once its transaction commits.
261
+ *
262
+ * The trailing CHECKPOINT is load-bearing (see history): `overwrite = 1`
263
+ * writes a `DROP SCHEMA fts_main_context_index` record into the WAL; without
264
+ * the checkpoint, replay on the next open can fail with "Cannot drop entry
265
+ * 'fts_main_context_index' because there are entries that depend on it".
266
+ */
267
+ export async function rebuildSearchIndex(conn: DbConnection): Promise<void> {
268
+ // Skip if the table doesn't exist yet (e.g., fresh tests with an empty
269
+ // schema). The FTS extension errors out on a missing table.
270
+ const exists = await conn.queryGet<{ name: string }>(
271
+ "SELECT table_name AS name FROM information_schema.tables WHERE table_name = 'context_index'",
227
272
  );
228
- const placeholders = itemIds.map((_, i) => `?${i + 1}`).join(", ");
229
- const itemRows = await conn.queryAll<{
230
- id: string;
231
- drive: string;
232
- path: string;
273
+ if (!exists) return;
274
+ await conn.exec(
275
+ "PRAGMA create_fts_index('context_index', 'path', 'chunk_content', 'path', overwrite = 1)",
276
+ );
277
+ await conn.exec("CHECKPOINT");
278
+ }
279
+
280
+ export async function indexStats(conn: DbConnection): Promise<{
281
+ paths: number;
282
+ chunks: number;
283
+ embedded: number;
284
+ }> {
285
+ const row = await conn.queryGet<{
286
+ paths: number;
287
+ chunks: number;
288
+ embedded: number;
233
289
  }>(
234
- `SELECT id, drive, path FROM context_items WHERE id IN (${placeholders})`,
235
- ...itemIds,
290
+ `SELECT COUNT(DISTINCT path) AS paths,
291
+ COUNT(*) AS chunks,
292
+ COUNT(embedding) AS embedded
293
+ FROM context_index`,
236
294
  );
237
- const itemIndex = new Map(itemRows.map((r) => [r.id, r]));
238
-
239
- return top.map((entry) => {
240
- const item = itemIndex.get(entry.embedding.context_item_id);
241
- return {
242
- ...entry.embedding,
243
- score: entry.score,
244
- drive: item?.drive ?? null,
245
- path: item?.path ?? null,
246
- };
247
- });
295
+ return {
296
+ paths: Number(row?.paths ?? 0),
297
+ chunks: Number(row?.chunks ?? 0),
298
+ embedded: Number(row?.embedded ?? 0),
299
+ };
248
300
  }
@@ -0,0 +1,36 @@
1
+ -- Switch the search index from "tracks DuckDB-backed virtual files" to
2
+ -- "tracks real files on disk under context/", and drop every table whose
3
+ -- contents now live on the filesystem (tasks, schedules) or that nothing
4
+ -- writes to anymore (daemon_state). The remaining DuckDB tables are:
5
+ -- workers, threads, interactions, context_index, _migrations
6
+ --
7
+ -- A new `context_index` table holds one row per (path, chunk_index), with a
8
+ -- file-level content hash + mtime so `botholomew context reindex` can detect
9
+ -- adds, updates, and removals in one pass.
10
+ --
11
+ -- Idempotent: every step uses IF [NOT] EXISTS so a partial prior run is safe to
12
+ -- re-attempt. The FTS index over the new chunk_content column is created by
13
+ -- migrate() via rebuildSearchIndex() after all migrations apply.
14
+
15
+ DROP SCHEMA IF EXISTS fts_main_embeddings CASCADE;
16
+ DROP TABLE IF EXISTS embeddings;
17
+ DROP TABLE IF EXISTS context_items;
18
+ DROP TABLE IF EXISTS tasks;
19
+ DROP TABLE IF EXISTS schedules;
20
+ DROP TABLE IF EXISTS daemon_state;
21
+
22
+ CREATE TABLE IF NOT EXISTS context_index (
23
+ path TEXT NOT NULL,
24
+ chunk_index INTEGER NOT NULL,
25
+ content_hash TEXT NOT NULL,
26
+ chunk_content TEXT NOT NULL,
27
+ embedding FLOAT[384],
28
+ mtime_ms BIGINT NOT NULL,
29
+ size_bytes BIGINT NOT NULL,
30
+ indexed_at TEXT NOT NULL DEFAULT (current_timestamp::VARCHAR),
31
+ PRIMARY KEY (path, chunk_index)
32
+ );
33
+
34
+ CREATE INDEX IF NOT EXISTS idx_context_index_path ON context_index(path);
35
+
36
+ CHECKPOINT;
@@ -0,0 +1,19 @@
1
+ -- Tasks, schedules, threads, interactions, and workers all moved out of
2
+ -- DuckDB onto disk:
3
+ -- tasks/ markdown files with frontmatter (one per task)
4
+ -- schedules/ markdown files with frontmatter (one per schedule)
5
+ -- threads/ CSV per conversation (searchable via the index)
6
+ -- workers/ JSON pidfile per worker, mtime-checked heartbeats
7
+ --
8
+ -- The only remaining DuckDB objects after this migration are _migrations,
9
+ -- context_index, and the FTS index built over context_index by
10
+ -- rebuildSearchIndex(). Idempotent via IF EXISTS.
11
+
12
+ DROP TABLE IF EXISTS interactions;
13
+ DROP TABLE IF EXISTS threads;
14
+ DROP TABLE IF EXISTS workers;
15
+
16
+ DROP TABLE IF EXISTS tasks;
17
+ DROP TABLE IF EXISTS schedules;
18
+
19
+ CHECKPOINT;