botholomew 0.9.5 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/chat/agent.ts +19 -2
- package/src/commands/context.ts +1 -0
- package/src/context/chunker.ts +40 -3
- package/src/context/ingest.ts +8 -1
- package/src/context/refresh.ts +28 -18
- package/src/db/connection.ts +2 -4
- package/src/db/context.ts +16 -3
- package/src/db/embeddings.ts +37 -2
- package/src/db/schema.ts +10 -2
- package/src/db/sql/11-rebuild_hnsw.sql +8 -9
- package/src/db/sql/13-drive-paths.sql +0 -2
- package/src/db/sql/14-drop_hnsw_index.sql +8 -0
- package/src/db/sql/15-fts_index.sql +8 -0
- package/src/db/sql/16-source_url.sql +7 -0
- package/src/db/sql/6-vss_index.sql +7 -1
- package/src/db/sql/7-drop_embeddings_fk.sql +0 -1
- package/src/worker/prompt.ts +19 -2
package/package.json
CHANGED
package/src/chat/agent.ts
CHANGED
|
@@ -115,9 +115,26 @@ Format your responses using Markdown. Use headings, bold, italic, lists, and cod
|
|
|
115
115
|
prompt += `
|
|
116
116
|
## External Tools (MCP)
|
|
117
117
|
|
|
118
|
-
|
|
118
|
+
### Local context first
|
|
119
119
|
|
|
120
|
-
|
|
120
|
+
**Before any MCP read, search local context.** Drive, Gmail, GitHub, URLs, and prior agent runs are usually already ingested — refetching is slower, costs tokens, and risks rate limits.
|
|
121
|
+
|
|
122
|
+
Workflow for any "look up / find / read" intent:
|
|
123
|
+
|
|
124
|
+
1. \`search_semantic\` (semantic) or \`context_search\` (keyword), then \`context_read\` / \`context_tree\` to drill in.
|
|
125
|
+
2. If freshness matters, call \`context_info\` and check \`indexed_at\`. To re-pull a single stale item, use \`context_refresh\` rather than going to MCP for the whole document.
|
|
126
|
+
3. Only call \`mcp_exec\` for reads when the data is genuinely missing locally **or** must be real-time (e.g., "what's on my calendar right now").
|
|
127
|
+
|
|
128
|
+
Writes always go through MCP — sending an email, creating an issue, posting to Slack. Don't search context first for those.
|
|
129
|
+
|
|
130
|
+
Examples:
|
|
131
|
+
- "What does doc X say?" → \`search_semantic\` first.
|
|
132
|
+
- "Any new emails from Y?" → check the \`gmail\` drive first; only hit Gmail MCP if the freshest indexed item is too old for the question.
|
|
133
|
+
- "Send an email to Y" → MCP write directly; no context lookup.
|
|
134
|
+
|
|
135
|
+
### Calling MCP tools
|
|
136
|
+
|
|
137
|
+
Before calling any MCP tool you haven't used yet this session, you MUST fetch its schema first:
|
|
121
138
|
|
|
122
139
|
1. Discover tools with \`mcp_search\` (preferred — semantic) or \`mcp_list_tools\`.
|
|
123
140
|
2. Call \`mcp_info\` with the exact \`server\` and \`tool\` to read the tool's input schema, required fields, and types.
|
package/src/commands/context.ts
CHANGED
package/src/context/chunker.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import Anthropic from "@anthropic-ai/sdk";
|
|
2
2
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
3
|
+
import { logger } from "../utils/logger.ts";
|
|
3
4
|
|
|
4
5
|
export interface Chunk {
|
|
5
6
|
index: number;
|
|
@@ -16,6 +17,10 @@ const DEFAULT_OVERLAP_LINES = 2;
|
|
|
16
17
|
// 8192-token limit, leaving headroom for the title/description prefix
|
|
17
18
|
// prepended at embed time.
|
|
18
19
|
const MAX_CHUNK_CHARS = 15_000;
|
|
20
|
+
// Target size for deterministic fallback chunks. Smaller than MAX_CHUNK_CHARS
|
|
21
|
+
// so a large doc produces multiple chunks of reasonable granularity when the
|
|
22
|
+
// LLM chunker fails.
|
|
23
|
+
const FALLBACK_TARGET_CHARS = 4_000;
|
|
19
24
|
|
|
20
25
|
const CHUNKER_TOOL_NAME = "return_chunks";
|
|
21
26
|
const CHUNKER_TOOL = {
|
|
@@ -152,6 +157,26 @@ export function addOverlapToChunks(
|
|
|
152
157
|
});
|
|
153
158
|
}
|
|
154
159
|
|
|
160
|
+
export type LLMChunkerFn = (
|
|
161
|
+
content: string,
|
|
162
|
+
mimeType: string,
|
|
163
|
+
config: Required<BotholomewConfig>,
|
|
164
|
+
) => Promise<Chunk[]>;
|
|
165
|
+
|
|
166
|
+
/**
|
|
167
|
+
* Deterministic fallback that splits content on paragraph / line /
|
|
168
|
+
* hard-char boundaries. Used when the LLM chunker errors or times out.
|
|
169
|
+
*/
|
|
170
|
+
export function chunkByTextSplit(
|
|
171
|
+
content: string,
|
|
172
|
+
targetChars = FALLBACK_TARGET_CHARS,
|
|
173
|
+
): Chunk[] {
|
|
174
|
+
return splitText(content, targetChars).map((c, i) => ({
|
|
175
|
+
index: i,
|
|
176
|
+
content: c,
|
|
177
|
+
}));
|
|
178
|
+
}
|
|
179
|
+
|
|
155
180
|
/**
|
|
156
181
|
* LLM-driven chunker that asks Claude to identify semantic boundaries.
|
|
157
182
|
* Uses structured outputs via tool_use with forced tool_choice.
|
|
@@ -167,7 +192,7 @@ export async function chunkWithLLM(
|
|
|
167
192
|
const response = await Promise.race([
|
|
168
193
|
client.messages.create({
|
|
169
194
|
model: config.chunker_model,
|
|
170
|
-
max_tokens:
|
|
195
|
+
max_tokens: 2048,
|
|
171
196
|
tools: [CHUNKER_TOOL],
|
|
172
197
|
tool_choice: { type: "tool", name: CHUNKER_TOOL_NAME },
|
|
173
198
|
messages: [
|
|
@@ -209,13 +234,15 @@ ${content}`,
|
|
|
209
234
|
}
|
|
210
235
|
|
|
211
236
|
/**
|
|
212
|
-
* Chunk content using the LLM chunker
|
|
237
|
+
* Chunk content using the LLM chunker, with a deterministic fallback
|
|
238
|
+
* when the LLM call fails (timeout, empty boundaries, API error, …).
|
|
213
239
|
* Short content (<200 chars) is returned as a single chunk.
|
|
214
240
|
*/
|
|
215
241
|
export async function chunk(
|
|
216
242
|
content: string,
|
|
217
243
|
mimeType: string,
|
|
218
244
|
config: Required<BotholomewConfig>,
|
|
245
|
+
llmChunker: LLMChunkerFn = chunkWithLLM,
|
|
219
246
|
): Promise<Chunk[]> {
|
|
220
247
|
if (content.length < SHORT_CONTENT_THRESHOLD) {
|
|
221
248
|
return [{ index: 0, content }];
|
|
@@ -227,7 +254,17 @@ export async function chunk(
|
|
|
227
254
|
);
|
|
228
255
|
}
|
|
229
256
|
|
|
230
|
-
|
|
257
|
+
let chunks: Chunk[];
|
|
258
|
+
try {
|
|
259
|
+
chunks = await llmChunker(content, mimeType, config);
|
|
260
|
+
} catch (err) {
|
|
261
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
262
|
+
logger.warn(
|
|
263
|
+
`chunker: LLM chunking failed (${msg}); falling back to deterministic text split`,
|
|
264
|
+
);
|
|
265
|
+
chunks = chunkByTextSplit(content);
|
|
266
|
+
}
|
|
267
|
+
|
|
231
268
|
// Enforce a hard size cap before AND after overlap. The first pass handles
|
|
232
269
|
// oversize chunks from the LLM (common for docs with very long lines); the
|
|
233
270
|
// second pass handles the rare case where added overlap pushes a near-limit
|
package/src/context/ingest.ts
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
import type { BotholomewConfig } from "../config/schemas.ts";
|
|
2
2
|
import type { DbConnection } from "../db/connection.ts";
|
|
3
3
|
import { getContextItem, getContextItemById } from "../db/context.ts";
|
|
4
|
-
import {
|
|
4
|
+
import {
|
|
5
|
+
createEmbedding,
|
|
6
|
+
deleteEmbeddingsForItem,
|
|
7
|
+
rebuildSearchIndex,
|
|
8
|
+
} from "../db/embeddings.ts";
|
|
5
9
|
import { logger } from "../utils/logger.ts";
|
|
6
10
|
import { chunk } from "./chunker.ts";
|
|
7
11
|
import { type DriveTarget, formatDriveRef } from "./drives.ts";
|
|
@@ -121,6 +125,9 @@ export async function storeIngestion(
|
|
|
121
125
|
throw err;
|
|
122
126
|
}
|
|
123
127
|
|
|
128
|
+
// FTS index is a snapshot and doesn't see the writes above until rebuilt.
|
|
129
|
+
await rebuildSearchIndex(conn);
|
|
130
|
+
|
|
124
131
|
const action = isUpdate ? "updated" : "added";
|
|
125
132
|
logger.info(
|
|
126
133
|
`ingest: ${action} ${prepared.chunks.length} chunks for "${prepared.title}" (${prepared.itemId})`,
|
package/src/context/refresh.ts
CHANGED
|
@@ -3,7 +3,7 @@ import type { BotholomewConfig } from "../config/schemas.ts";
|
|
|
3
3
|
import type { DbConnection } from "../db/connection.ts";
|
|
4
4
|
import { type ContextItem, updateContextItem } from "../db/context.ts";
|
|
5
5
|
import { formatDriveRef } from "./drives.ts";
|
|
6
|
-
import { fetchUrl } from "./fetcher.ts";
|
|
6
|
+
import { type FetchedContent, fetchUrl } from "./fetcher.ts";
|
|
7
7
|
import {
|
|
8
8
|
type PreparedIngestion,
|
|
9
9
|
prepareIngestion,
|
|
@@ -40,6 +40,13 @@ export interface RefreshOptions {
|
|
|
40
40
|
|
|
41
41
|
type IngestEmbedFn = (texts: string[]) => Promise<number[][]>;
|
|
42
42
|
|
|
43
|
+
/** Signature compatible with {@link fetchUrl}. Injectable for tests. */
|
|
44
|
+
export type FetchUrlFn = (
|
|
45
|
+
url: string,
|
|
46
|
+
config: Required<BotholomewConfig>,
|
|
47
|
+
mcpxClient: McpxClient | null,
|
|
48
|
+
) => Promise<FetchedContent>;
|
|
49
|
+
|
|
43
50
|
/**
|
|
44
51
|
* Refresh a batch of context items: re-read from origin, diff, update
|
|
45
52
|
* content, and re-embed only the items that changed.
|
|
@@ -47,10 +54,12 @@ type IngestEmbedFn = (texts: string[]) => Promise<number[][]>;
|
|
|
47
54
|
* Dispatches on `drive`:
|
|
48
55
|
* disk → read from filesystem
|
|
49
56
|
* agent → skip (no external origin)
|
|
50
|
-
* other → re-fetch
|
|
51
|
-
*
|
|
52
|
-
*
|
|
53
|
-
* `
|
|
57
|
+
* other → re-fetch via `item.source_url` (captured at ingest time).
|
|
58
|
+
* The built-in `url` drive stores the URL as its path so it can
|
|
59
|
+
* also refresh directly from `path`. Any other drive with no
|
|
60
|
+
* `source_url` surfaces a per-item error — the user must re-add
|
|
61
|
+
* from URL. No code here knows anything about the remote
|
|
62
|
+
* service behind a drive.
|
|
54
63
|
*/
|
|
55
64
|
export async function refreshContextItems(
|
|
56
65
|
conn: DbConnection,
|
|
@@ -59,6 +68,7 @@ export async function refreshContextItems(
|
|
|
59
68
|
mcpxClient: McpxClient | null,
|
|
60
69
|
opts: RefreshOptions = {},
|
|
61
70
|
embedFn?: IngestEmbedFn,
|
|
71
|
+
fetchFn: FetchUrlFn = fetchUrl,
|
|
62
72
|
): Promise<RefreshResult> {
|
|
63
73
|
const refreshable = items.filter((i) => i.drive !== "agent");
|
|
64
74
|
|
|
@@ -84,20 +94,20 @@ export async function refreshContextItems(
|
|
|
84
94
|
continue;
|
|
85
95
|
}
|
|
86
96
|
content = await bunFile.text();
|
|
87
|
-
} else if (item.drive === "url") {
|
|
88
|
-
const url = item.path.startsWith("/") ? item.path.slice(1) : item.path;
|
|
89
|
-
const fetched = await fetchUrl(url, config, mcpxClient);
|
|
90
|
-
content = fetched.content;
|
|
91
97
|
} else {
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
98
|
+
const url =
|
|
99
|
+
item.source_url ??
|
|
100
|
+
(item.drive === "url" ? item.path.replace(/^\//, "") : null);
|
|
101
|
+
if (!url) {
|
|
102
|
+
results.push({
|
|
103
|
+
...base,
|
|
104
|
+
status: "error",
|
|
105
|
+
error: `Cannot refresh ${formatDriveRef(item)}: no source_url recorded. Re-add from the original URL.`,
|
|
106
|
+
});
|
|
107
|
+
continue;
|
|
108
|
+
}
|
|
109
|
+
const fetched = await fetchFn(url, config, mcpxClient);
|
|
110
|
+
content = fetched.content;
|
|
101
111
|
}
|
|
102
112
|
|
|
103
113
|
if (content === item.content) {
|
package/src/db/connection.ts
CHANGED
|
@@ -186,8 +186,7 @@ export async function getConnection(dbPath?: string): Promise<DbConnection> {
|
|
|
186
186
|
if (isMemoryPath(path)) {
|
|
187
187
|
const instance = await DuckDBInstance.create(path);
|
|
188
188
|
const conn = await instance.connect();
|
|
189
|
-
await conn.run("INSTALL
|
|
190
|
-
await conn.run("SET hnsw_enable_experimental_persistence = true;");
|
|
189
|
+
await conn.run("INSTALL fts; LOAD fts;");
|
|
191
190
|
return new DbConnection(conn, instance, path);
|
|
192
191
|
}
|
|
193
192
|
|
|
@@ -197,8 +196,7 @@ export async function getConnection(dbPath?: string): Promise<DbConnection> {
|
|
|
197
196
|
// INSTALL is a no-op after the first successful install (the extension
|
|
198
197
|
// is persisted to the user's DuckDB extension directory). LOAD is
|
|
199
198
|
// cheap per connection.
|
|
200
|
-
await conn.run("INSTALL
|
|
201
|
-
await conn.run("SET hnsw_enable_experimental_persistence = true;");
|
|
199
|
+
await conn.run("INSTALL fts; LOAD fts;");
|
|
202
200
|
return new DbConnection(conn, null, path);
|
|
203
201
|
} catch (err) {
|
|
204
202
|
releaseInstance(path);
|
package/src/db/context.ts
CHANGED
|
@@ -17,6 +17,7 @@ export interface ContextItem {
|
|
|
17
17
|
is_textual: boolean;
|
|
18
18
|
drive: string;
|
|
19
19
|
path: string;
|
|
20
|
+
source_url: string | null;
|
|
20
21
|
indexed_at: Date | null;
|
|
21
22
|
created_at: Date;
|
|
22
23
|
updated_at: Date;
|
|
@@ -38,6 +39,7 @@ interface ContextItemRow {
|
|
|
38
39
|
is_textual: boolean;
|
|
39
40
|
drive: string;
|
|
40
41
|
path: string;
|
|
42
|
+
source_url: string | null;
|
|
41
43
|
indexed_at: string | null;
|
|
42
44
|
created_at: string;
|
|
43
45
|
updated_at: string;
|
|
@@ -53,6 +55,7 @@ function rowToContextItem(row: ContextItemRow): ContextItem {
|
|
|
53
55
|
is_textual: !!row.is_textual,
|
|
54
56
|
drive: row.drive,
|
|
55
57
|
path: row.path,
|
|
58
|
+
source_url: row.source_url,
|
|
56
59
|
indexed_at: row.indexed_at ? new Date(row.indexed_at) : null,
|
|
57
60
|
created_at: new Date(row.created_at),
|
|
58
61
|
updated_at: new Date(row.updated_at),
|
|
@@ -84,12 +87,13 @@ export async function createContextItem(
|
|
|
84
87
|
path: string;
|
|
85
88
|
description?: string;
|
|
86
89
|
isTextual?: boolean;
|
|
90
|
+
sourceUrl?: string | null;
|
|
87
91
|
},
|
|
88
92
|
): Promise<ContextItem> {
|
|
89
93
|
const id = uuidv7();
|
|
90
94
|
const row = await db.queryGet<ContextItemRow>(
|
|
91
|
-
`INSERT INTO context_items (id, title, description, content, mime_type, is_textual, drive, path)
|
|
92
|
-
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
|
|
95
|
+
`INSERT INTO context_items (id, title, description, content, mime_type, is_textual, drive, path, source_url)
|
|
96
|
+
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
|
|
93
97
|
RETURNING *`,
|
|
94
98
|
id,
|
|
95
99
|
params.title,
|
|
@@ -99,6 +103,7 @@ export async function createContextItem(
|
|
|
99
103
|
params.isTextual !== false,
|
|
100
104
|
params.drive,
|
|
101
105
|
params.path,
|
|
106
|
+
params.sourceUrl ?? null,
|
|
102
107
|
);
|
|
103
108
|
if (!row) throw new Error("INSERT did not return a row");
|
|
104
109
|
return rowToContextItem(row);
|
|
@@ -122,6 +127,7 @@ export async function upsertContextItem(
|
|
|
122
127
|
path: string;
|
|
123
128
|
description?: string;
|
|
124
129
|
isTextual?: boolean;
|
|
130
|
+
sourceUrl?: string | null;
|
|
125
131
|
},
|
|
126
132
|
): Promise<ContextItem> {
|
|
127
133
|
const existing = await getContextItem(db, {
|
|
@@ -133,6 +139,7 @@ export async function upsertContextItem(
|
|
|
133
139
|
title: params.title,
|
|
134
140
|
content: params.content,
|
|
135
141
|
mime_type: params.mimeType,
|
|
142
|
+
source_url: params.sourceUrl,
|
|
136
143
|
});
|
|
137
144
|
if (!updated)
|
|
138
145
|
throw new Error(
|
|
@@ -157,6 +164,7 @@ export async function createContextItemStrict(
|
|
|
157
164
|
path: string;
|
|
158
165
|
description?: string;
|
|
159
166
|
isTextual?: boolean;
|
|
167
|
+
sourceUrl?: string | null;
|
|
160
168
|
},
|
|
161
169
|
): Promise<ContextItem> {
|
|
162
170
|
const existing = await getContextItem(db, {
|
|
@@ -426,7 +434,10 @@ export async function updateContextItem(
|
|
|
426
434
|
db: DbConnection,
|
|
427
435
|
id: string,
|
|
428
436
|
updates: Partial<
|
|
429
|
-
Pick<
|
|
437
|
+
Pick<
|
|
438
|
+
ContextItem,
|
|
439
|
+
"title" | "description" | "content" | "mime_type" | "source_url"
|
|
440
|
+
>
|
|
430
441
|
>,
|
|
431
442
|
): Promise<ContextItem | null> {
|
|
432
443
|
const { setClauses, params } = buildSetClauses([
|
|
@@ -434,6 +445,7 @@ export async function updateContextItem(
|
|
|
434
445
|
["description", updates.description],
|
|
435
446
|
["content", updates.content],
|
|
436
447
|
["mime_type", updates.mime_type],
|
|
448
|
+
["source_url", updates.source_url],
|
|
437
449
|
]);
|
|
438
450
|
|
|
439
451
|
setClauses.push("updated_at = current_timestamp::VARCHAR");
|
|
@@ -514,6 +526,7 @@ export async function copyContextItem(
|
|
|
514
526
|
drive: dst.drive,
|
|
515
527
|
path: dst.path,
|
|
516
528
|
isTextual: source.is_textual,
|
|
529
|
+
sourceUrl: source.source_url,
|
|
517
530
|
});
|
|
518
531
|
}
|
|
519
532
|
|
package/src/db/embeddings.ts
CHANGED
|
@@ -45,6 +45,11 @@ function rowToEmbedding(row: EmbeddingRow): Embedding {
|
|
|
45
45
|
};
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
+
/**
|
|
49
|
+
* Insert a single embedding row. Callers that bulk-write embeddings are
|
|
50
|
+
* responsible for calling `rebuildSearchIndex()` afterward — the FTS index is
|
|
51
|
+
* a snapshot and will not reflect new rows until rebuilt.
|
|
52
|
+
*/
|
|
48
53
|
export async function createEmbedding(
|
|
49
54
|
conn: DbConnection,
|
|
50
55
|
params: {
|
|
@@ -92,6 +97,11 @@ export async function getEmbeddingsForItem(
|
|
|
92
97
|
return rows.map(rowToEmbedding);
|
|
93
98
|
}
|
|
94
99
|
|
|
100
|
+
/**
|
|
101
|
+
* Delete all embeddings for a context item. Callers are responsible for
|
|
102
|
+
* calling `rebuildSearchIndex()` afterward — the FTS index is a snapshot and
|
|
103
|
+
* will still reference the deleted rows until rebuilt.
|
|
104
|
+
*/
|
|
95
105
|
export async function deleteEmbeddingsForItem(
|
|
96
106
|
conn: DbConnection,
|
|
97
107
|
contextItemId: string,
|
|
@@ -138,6 +148,25 @@ export interface HybridSearchResult extends EmbeddingSearchResult {
|
|
|
138
148
|
path: string | null;
|
|
139
149
|
}
|
|
140
150
|
|
|
151
|
+
/**
|
|
152
|
+
* Rebuild the FTS index over (chunk_content, title). DuckDB's FTS index is a
|
|
153
|
+
* snapshot — it does not update incrementally on INSERT/UPDATE/DELETE, so any
|
|
154
|
+
* batch writer must call this once its transaction commits. Cheap at our
|
|
155
|
+
* scale (hundreds to low thousands of rows).
|
|
156
|
+
*
|
|
157
|
+
* The trailing CHECKPOINT is load-bearing: `overwrite = 1` writes a
|
|
158
|
+
* `DROP SCHEMA fts_main_embeddings` record into the WAL. If the WAL still
|
|
159
|
+
* contains that drop on the next open, replay fails with "Cannot drop entry
|
|
160
|
+
* 'fts_main_embeddings' because there are entries that depend on it". Forcing
|
|
161
|
+
* a checkpoint flushes the WAL so the next open has nothing to replay.
|
|
162
|
+
*/
|
|
163
|
+
export async function rebuildSearchIndex(conn: DbConnection): Promise<void> {
|
|
164
|
+
await conn.exec(
|
|
165
|
+
"PRAGMA create_fts_index('embeddings', 'id', 'chunk_content', 'title', overwrite = 1)",
|
|
166
|
+
);
|
|
167
|
+
await conn.exec("CHECKPOINT");
|
|
168
|
+
}
|
|
169
|
+
|
|
141
170
|
export async function hybridSearch(
|
|
142
171
|
conn: DbConnection,
|
|
143
172
|
query: string,
|
|
@@ -146,10 +175,16 @@ export async function hybridSearch(
|
|
|
146
175
|
): Promise<HybridSearchResult[]> {
|
|
147
176
|
const k = 60; // RRF constant
|
|
148
177
|
|
|
178
|
+
// Keyword side: BM25 over chunk_content + title via the FTS extension.
|
|
179
|
+
// `match_bm25` returns NULL for rows with no token overlap; we keep only
|
|
180
|
+
// scored rows and order by descending score so RRF sees the best matches
|
|
181
|
+
// at the lowest ranks. Stemming, stopwords, and tokenization are handled
|
|
182
|
+
// by FTS — more query terms produce higher scores, which is exactly the
|
|
183
|
+
// behaviour a naive per-token ILIKE loop fails to provide.
|
|
149
184
|
const keywordRows = await conn.queryAll<EmbeddingRow>(
|
|
150
185
|
`SELECT * FROM embeddings
|
|
151
|
-
WHERE
|
|
152
|
-
|
|
186
|
+
WHERE fts_main_embeddings.match_bm25(id, ?1) IS NOT NULL
|
|
187
|
+
ORDER BY fts_main_embeddings.match_bm25(id, ?1) DESC
|
|
153
188
|
LIMIT 100`,
|
|
154
189
|
query,
|
|
155
190
|
);
|
package/src/db/schema.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { readdirSync, readFileSync } from "node:fs";
|
|
2
2
|
import { join } from "node:path";
|
|
3
|
+
import { logger } from "../utils/logger.ts";
|
|
3
4
|
import type { DbConnection } from "./connection.ts";
|
|
4
5
|
|
|
5
6
|
interface Migration {
|
|
@@ -45,9 +46,16 @@ export async function migrate(db: DbConnection): Promise<void> {
|
|
|
45
46
|
const applied = new Set(rows.map((row) => row.id));
|
|
46
47
|
|
|
47
48
|
// Run pending migrations in order
|
|
49
|
+
const pending = loadMigrations().filter((m) => !applied.has(m.id));
|
|
50
|
+
if (pending.length > 0) {
|
|
51
|
+
logger.info(
|
|
52
|
+
`applying ${pending.length} migration${pending.length === 1 ? "" : "s"}`,
|
|
53
|
+
);
|
|
54
|
+
}
|
|
55
|
+
|
|
48
56
|
let appliedAny = false;
|
|
49
|
-
for (const migration of
|
|
50
|
-
|
|
57
|
+
for (const migration of pending) {
|
|
58
|
+
logger.info(` ${migration.id}. ${migration.name}`);
|
|
51
59
|
|
|
52
60
|
// Split on semicolons and run each statement individually
|
|
53
61
|
const statements = migration.sql
|
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
--
|
|
2
|
-
-- state after
|
|
3
|
-
--
|
|
4
|
-
--
|
|
5
|
-
--
|
|
6
|
-
--
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
CREATE INDEX idx_embeddings_cosine ON embeddings USING HNSW (embedding) WITH (metric = 'cosine');
|
|
1
|
+
-- Historical: this migration used to drop and recreate the HNSW index
|
|
2
|
+
-- to clean up an internally-inconsistent state after native-side crashes
|
|
3
|
+
-- during embedding writes. HNSW is now gone (see migration 14) and the
|
|
4
|
+
-- VSS extension is no longer loaded at connection time, so the original
|
|
5
|
+
-- DDL would fail on fresh DBs. Kept as a no-op to preserve migration
|
|
6
|
+
-- numbering for existing databases that have already recorded id 11 in
|
|
7
|
+
-- _migrations.
|
|
8
|
+
SELECT 1;
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
-- HNSW has caused two separate corruption modes in this project: the
|
|
2
|
+
-- "Duplicate keys not allowed in high-level wrappers" failure addressed by
|
|
3
|
+
-- migration 11, and a second mode where the index silently returns zero rows
|
|
4
|
+
-- for cosine top-K queries (its stored SQL loses the `WITH (metric = 'cosine')`
|
|
5
|
+
-- clause). At our scale a linear scan of array_cosine_distance is plenty fast
|
|
6
|
+
-- and array_cosine_distance is a core DuckDB function — no VSS extension
|
|
7
|
+
-- required. Drop the index and move on.
|
|
8
|
+
DROP INDEX IF EXISTS idx_embeddings_cosine;
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
-- Keyword search uses DuckDB's FTS extension for BM25 ranking over
|
|
2
|
+
-- chunk_content and title. The index is a snapshot and must be rebuilt
|
|
3
|
+
-- after any write to the embeddings table. rebuildSearchIndex() in
|
|
4
|
+
-- src/db/embeddings.ts is the single entry point and is called after the
|
|
5
|
+
-- ingest transaction. overwrite = 1 makes this PRAGMA idempotent, which
|
|
6
|
+
-- also gives us a first-run rebuild for users upgrading from a DB that
|
|
7
|
+
-- never had FTS.
|
|
8
|
+
PRAGMA create_fts_index('embeddings', 'id', 'chunk_content', 'title', overwrite = 1);
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
-- Issue #145: preserve the original URL that produced each context item so
|
|
2
|
+
-- `context refresh` can re-fetch losslessly for service-specific drives
|
|
3
|
+
-- (google-docs, github, ...). Nullable — local-origin drives (disk, agent,
|
|
4
|
+
-- tool writes) leave it NULL and use their own refresh path. Legacy rows
|
|
5
|
+
-- ingested before this column existed also leave it NULL and surface a
|
|
6
|
+
-- "re-add from URL" error on refresh.
|
|
7
|
+
ALTER TABLE context_items ADD COLUMN source_url TEXT;
|
|
@@ -1 +1,7 @@
|
|
|
1
|
-
|
|
1
|
+
-- Historical: this migration used to CREATE an HNSW index on embeddings
|
|
2
|
+
-- via the VSS extension. HNSW has since been removed
|
|
3
|
+
-- (see migration 14) and the VSS extension is no longer loaded at
|
|
4
|
+
-- connection time, so running `CREATE INDEX ... USING HNSW` here would
|
|
5
|
+
-- fail on fresh DBs. Kept as a no-op to preserve migration numbering
|
|
6
|
+
-- for existing databases that have already recorded id 6 in _migrations.
|
|
7
|
+
SELECT 1;
|
package/src/worker/prompt.ts
CHANGED
|
@@ -131,9 +131,26 @@ When calling complete_task, write a summary that captures your key findings, dec
|
|
|
131
131
|
prompt += `
|
|
132
132
|
## External Tools (MCP)
|
|
133
133
|
|
|
134
|
-
|
|
134
|
+
### Local context first
|
|
135
135
|
|
|
136
|
-
|
|
136
|
+
**Before any MCP read, search local context.** Drive, Gmail, GitHub, URLs, and prior agent runs are usually already ingested — refetching is slower, costs tokens, and risks rate limits.
|
|
137
|
+
|
|
138
|
+
Workflow for any "look up / find / read" intent:
|
|
139
|
+
|
|
140
|
+
1. \`search_semantic\` (semantic) or \`context_search\` (keyword), then \`context_read\` / \`context_tree\` to drill in.
|
|
141
|
+
2. If freshness matters, call \`context_info\` and check \`indexed_at\`. To re-pull a single stale item, use \`context_refresh\` rather than going to MCP for the whole document.
|
|
142
|
+
3. Only call \`mcp_exec\` for reads when the data is genuinely missing locally **or** must be real-time (e.g., "what's on my calendar right now").
|
|
143
|
+
|
|
144
|
+
Writes always go through MCP — sending an email, creating an issue, posting to Slack. Don't search context first for those.
|
|
145
|
+
|
|
146
|
+
Examples:
|
|
147
|
+
- "What does doc X say?" → \`search_semantic\` first.
|
|
148
|
+
- "Any new emails from Y?" → check the \`gmail\` drive first; only hit Gmail MCP if the freshest indexed item is too old for the question.
|
|
149
|
+
- "Send an email to Y" → MCP write directly; no context lookup.
|
|
150
|
+
|
|
151
|
+
### Calling MCP tools
|
|
152
|
+
|
|
153
|
+
Before calling any MCP tool you haven't used yet this session, you MUST fetch its schema first:
|
|
137
154
|
|
|
138
155
|
1. Discover tools with \`mcp_search\` (preferred — semantic) or \`mcp_list_tools\`.
|
|
139
156
|
2. Call \`mcp_info\` with the exact \`server\` and \`tool\` to read the tool's input schema, required fields, and types.
|