botholomew 0.11.5 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "botholomew",
3
- "version": "0.11.5",
3
+ "version": "0.12.0",
4
4
  "description": "An autonomous AI agent for knowledge work — works your task queue while you sleep.",
5
5
  "type": "module",
6
6
  "bin": {
package/src/chat/agent.ts CHANGED
@@ -42,8 +42,7 @@ const CHAT_TOOL_NAMES = new Set([
42
42
  "context_read",
43
43
  "context_write",
44
44
  "context_edit",
45
- "search_grep",
46
- "search_semantic",
45
+ "search",
47
46
  "list_threads",
48
47
  "view_thread",
49
48
  "create_schedule",
@@ -56,6 +55,7 @@ const CHAT_TOOL_NAMES = new Set([
56
55
  "mcp_info",
57
56
  "mcp_exec",
58
57
  "read_large_result",
58
+ "pipe_to_context",
59
59
  "spawn_worker",
60
60
  "skill_list",
61
61
  "skill_read",
@@ -133,14 +133,14 @@ Format your responses using Markdown. Use headings, bold, italic, lists, and cod
133
133
 
134
134
  Workflow for any "look up / find / read" intent:
135
135
 
136
- 1. \`search_semantic\` (semantic) or \`context_search\` (keyword), then \`context_read\` / \`context_tree\` to drill in.
136
+ 1. \`search\` (hybrid regexp + semantic) or \`context_search\` (keyword), then \`context_read\` / \`context_tree\` to drill in.
137
137
  2. If freshness matters, call \`context_info\` and check \`indexed_at\`. To re-pull a single stale item, use \`context_refresh\` rather than going to MCP for the whole document.
138
138
  3. Only call \`mcp_exec\` for reads when the data is genuinely missing locally **or** must be real-time (e.g., "what's on my calendar right now").
139
139
 
140
140
  Writes always go through MCP — sending an email, creating an issue, posting to Slack. Don't search context first for those.
141
141
 
142
142
  Examples:
143
- - "What does doc X say?" → \`search_semantic\` first.
143
+ - "What does doc X say?" → \`search\` first.
144
144
  - "Any new emails from Y?" → check the \`gmail\` drive first; only hit Gmail MCP if the freshest indexed item is too old for the question.
145
145
  - "Send an email to Y" → MCP write directly; no context lookup.
146
146
 
@@ -14,7 +14,6 @@ import {
14
14
  formatDriveRef,
15
15
  parseDriveRef,
16
16
  } from "../context/drives.ts";
17
- import { embedSingle } from "../context/embedder.ts";
18
17
  import { FetchFailureError, fetchUrl } from "../context/fetcher.ts";
19
18
  import {
20
19
  type PreparedIngestion,
@@ -36,14 +35,13 @@ import {
36
35
  resolveContextItem,
37
36
  upsertContextItem,
38
37
  } from "../db/context.ts";
39
- import { getEmbeddingsForItem, hybridSearch } from "../db/embeddings.ts";
38
+ import { getEmbeddingsForItem } from "../db/embeddings.ts";
40
39
  import { reembedMissingVectors } from "../db/reembed.ts";
41
40
  import { createMcpxClient } from "../mcpx/client.ts";
41
+ import { searchTool } from "../tools/search/index.ts";
42
+ import type { ToolContext } from "../tools/tool.ts";
42
43
  import { logger } from "../utils/logger.ts";
43
- import {
44
- registerContextToolSubcommands,
45
- registerSearchToolSubcommands,
46
- } from "./tools.ts";
44
+ import { registerContextToolSubcommands } from "./tools.ts";
47
45
  import { withDb } from "./with-db.ts";
48
46
 
49
47
  function fmtDate(d: Date): string {
@@ -513,46 +511,82 @@ export function registerContextCommand(program: Command) {
513
511
 
514
512
  const search = ctx
515
513
  .command("search")
516
- .description("Search context entries")
517
- .argument("[query]", "search query (hybrid keyword + semantic)")
518
- .option("-k, --top-k <n>", "max results", Number.parseInt, 10)
514
+ .description("Search context entries (hybrid regexp + semantic)")
515
+ .argument(
516
+ "[query]",
517
+ "natural-language query (semantic + BM25). Combine with --pattern for fused regexp + semantic ranking.",
518
+ )
519
+ .option("-k, --top-k <n>", "max results", Number.parseInt, 20)
520
+ .option(
521
+ "--pattern <regex>",
522
+ "regex pattern (regexp side). May be combined with [query] to fuse signals.",
523
+ )
524
+ .option("--drive <drive>", "restrict to a single drive")
525
+ .option("--path <path>", "directory prefix within drive (requires --drive)")
526
+ .option("--glob <glob>", "filter results to files whose basename matches")
527
+ .option("--ignore-case", "case-insensitive regex")
528
+ .option(
529
+ "--context <n>",
530
+ "context lines around each regexp hit",
531
+ Number.parseInt,
532
+ )
519
533
  .action((query, opts) =>
520
534
  withDb(program, async (conn, dir) => {
521
- if (!query) {
535
+ if (!query && !opts.pattern) {
522
536
  search.help();
523
537
  return;
524
538
  }
525
539
  const config = await loadConfig(dir);
526
- const queryVec = await embedSingle(query, config);
527
- const results = await hybridSearch(conn, query, queryVec, opts.topK);
540
+ const toolCtx: ToolContext = {
541
+ conn,
542
+ dbPath: getDbPath(dir),
543
+ projectDir: dir,
544
+ config,
545
+ mcpxClient: null,
546
+ };
547
+ const result = await searchTool.execute(
548
+ {
549
+ query,
550
+ pattern: opts.pattern,
551
+ drive: opts.drive,
552
+ path: opts.path,
553
+ glob: opts.glob,
554
+ ignore_case: opts.ignoreCase,
555
+ context: opts.context,
556
+ max_results: opts.topK,
557
+ },
558
+ toolCtx,
559
+ );
528
560
 
529
- if (results.length === 0) {
561
+ if (result.is_error) {
562
+ logger.error(result.message ?? "Search failed");
563
+ process.exit(1);
564
+ }
565
+
566
+ if (result.matches.length === 0) {
530
567
  logger.dim("No results found.");
531
568
  return;
532
569
  }
533
570
 
534
- for (const [i, r] of results.entries()) {
535
- const score = (r.score * 100).toFixed(1);
536
- console.log(
537
- `${ansis.bold(`${i + 1}.`)} ${ansis.cyan(r.title)} ${ansis.dim(`(${score}%)`)}`,
538
- );
539
- const ref =
540
- r.drive && r.path
541
- ? formatDriveRef({ drive: r.drive, path: r.path })
542
- : r.context_item_id;
571
+ for (const [i, m] of result.matches.entries()) {
572
+ const tagColor =
573
+ m.match_type === "both"
574
+ ? ansis.green
575
+ : m.match_type === "regexp"
576
+ ? ansis.yellow
577
+ : ansis.cyan;
578
+ const tag = tagColor(`[${m.match_type}]`);
579
+ const location = m.line != null ? `${m.ref}:${m.line}` : m.ref;
543
580
  console.log(
544
- ` ${ansis.dim(ref)} ${ansis.dim(fmtDate(r.created_at))}`,
581
+ `${ansis.bold(`${i + 1}.`)} ${tag} ${ansis.cyan(location)} ${ansis.dim(`score=${m.score.toFixed(4)}`)}`,
545
582
  );
546
- if (r.chunk_content) {
547
- const snippet = r.chunk_content.slice(0, 120).replace(/\n/g, " ");
548
- console.log(` ${snippet}...`);
549
- }
583
+ const snippet = m.content.slice(0, 200).replace(/\n/g, " ");
584
+ if (snippet) console.log(` ${snippet}`);
550
585
  console.log("");
551
586
  }
552
587
  }),
553
588
  );
554
589
 
555
- registerSearchToolSubcommands(search);
556
590
  ctx
557
591
  .command("delete <ref>")
558
592
  .description("Delete a context entry (UUID or drive:/path)")
@@ -37,16 +37,6 @@ export function registerContextToolSubcommands(parent: Command) {
37
37
  }
38
38
  }
39
39
 
40
- /**
41
- * Register search tool subcommands (grep, semantic) onto an
42
- * existing Commander command (e.g. the "context search" group).
43
- */
44
- export function registerSearchToolSubcommands(parent: Command) {
45
- for (const tool of getToolsByGroup("search")) {
46
- registerToolAsCLI(parent, tool);
47
- }
48
- }
49
-
50
40
  /** Derive CLI subcommand name from tool name: "context_read" → "read", "context_create_dir" → "create-dir" */
51
41
  function deriveSubName(toolName: string): string {
52
42
  return toolName.replace(/^[^_]+_/, "").replace(/_/g, "-");
@@ -341,8 +331,6 @@ function isPositionalArg(key: string, toolName: string): boolean {
341
331
  context_exists: ["path"],
342
332
  context_count_lines: ["path"],
343
333
  context_search: ["query"],
344
- search_grep: ["pattern"],
345
- search_semantic: ["query"],
346
334
  };
347
335
  return positionalKeys[toolName]?.includes(key) ?? false;
348
336
  }
@@ -47,11 +47,13 @@ function isModelCached(model: string): boolean {
47
47
  async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
48
48
  let p = pipelinePromises.get(model);
49
49
  if (!p) {
50
- logger.info(
51
- isModelCached(model)
52
- ? `Loading embedding model ${model}`
53
- : `Loading embedding model ${model} (first run, downloading weights)`,
54
- );
50
+ if (isModelCached(model)) {
51
+ logger.debug(`Loading embedding model ${model}`);
52
+ } else {
53
+ logger.info(
54
+ `Loading embedding model ${model} (first run, downloading weights)`,
55
+ );
56
+ }
55
57
  p = pipeline("feature-extraction", model);
56
58
  pipelinePromises.set(model, p);
57
59
  }
@@ -0,0 +1,228 @@
1
+ import { isText } from "istextorbinary";
2
+ import { z } from "zod";
3
+ import { formatDriveRef } from "../../context/drives.ts";
4
+ import { ingestByPath } from "../../context/ingest.ts";
5
+ import {
6
+ createContextItemStrict,
7
+ PathConflictError,
8
+ upsertContextItem,
9
+ } from "../../db/context.ts";
10
+ import { getTool, type ToolDefinition } from "../tool.ts";
11
+
12
+ const PREVIEW_CHARS = 200;
13
+ const ERROR_MESSAGE_CAP = 2000;
14
+ const TOOL_NAME = "pipe_to_context";
15
+
16
/**
 * Derive a MIME type for `path` from `Bun.file(path).type`, keeping only the
 * part before the first `;` (presumably dropping parameters such as a
 * charset — confirm against Bun's docs). Falls back to
 * "application/octet-stream" when no leading segment is available.
 */
function mimeFromPath(path: string): string {
  const [essence] = Bun.file(path).type.split(";");
  return essence ?? "application/octet-stream";
}
20
+
21
/**
 * Decide whether `path` should be treated as text, based on its last path
 * segment only. Any `isText` verdict other than an explicit `false` counts
 * as textual.
 */
function isTextualPath(path: string): boolean {
  const segments = path.split("/");
  const filename = segments.pop() ?? path;
  const verdict = isText(filename);
  return verdict !== false;
}
25
+
26
/**
 * Cap `s` at `cap` characters. Strings that already fit are returned
 * unchanged; longer ones are cut and suffixed with a marker stating how many
 * characters were dropped.
 */
function truncate(s: string, cap: number): string {
  const overflow = s.length - cap;
  return overflow > 0
    ? `${s.slice(0, cap)}…[truncated, ${overflow} more chars]`
    : s;
}
30
+
31
+ const inputSchema = z.object({
32
+ tool_name: z
33
+ .string()
34
+ .describe(
35
+ "Name of the tool to dispatch. Its full output is piped into a context item; you (the LLM) will only see the storage acknowledgment, never the raw bytes.",
36
+ ),
37
+ tool_input: z
38
+ .record(z.string(), z.unknown())
39
+ .describe(
40
+ "Arguments to pass to the inner tool (same shape as a normal call).",
41
+ ),
42
+ drive: z
43
+ .string()
44
+ .default("agent")
45
+ .describe(
46
+ "Drive to write to (defaults to 'agent', the agent's scratch drive).",
47
+ ),
48
+ path: z.string().describe("Path within the drive (starts with /)"),
49
+ title: z
50
+ .string()
51
+ .optional()
52
+ .describe("Title for the file (defaults to filename)"),
53
+ description: z.string().optional().describe("Description of the file"),
54
+ on_conflict: z
55
+ .enum(["error", "overwrite"])
56
+ .optional()
57
+ .describe(
58
+ "What to do if a file already exists at this (drive, path). Defaults to 'error'. Pass 'overwrite' to replace.",
59
+ ),
60
+ });
61
+
62
+ const outputSchema = z.object({
63
+ is_error: z.boolean(),
64
+ id: z.string().optional(),
65
+ drive: z.string().optional(),
66
+ path: z.string().optional(),
67
+ ref: z.string().optional(),
68
+ bytes_written: z.number().optional(),
69
+ preview: z
70
+ .string()
71
+ .optional()
72
+ .describe(
73
+ `First ${PREVIEW_CHARS} characters of the stored content so you can sanity-check what was captured.`,
74
+ ),
75
+ inner_tool_is_error: z.boolean().optional(),
76
+ error_type: z
77
+ .enum([
78
+ "unknown_tool",
79
+ "forbidden_tool",
80
+ "invalid_input",
81
+ "inner_tool_error",
82
+ "path_conflict",
83
+ ])
84
+ .optional(),
85
+ message: z.string().optional(),
86
+ next_action_hint: z.string().optional(),
87
+ });
88
+
89
/**
 * `pipe_to_context` tool: runs another registered tool and stores its output
 * as a context item instead of returning it to the caller.
 *
 * Flow: look up the inner tool, reject unknown / terminal / self-referential
 * tools, validate `tool_input` against the inner tool's schema, execute it,
 * and — only if it did not fail — write the serialized output to
 * (drive, path) and ingest it. Every expected failure is reported as a
 * structured `{ is_error: true, error_type, ... }` result; only unexpected
 * storage errors are rethrown.
 */
export const pipeToContextTool = {
  name: TOOL_NAME,
  description:
    "[[ bash equivalent command: cmd > file ]] Run another tool and pipe its full output directly into a context item, without the result flowing through the conversation. Use this when you need a large tool output (web pages, search dumps, big mcp_exec results) to be searchable/embedded for later but you do NOT need to read the bytes yourself. You'll only see the storage ack (drive, path, id, size, short preview).",
  group: "context",
  inputSchema,
  outputSchema,
  execute: async (input, ctx) => {
    const inner = getTool(input.tool_name);
    if (!inner) {
      return {
        is_error: true,
        error_type: "unknown_tool",
        message: `No tool named "${input.tool_name}".`,
        next_action_hint:
          "Check the tool name spelling, or call the inner tool directly if you do need to see its output.",
      };
    }

    // Piping into itself or piping a terminal tool is not allowed.
    if (inner.name === TOOL_NAME || inner.terminal) {
      return {
        is_error: true,
        error_type: "forbidden_tool",
        message: `Tool "${inner.name}" cannot be piped (terminal tools and pipe_to_context itself are not allowed).`,
        // `search_grep` no longer exists (replaced by the unified `search`
        // tool), so the hint must not send the model to a missing tool.
        next_action_hint:
          "Pipe a non-terminal tool (search, mcp_exec, context_refresh, etc.) instead.",
      };
    }

    // Validate the inner arguments up front so schema problems are reported
    // as `invalid_input` rather than surfacing as an inner-tool exception.
    const parsedInner = inner.inputSchema.safeParse(input.tool_input);
    if (!parsedInner.success) {
      const issues = parsedInner.error.issues
        .map((i) => `${i.path.join(".")}: ${i.message}`)
        .join("; ");
      return {
        is_error: true,
        error_type: "invalid_input",
        message: `Invalid input for ${inner.name}: ${issues}.`,
        next_action_hint:
          "Fix tool_input to match the inner tool's schema and retry.",
      };
    }

    let innerResult: unknown;
    try {
      innerResult = await inner.execute(parsedInner.data, ctx);
    } catch (err) {
      return {
        is_error: true,
        error_type: "inner_tool_error",
        inner_tool_is_error: true,
        message: truncate(
          `Tool ${inner.name} threw: ${err instanceof Error ? err.message : String(err)}`,
          ERROR_MESSAGE_CAP,
        ),
        next_action_hint:
          "Retry with different arguments, or call the tool directly to see the full error.",
      };
    }

    // Tools signal soft failures via an `is_error` field on their result.
    const innerIsError =
      typeof innerResult === "object" &&
      innerResult !== null &&
      "is_error" in innerResult
        ? (innerResult as { is_error: boolean }).is_error
        : false;

    // `JSON.stringify` returns `undefined` (not a string) for `undefined`
    // input; fall back to "" so the length/slice calls below cannot throw.
    const innerOutput =
      typeof innerResult === "string"
        ? innerResult
        : (JSON.stringify(innerResult) ?? "");

    if (innerIsError) {
      return {
        is_error: true,
        error_type: "inner_tool_error",
        inner_tool_is_error: true,
        message: truncate(innerOutput, ERROR_MESSAGE_CAP),
        next_action_hint:
          "The inner tool returned an error and nothing was written. Fix the inputs and retry, or pipe a different tool.",
      };
    }

    const mimeType = mimeFromPath(input.path);
    const isTextual = isTextualPath(input.path);
    const title =
      input.title ?? input.path.split("/").filter(Boolean).pop() ?? input.path;
    const onConflict = input.on_conflict ?? "error";
    const target = { drive: input.drive, path: input.path };

    // Both write paths take the same payload; only the conflict policy differs.
    const itemParams = {
      title,
      description: input.description,
      content: innerOutput,
      drive: target.drive,
      path: target.path,
      mimeType,
      isTextual,
    };

    try {
      const item =
        onConflict === "overwrite"
          ? await upsertContextItem(ctx.conn, itemParams)
          : await createContextItemStrict(ctx.conn, itemParams);

      await ingestByPath(ctx.conn, target, ctx.config);

      return {
        is_error: false,
        id: item.id,
        drive: item.drive,
        path: item.path,
        ref: formatDriveRef(item),
        // Report real UTF-8 bytes; `.length` counts UTF-16 code units and
        // under-reports any non-ASCII content.
        bytes_written: Buffer.byteLength(innerOutput, "utf8"),
        preview: innerOutput.slice(0, PREVIEW_CHARS),
      };
    } catch (err) {
      if (err instanceof PathConflictError) {
        return {
          is_error: true,
          error_type: "path_conflict",
          drive: err.drive,
          path: err.path,
          ref: formatDriveRef({ drive: err.drive, path: err.path }),
          message: `A file already exists at ${formatDriveRef({ drive: err.drive, path: err.path })} (id: ${err.existingId}). The inner tool ran but its output was discarded.`,
          next_action_hint:
            "Retry with on_conflict='overwrite' to replace, or pick a different path.",
        };
      }
      throw err;
    }
  },
} satisfies ToolDefinition<typeof inputSchema, typeof outputSchema>;
@@ -2,6 +2,7 @@
2
2
  import { capabilitiesRefreshTool } from "./capabilities/refresh.ts";
3
3
  // Context tools
4
4
  import { contextListDrivesTool } from "./context/list-drives.ts";
5
+ import { pipeToContextTool } from "./context/pipe.ts";
5
6
  import { readLargeResultTool } from "./context/read-large-result.ts";
6
7
  import { contextRefreshTool } from "./context/refresh.ts";
7
8
  import { contextSearchTool } from "./context/search.ts";
@@ -30,8 +31,7 @@ import { mcpSearchTool } from "./mcp/search.ts";
30
31
  import { createScheduleTool } from "./schedule/create.ts";
31
32
  import { listSchedulesTool } from "./schedule/list.ts";
32
33
  // Search tools
33
- import { searchGrepTool } from "./search/grep.ts";
34
- import { searchSemanticTool } from "./search/semantic.ts";
34
+ import { searchTool } from "./search/index.ts";
35
35
  // Skill tools
36
36
  import { skillDeleteTool } from "./skill/delete.ts";
37
37
  import { skillEditTool } from "./skill/edit.ts";
@@ -85,6 +85,7 @@ export function registerAllTools(): void {
85
85
  registerTool(updateBeliefsTool);
86
86
  registerTool(updateGoalsTool);
87
87
  registerTool(readLargeResultTool);
88
+ registerTool(pipeToContextTool);
88
89
 
89
90
  // Capabilities
90
91
  registerTool(capabilitiesRefreshTool);
@@ -94,8 +95,7 @@ export function registerAllTools(): void {
94
95
  registerTool(listSchedulesTool);
95
96
 
96
97
  // Search
97
- registerTool(searchGrepTool);
98
- registerTool(searchSemanticTool);
98
+ registerTool(searchTool);
99
99
 
100
100
  // Skill
101
101
  registerTool(skillListTool);
@@ -0,0 +1,117 @@
1
+ import type { RegexpHit } from "./regexp.ts";
2
+ import type { SemanticHit } from "./semantic.ts";
3
+
4
/** One row of fused search output, as returned by {@link fuseRRF}. */
export interface FusedMatch {
  ref: string;
  drive: string;
  path: string;
  /** 1-based line number for regexp-backed rows; `null` for semantic-only rows. */
  line: number | null;
  content: string;
  context_lines: string[];
  match_type: "regexp" | "semantic" | "both";
  /** Rounded score of the best semantic hit for this file, if any. */
  semantic_score: number | null;
  /** Rounded reciprocal-rank-fusion score. */
  score: number;
}

/** Maximum characters of chunk content kept for semantic-only rows. */
const SNIPPET_MAX = 300;

/**
 * Reciprocal rank fusion of regexp line hits and semantic chunk hits.
 *
 * Each regexp hit becomes its own row. If the file (drive + path) also has a
 * semantic hit, the regexp row picks up that semantic side's RRF contribution
 * and is tagged `match_type: "both"` — exact-line + semantic agreement is
 * the strongest signal.
 *
 * Semantic hits are emitted as their own rows only for paths with no regexp
 * hit; otherwise the regexp row already represents that file (and is more
 * locatable). Semantic hits without a drive or path are dropped.
 *
 * Rows are ordered by the *unrounded* fused score. The `score` field is
 * rounded to 4 decimals for display only; sorting on the rounded value would
 * make distinct scores such as 1/107 and 1/108 tie and mis-order rows.
 *
 * @param regexpHits Regexp hits in rank order (best first).
 * @param semanticHits Semantic hits in rank order (best first).
 * @param options `k` is the RRF constant (default 60); `limit` caps the
 *   number of rows returned (negative values are treated as 0).
 * @returns Fused rows, best first, at most `limit` long.
 */
export function fuseRRF(
  regexpHits: RegexpHit[],
  semanticHits: SemanticHit[],
  options: { k?: number; limit: number },
): FusedMatch[] {
  const k = options.k ?? 60;
  // A negative limit would make `slice` drop rows from the end instead.
  const limit = Math.max(0, options.limit);

  // Best (lowest) semantic rank per file. Hits arrive in rank order, so the
  // first one seen for a path is the best one.
  const bestSemByPath = new Map<string, { rank: number; score: number }>();
  semanticHits.forEach((hit, rank) => {
    if (!hit) return;
    const key = pathKey(hit.drive, hit.path);
    if (key == null || bestSemByPath.has(key)) return;
    bestSemByPath.set(key, { rank, score: hit.score });
  });

  const regexpPaths = new Set<string>();
  for (const hit of regexpHits) {
    const key = pathKey(hit.drive, hit.path);
    if (key != null) regexpPaths.add(key);
  }

  // Keep the raw score beside each row so ordering is not affected by rounding.
  const ranked: { match: FusedMatch; rawScore: number }[] = [];

  regexpHits.forEach((rx, rank) => {
    if (!rx) return;
    const key = pathKey(rx.drive, rx.path);
    const sem = key == null ? undefined : bestSemByPath.get(key);
    const rawScore =
      1 / (k + rank + 1) + (sem ? 1 / (k + sem.rank + 1) : 0);
    ranked.push({
      rawScore,
      match: {
        ref: rx.ref,
        drive: rx.drive,
        path: rx.path,
        line: rx.line,
        content: rx.content,
        context_lines: rx.context_lines,
        match_type: sem ? "both" : "regexp",
        semantic_score: sem ? round(sem.score) : null,
        score: round(rawScore),
      },
    });
  });

  semanticHits.forEach((sem, rank) => {
    if (!sem) return;
    const key = pathKey(sem.drive, sem.path);
    // Skip unlocatable hits and files already represented by a regexp row.
    if (key == null || regexpPaths.has(key)) return;
    const rawScore = 1 / (k + rank + 1);
    ranked.push({
      rawScore,
      match: {
        ref: sem.ref,
        drive: sem.drive ?? "",
        path: sem.path ?? "",
        line: null,
        content: sem.chunk_content.slice(0, SNIPPET_MAX),
        context_lines: [],
        match_type: "semantic",
        semantic_score: round(sem.score),
        score: round(rawScore),
      },
    });
  });

  // Stable sort: rows with equal raw scores keep insertion order
  // (regexp rows before semantic rows).
  ranked.sort((a, b) => b.rawScore - a.rawScore);
  return ranked.slice(0, limit).map((entry) => entry.match);
}

/** Build the per-file map key, or `null` when drive or path is missing/empty. */
function pathKey(drive: string | null, path: string | null): string | null {
  if (!drive || !path) return null;
  return `${drive}:${path}`;
}

/** Round to 4 decimal places. */
function round(n: number): number {
  return Math.round(n * 10000) / 10000;
}
@@ -0,0 +1,134 @@
1
+ import { z } from "zod";
2
+ import {
3
+ listContextItems,
4
+ listContextItemsByPrefix,
5
+ } from "../../db/context.ts";
6
+ import type { ToolDefinition } from "../tool.ts";
7
+ import { fuseRRF } from "./fuse.ts";
8
+ import { runRegexp } from "./regexp.ts";
9
+ import { runSemantic } from "./semantic.ts";
10
+
11
+ const MatchSchema = z.object({
12
+ ref: z.string(),
13
+ drive: z.string(),
14
+ path: z.string(),
15
+ line: z.number().nullable(),
16
+ content: z.string(),
17
+ context_lines: z.array(z.string()),
18
+ match_type: z.enum(["regexp", "semantic", "both"]),
19
+ semantic_score: z.number().nullable(),
20
+ score: z.number(),
21
+ });
22
+
23
+ const inputSchema = z.object({
24
+ query: z
25
+ .string()
26
+ .optional()
27
+ .describe(
28
+ "Natural-language query for semantic + keyword (BM25) hybrid search. Provide alongside `pattern` for the strongest signal — chunks matched by both methods are boosted via reciprocal rank fusion.",
29
+ ),
30
+ pattern: z
31
+ .string()
32
+ .optional()
33
+ .describe("Regex pattern for exact text search across context contents."),
34
+ drive: z
35
+ .string()
36
+ .optional()
37
+ .describe(
38
+ "Restrict to a single drive (applies to both `query` and `pattern`).",
39
+ ),
40
+ path: z
41
+ .string()
42
+ .optional()
43
+ .describe("Directory prefix within the drive. Requires `drive`."),
44
+ glob: z
45
+ .string()
46
+ .optional()
47
+ .describe("Filter results to files whose basename matches this glob."),
48
+ ignore_case: z
49
+ .boolean()
50
+ .optional()
51
+ .describe("Case-insensitive regex (only affects `pattern`)."),
52
+ context: z
53
+ .number()
54
+ .optional()
55
+ .describe(
56
+ "Lines of surrounding context to include for each regex hit (only affects `pattern`).",
57
+ ),
58
+ max_results: z
59
+ .number()
60
+ .optional()
61
+ .describe("Maximum number of fused results to return (default 20)."),
62
+ });
63
+
64
+ const outputSchema = z.object({
65
+ matches: z.array(MatchSchema),
66
+ is_error: z.boolean(),
67
+ error_type: z.string().optional(),
68
+ message: z.string().optional(),
69
+ });
70
+
71
/**
 * Unified `search` tool: runs a regexp scan over context item contents
 * and/or the semantic (embedding + hybrid) search, then merges both result
 * lists with reciprocal rank fusion.
 *
 * Argument problems — neither `query` nor `pattern`, `path` without `drive`,
 * or a `pattern` that is not a valid regular expression — are reported as
 * `{ matches: [], is_error: true, ... }` rather than thrown, so callers can
 * show the message and retry.
 */
export const searchTool = {
  name: "search",
  description:
    "[[ bash equivalent command: grep -r ]] Hybrid search over indexed context. At least one of `query` (natural language → semantic + BM25) or `pattern` (regex over file contents) is required. Pass both for the strongest signal: results matched by both methods float to the top via reciprocal rank fusion. Scoping (`drive`, `path`, `glob`) applies to both sides.",
  group: "search",
  inputSchema,
  outputSchema,
  execute: async (input, ctx) => {
    if (!input.query && !input.pattern) {
      return {
        matches: [],
        is_error: true,
        error_type: "invalid_arguments",
        message:
          "Provide at least one of `query` (natural language) or `pattern` (regex). Pass both to fuse semantic and exact-match signals.",
      };
    }
    if (input.path && !input.drive) {
      return {
        matches: [],
        is_error: true,
        error_type: "invalid_arguments",
        message:
          "`path` requires `drive` — call context_list_drives to see which drives exist, then pass `drive` alongside `path`.",
      };
    }

    // A missing or non-finite max_results (e.g. NaN from CLI parsing) falls
    // back to the documented default; negative values are clamped to 0.
    const requested = input.max_results;
    const limit =
      requested === undefined || !Number.isFinite(requested)
        ? 20
        : Math.max(0, Math.floor(requested));

    let regexpHits: ReturnType<typeof runRegexp> = [];
    if (input.pattern) {
      // With a drive, scan only items under the (optional) path prefix;
      // otherwise scan every context item.
      const items = input.drive
        ? await listContextItemsByPrefix(
            ctx.conn,
            input.drive,
            input.path ?? "/",
            { recursive: true },
          )
        : await listContextItems(ctx.conn);
      try {
        regexpHits = runRegexp(items, {
          pattern: input.pattern,
          glob: input.glob,
          ignore_case: input.ignore_case,
          context: input.context,
          max_results: 100,
        });
      } catch (err) {
        // `new RegExp` throws SyntaxError for a malformed pattern; report it
        // as a structured error instead of letting it escape the tool.
        if (!(err instanceof SyntaxError)) throw err;
        return {
          matches: [],
          is_error: true,
          error_type: "invalid_pattern",
          message: `Invalid regex pattern: ${err.message}`,
        };
      }
    }

    const semanticHits = input.query
      ? await runSemantic(ctx, {
          query: input.query,
          drive: input.drive,
          path: input.path,
          glob: input.glob,
          limit: 100,
        })
      : [];

    const matches = fuseRRF(regexpHits, semanticHits, { limit });

    return { matches, is_error: false };
  },
} satisfies ToolDefinition<typeof inputSchema, typeof outputSchema>;
@@ -0,0 +1,70 @@
1
+ import { formatDriveRef } from "../../context/drives.ts";
2
+ import type { ContextItem } from "../../db/context.ts";
3
+
4
+ export interface RegexpHit {
5
+ ref: string;
6
+ drive: string;
7
+ path: string;
8
+ line: number;
9
+ content: string;
10
+ context_lines: string[];
11
+ }
12
+
13
+ export interface RegexpOptions {
14
+ pattern: string;
15
+ glob?: string;
16
+ ignore_case?: boolean;
17
+ context?: number;
18
+ max_results?: number;
19
+ }
20
+
21
/**
 * Line-by-line regex scan over the contents of `items`.
 *
 * Items with no content are skipped, as are items whose basename does not
 * match `options.glob` when one is given. Every matching line yields one hit
 * carrying its 1-based line number and up to `options.context` lines on each
 * side. Scanning stops as soon as `options.max_results` hits (default 100)
 * have been collected.
 *
 * @throws SyntaxError when `options.pattern` is not a valid regular expression.
 */
export function runRegexp(
  items: ContextItem[],
  options: RegexpOptions,
): RegexpHit[] {
  const matcher = new RegExp(options.pattern, options.ignore_case ? "gi" : "g");
  const nameFilter = options.glob ? globToRegex(options.glob) : null;
  const radius = options.context ?? 0;
  const cap = options.max_results ?? 100;

  const found: RegexpHit[] = [];

  for (const item of items) {
    if (item.content == null) continue;

    if (nameFilter) {
      const basename = item.path.split("/").pop() ?? "";
      if (!nameFilter.test(basename)) continue;
    }

    const rows = item.content.split("\n");
    for (const [idx, text] of rows.entries()) {
      // The regex is global, so reset its cursor before testing each line.
      matcher.lastIndex = 0;
      if (text === undefined || !matcher.test(text)) continue;

      const from = Math.max(0, idx - radius);
      const to = Math.min(rows.length, idx + radius + 1);
      found.push({
        ref: formatDriveRef(item),
        drive: item.drive,
        path: item.path,
        line: idx + 1,
        content: text,
        context_lines: rows.slice(from, to),
      });
      if (found.length >= cap) return found;
    }
  }

  return found;
}
63
+
64
/**
 * Compile a simple glob into an anchored, case-insensitive RegExp.
 * Regex metacharacters are escaped first; then `*` becomes "any run of
 * characters" and `?` becomes "any single character".
 */
export function globToRegex(glob: string): RegExp {
  const literal = glob.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  const withStars = literal.replace(/\*/g, ".*");
  const body = withStars.replace(/\?/g, ".");
  return new RegExp(`^${body}$`, "i");
}
@@ -1,69 +1,81 @@
1
- import { z } from "zod";
2
1
  import { formatDriveRef } from "../../context/drives.ts";
3
2
  import { embedSingle } from "../../context/embedder.ts";
4
- import { hybridSearch } from "../../db/embeddings.ts";
5
- import type { ToolDefinition } from "../tool.ts";
3
+ import { type HybridSearchResult, hybridSearch } from "../../db/embeddings.ts";
4
+ import type { ToolContext } from "../tool.ts";
5
+ import { globToRegex } from "./regexp.ts";
6
6
 
7
- const inputSchema = z.object({
8
- query: z.string().describe("Natural language search query"),
9
- top_k: z
10
- .number()
11
- .optional()
12
- .default(10)
13
- .describe("Maximum number of results to return (defaults to 10)"),
14
- threshold: z
15
- .number()
16
- .optional()
17
- .describe("Minimum similarity score (0-1) to include in results"),
18
- });
7
+ export interface SemanticHit {
8
+ ref: string;
9
+ drive: string | null;
10
+ path: string | null;
11
+ context_item_id: string;
12
+ chunk_index: number;
13
+ title: string;
14
+ chunk_content: string;
15
+ score: number;
16
+ }
19
17
 
20
- const outputSchema = z.object({
21
- results: z.array(
22
- z.object({
23
- ref: z.string(),
24
- title: z.string(),
25
- score: z.number(),
26
- snippet: z.string(),
27
- }),
28
- ),
29
- is_error: z.boolean(),
30
- });
18
+ export interface SemanticOptions {
19
+ query: string;
20
+ drive?: string;
21
+ path?: string;
22
+ glob?: string;
23
+ limit?: number;
24
+ }
31
25
 
32
- export const searchSemanticTool = {
33
- name: "search_semantic",
34
- description:
35
- "Semantic search over indexed context using vector embeddings. Finds conceptually related content, not just keyword matches.",
36
- group: "search",
37
- inputSchema,
38
- outputSchema,
39
- execute: async (input, ctx) => {
40
- const queryVec = await embedSingle(input.query, ctx.config);
41
- const results = await hybridSearch(
42
- ctx.conn,
43
- input.query,
44
- queryVec,
45
- input.top_k,
46
- );
26
/**
 * Semantic side of the search: embed the query, run the hybrid search for up
 * to `options.limit` candidates (default 100), then keep only candidates that
 * satisfy the `drive` / `path` / `glob` scope.
 *
 * Scoping is a post-filter on the candidates, so a narrowly scoped search can
 * return fewer than `limit` hits.
 */
export async function runSemantic(
  ctx: ToolContext,
  options: SemanticOptions,
): Promise<SemanticHit[]> {
  const maxCandidates = options.limit ?? 100;
  const queryVec = await embedSingle(options.query, ctx.config);
  const candidates = await hybridSearch(
    ctx.conn,
    options.query,
    queryVec,
    maxCandidates,
  );

  const hits: SemanticHit[] = [];
  for (const candidate of candidates) {
    if (matchesScope(candidate, options)) {
      hits.push(toHit(candidate));
    }
  }
  return hits;
}
53
46
 
54
- return {
55
- results: filtered
56
- .map((r) => ({
57
- ref:
58
- r.drive && r.path
59
- ? formatDriveRef({ drive: r.drive, path: r.path })
60
- : r.context_item_id,
61
- title: r.title,
62
- score: Math.round(r.score * 1000) / 1000,
63
- snippet: (r.chunk_content || "").slice(0, 300),
64
- }))
65
- .sort((a, b) => b.score - a.score),
66
- is_error: false,
67
- };
68
- },
69
- } satisfies ToolDefinition<typeof inputSchema, typeof outputSchema>;
47
/**
 * Check one hybrid-search result against the requested scope.
 *
 * - `drive`: the result's drive must be equal.
 * - `path`: the result's path must equal it or sit below it as a directory
 *   prefix. Only checked when the result has a path.
 * - `glob`: the result's basename must match. Only checked when the result
 *   has a path.
 */
function matchesScope(
  result: HybridSearchResult,
  options: SemanticOptions,
): boolean {
  const { drive, path, glob } = options;

  if (drive && result.drive !== drive) return false;

  if (path && result.path) {
    // Normalise to a trailing slash so "/docs" does not match "/docs-old/x".
    const dirPrefix = path.endsWith("/") ? path : `${path}/`;
    const insideScope =
      result.path === path || result.path.startsWith(dirPrefix);
    if (!insideScope) return false;
  }

  if (glob && result.path) {
    const basename = result.path.split("/").pop() ?? "";
    return globToRegex(glob).test(basename);
  }

  return true;
}
66
+
67
/**
 * Convert a hybrid-search row into a `SemanticHit`. The `ref` is the
 * formatted drive reference when both drive and path are present, otherwise
 * the context item id; missing chunk content becomes an empty string.
 */
function toHit(r: HybridSearchResult): SemanticHit {
  const { drive, path, context_item_id, chunk_index, title, score } = r;
  const ref =
    drive && path ? formatDriveRef({ drive, path }) : context_item_id;
  return {
    ref,
    drive,
    path,
    context_item_id,
    chunk_index,
    title,
    chunk_content: r.chunk_content ?? "",
    score,
  };
}
@@ -145,14 +145,14 @@ When calling complete_task, write a summary that captures your key findings, dec
145
145
 
146
146
  Workflow for any "look up / find / read" intent:
147
147
 
148
- 1. \`search_semantic\` (semantic) or \`context_search\` (keyword), then \`context_read\` / \`context_tree\` to drill in.
148
+ 1. \`search\` (hybrid regexp + semantic) or \`context_search\` (keyword), then \`context_read\` / \`context_tree\` to drill in.
149
149
  2. If freshness matters, call \`context_info\` and check \`indexed_at\`. To re-pull a single stale item, use \`context_refresh\` rather than going to MCP for the whole document.
150
150
  3. Only call \`mcp_exec\` for reads when the data is genuinely missing locally **or** must be real-time (e.g., "what's on my calendar right now").
151
151
 
152
152
  Writes always go through MCP — sending an email, creating an issue, posting to Slack. Don't search context first for those.
153
153
 
154
154
  Examples:
155
- - "What does doc X say?" → \`search_semantic\` first.
155
+ - "What does doc X say?" → \`search\` first.
156
156
  - "Any new emails from Y?" → check the \`gmail\` drive first; only hit Gmail MCP if the freshest indexed item is too old for the question.
157
157
  - "Send an email to Y" → MCP write directly; no context lookup.
158
158
 
@@ -1,128 +0,0 @@
1
- import { z } from "zod";
2
- import { formatDriveRef } from "../../context/drives.ts";
3
- import {
4
- listContextItems,
5
- listContextItemsByPrefix,
6
- } from "../../db/context.ts";
7
- import type { ToolDefinition } from "../tool.ts";
8
-
9
- const GrepMatchSchema = z.object({
10
- ref: z.string(),
11
- drive: z.string(),
12
- path: z.string(),
13
- line: z.number(),
14
- content: z.string(),
15
- context_lines: z.array(z.string()),
16
- });
17
-
18
- const inputSchema = z.object({
19
- pattern: z.string().describe("Regex pattern to search for"),
20
- drive: z
21
- .string()
22
- .optional()
23
- .describe("Restrict search to a single drive (defaults to all drives)"),
24
- path: z
25
- .string()
26
- .optional()
27
- .describe(
28
- "Directory to search under within the drive (defaults to /). Requires `drive`.",
29
- ),
30
- glob: z
31
- .string()
32
- .optional()
33
- .describe("Only search files whose basename matches this glob pattern"),
34
- ignore_case: z.boolean().optional().describe("Case-insensitive search"),
35
- context: z
36
- .number()
37
- .optional()
38
- .describe("Number of context lines before and after each match"),
39
- max_results: z
40
- .number()
41
- .optional()
42
- .describe("Maximum number of matches to return"),
43
- });
44
-
45
- const outputSchema = z.object({
46
- matches: z.array(GrepMatchSchema),
47
- is_error: z.boolean(),
48
- error_type: z.string().optional(),
49
- message: z.string().optional(),
50
- });
51
-
52
- export const searchGrepTool = {
53
- name: "search_grep",
54
- description: "Search file contents by regex pattern across context drives.",
55
- group: "search",
56
- inputSchema,
57
- outputSchema,
58
- execute: async (input, ctx) => {
59
- // `path` scopes to a directory within a single drive; requiring `drive`
60
- // alongside prevents a silent full-DB scan when only `path` is passed.
61
- if (input.path && !input.drive) {
62
- return {
63
- matches: [],
64
- is_error: true,
65
- error_type: "invalid_arguments",
66
- message:
67
- "`path` requires `drive` — use context_list_drives to see which drives exist, then pass `drive` alongside `path`.",
68
- };
69
- }
70
-
71
- const items = input.drive
72
- ? await listContextItemsByPrefix(
73
- ctx.conn,
74
- input.drive,
75
- input.path ?? "/",
76
- {
77
- recursive: true,
78
- },
79
- )
80
- : await listContextItems(ctx.conn);
81
-
82
- const flags = input.ignore_case ? "gi" : "g";
83
- const regex = new RegExp(input.pattern, flags);
84
- const globRegex = input.glob ? globToRegex(input.glob) : null;
85
- const contextLines = input.context ?? 0;
86
- const maxResults = input.max_results ?? 100;
87
-
88
- const matches: z.infer<typeof GrepMatchSchema>[] = [];
89
-
90
- for (const item of items) {
91
- if (item.content == null) continue;
92
-
93
- if (globRegex) {
94
- const filename = item.path.split("/").pop() ?? "";
95
- if (!globRegex.test(filename)) continue;
96
- }
97
-
98
- const lines = item.content.split("\n");
99
- for (let i = 0; i < lines.length; i++) {
100
- regex.lastIndex = 0;
101
- const line = lines[i];
102
- if (line !== undefined && regex.test(line)) {
103
- const start = Math.max(0, i - contextLines);
104
- const end = Math.min(lines.length, i + contextLines + 1);
105
- matches.push({
106
- ref: formatDriveRef(item),
107
- drive: item.drive,
108
- path: item.path,
109
- line: i + 1,
110
- content: line,
111
- context_lines: lines.slice(start, end),
112
- });
113
- if (matches.length >= maxResults) return { matches, is_error: false };
114
- }
115
- }
116
- }
117
-
118
- return { matches, is_error: false };
119
- },
120
- } satisfies ToolDefinition<typeof inputSchema, typeof outputSchema>;
121
-
122
- function globToRegex(glob: string): RegExp {
123
- const escaped = glob
124
- .replace(/[.+^${}()|[\]\\]/g, "\\$&")
125
- .replace(/\*/g, ".*")
126
- .replace(/\?/g, ".");
127
- return new RegExp(`^${escaped}$`, "i");
128
- }