membot 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,8 +2,7 @@
 
 > Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.
 
-[![npm](https://img.shields.io/npm/v/membot.svg)](https://www.npmjs.com/package/membot)
-[![license](https://img.shields.io/npm/l/membot.svg)](./LICENSE)
+[![license](https://img.shields.io/github/license/evantahler/membot.svg)](./LICENSE)
 
 `membot` is a single-binary CLI and MCP server that gives AI agents a persistent, versioned, searchable context store. Files (markdown, PDFs, DOCX, HTML, URLs, agent-authored notes) are ingested, converted to markdown, chunked, embedded **locally** with `@huggingface/transformers` (WASM, no cloud calls), and indexed in DuckDB with hybrid search (semantic vector + BM25). Every change creates a new version — nothing is overwritten in place.
 
@@ -16,11 +15,9 @@
 
 ```bash
 bun install -g membot
-# or
-npm install -g membot
 ```
 
-This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global npm/bun install is the supported path.
+This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global Bun install is the supported path.
 
 ## Quick start
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "membot",
-  "version": "0.3.1",
+  "version": "0.4.1",
   "description": "Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.",
   "type": "module",
   "exports": {
@@ -23,6 +23,12 @@ export const DaemonConfigSchema = z.object({
   tick_interval_sec: z.number().int().positive().default(DEFAULTS.DAEMON_TICK_SEC),
 });
 
+export const DbLockRetryConfigSchema = z.object({
+  max_attempts: z.number().int().positive().default(30),
+  base_delay_ms: z.number().int().positive().default(100),
+  max_delay_ms: z.number().int().positive().default(2000),
+});
+
 export const MembotConfigSchema = z.object({
   data_dir: z.string().default(defaultMembotHome()),
   embedding_model: z.string().default(EMBEDDING_MODEL),
@@ -31,6 +37,7 @@ export const MembotConfigSchema = z.object({
   llm: LlmConfigSchema.default(() => LlmConfigSchema.parse({})),
   mcpx: McpxConfigSchema.default(() => McpxConfigSchema.parse({})),
   daemon: DaemonConfigSchema.default(() => DaemonConfigSchema.parse({})),
+  db_lock_retry: DbLockRetryConfigSchema.default(() => DbLockRetryConfigSchema.parse({})),
   default_refresh_frequency_sec: z.number().int().positive().nullable().default(null),
 });
 
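The new `db_lock_retry` block is optional in user config — zod back-fills every field. A standalone sketch (re-declaring only the schema added above, for illustration) of what an absent block parses to:

```ts
import { z } from "zod";

// Re-declaration of the schema from the hunk above, for illustration only.
const DbLockRetryConfigSchema = z.object({
  max_attempts: z.number().int().positive().default(30),
  base_delay_ms: z.number().int().positive().default(100),
  max_delay_ms: z.number().int().positive().default(2000),
});

console.log(DbLockRetryConfigSchema.parse({}));
// → { max_attempts: 30, base_delay_ms: 100, max_delay_ms: 2000 }
```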
package/src/constants.ts CHANGED
@@ -20,6 +20,15 @@ export const ENV = {
 export const EMBEDDING_MODEL = "Xenova/bge-small-en-v1.5";
 export const EMBEDDING_DIMENSION = 384;
 
+/**
+ * Max chunks fed to the feature-extraction pipeline in one forward pass.
+ * ONNX/WASM allocates activations linearly with batch size, so a single
+ * unbounded call OOMs (`std::bad_alloc`) on large files — a 168-chunk file
+ * was the original repro. 16 is comfortably within the WASM heap for
+ * bge-small-en-v1.5 at 512 tokens and still amortizes the per-call overhead.
+ */
+export const EMBEDDING_BATCH_SIZE = 16;
+
 export const DEFAULTS = {
   CHUNKER_MODE: "deterministic" as const,
   CHUNKER_TARGET_CHARS: 4_000,
package/src/context.ts CHANGED
@@ -39,7 +39,11 @@ export async function buildContext(options: BuildContextOptions = {}): Promise<A
 
   const { config, dataDir, configPath } = await loadConfig({ configFlag: options.configFlag });
   const dbPath = join(dataDir, FILES.INDEX_DUCKDB);
-  const db = await openDb(dbPath);
+  const db = await openDb(dbPath, {
+    maxAttempts: config.db_lock_retry.max_attempts,
+    baseDelayMs: config.db_lock_retry.base_delay_ms,
+    maxDelayMs: config.db_lock_retry.max_delay_ms,
+  });
 
   const mcpx = await maybeMcpx(config);
 
package/src/db/chunks.ts CHANGED
@@ -140,7 +140,8 @@ export async function rebuildFts(db: DbConnection): Promise<RebuildFtsResult> {
   await db.exec(
     `CREATE TABLE _current_chunks_fts AS
      SELECT (logical_path || '::' || CAST(version_id AS VARCHAR) || '::' || chunk_index) AS row_key,
-            logical_path, CAST(version_id AS VARCHAR) AS version_id, chunk_index, search_text
+            logical_path, CAST(version_id AS VARCHAR) AS version_id, chunk_index,
+            chunk_content, search_text
      FROM current_chunks`,
   );
   await db.exec(
@@ -8,6 +8,7 @@ import {
 
 import { EMBEDDING_DIMENSION } from "../constants.ts";
 import { asHelpful } from "../errors.ts";
+import { logger } from "../output/logger.ts";
 import { applyMigrations } from "./migrations.ts";
 
 /** Subset of @duckdb/node-api types we feed into / get out of queries. */
@@ -18,25 +19,51 @@ export interface RunResult {
   changes: number;
 }
 
+/** Tunables for retrying a `DuckDBInstance.create()` call when another process holds the file lock. */
+export interface LockRetryOptions {
+  maxAttempts: number;
+  baseDelayMs: number;
+  maxDelayMs: number;
+}
+
+export const DEFAULT_LOCK_RETRY: LockRetryOptions = {
+  maxAttempts: 30,
+  baseDelayMs: 100,
+  maxDelayMs: 2000,
+};
+
 /**
- * Thin async wrapper around a DuckDB connection. Uses ?N placeholders
- * (translated to $N internally) and returns plain JS objects.
+ * Async wrapper around DuckDB with **lazy claim / release** semantics so
+ * concurrent membot processes don't deadlock on the file lock.
+ *
+ * Lifecycle:
+ * - construct with a path; nothing is opened yet
+ * - first query call (`exec`/`queryGet`/`queryAll`/`queryRun`) lazily opens
+ *   DuckDB, retrying with backoff on lock conflicts, and runs migrations
+ * - `release()` closes the underlying DuckDB instance but leaves the
+ *   wrapper reusable — the next query reopens transparently
+ * - `close()` is permanent: subsequent queries throw
+ *
+ * Long-running flows (MCP server, daemon, multi-file `add`) call `release()`
+ * between units of work so other consumers can grab the lock.
  */
 export class DbConnection {
-  private readonly conn: DuckDBNativeConnection;
-  private readonly instance: DuckDBInstance | null;
   readonly path: string;
+  private readonly retry: LockRetryOptions;
+  private conn: DuckDBNativeConnection | null = null;
+  private instance: DuckDBInstance | null = null;
   private closed = false;
+  private opening: Promise<void> | null = null;
 
-  constructor(conn: DuckDBNativeConnection, instance: DuckDBInstance | null, path: string) {
-    this.conn = conn;
-    this.instance = instance;
+  constructor(path: string, retry: LockRetryOptions = DEFAULT_LOCK_RETRY) {
    this.path = path;
+    this.retry = retry;
  }
 
  /** Run a parameter-less SQL statement (DDL, PRAGMA, batch SQL). */
  async exec(sql: string): Promise<void> {
-    await this.conn.run(sql);
+    const conn = await this.ensureOpen();
+    await conn.run(sql);
  }
 
  /** Run a query and return the first row, or null. SQL uses `?N` placeholders. */
@@ -44,7 +71,8 @@ export class DbConnection {
     sql: string,
     ...params: SqlParam[]
   ): Promise<T | null> {
-    const result = await this.conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
+    const conn = await this.ensureOpen();
+    const result = await conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
     const rows = (await result.getRowObjectsJS()) as Record<string, unknown>[];
     if (!rows[0]) return null;
     return convertRow(rows[0]) as T;
@@ -55,30 +83,169 @@ export class DbConnection {
     sql: string,
     ...params: SqlParam[]
   ): Promise<T[]> {
-    const result = await this.conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
+    const conn = await this.ensureOpen();
+    const result = await conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
     const rows = (await result.getRowObjectsJS()) as Record<string, unknown>[];
     return rows.map(convertRow) as T[];
   }
 
   /** Run a mutation (INSERT/UPDATE/DELETE) and report rows changed. SQL uses `?N` placeholders. */
   async queryRun(sql: string, ...params: SqlParam[]): Promise<RunResult> {
-    const result = await this.conn.run(translateParams(sql), flattenParams(params) as DuckDBValue[]);
+    const conn = await this.ensureOpen();
+    const result = await conn.run(translateParams(sql), flattenParams(params) as DuckDBValue[]);
     return { changes: Number(result.rowsChanged) };
   }
 
-  /** Disconnect and close the owning DuckDB instance. Idempotent; subsequent calls are no-ops. */
+  /**
+   * Release the underlying DuckDB instance so other processes can claim
+   * the lock. The wrapper stays usable: the next query reopens. Idempotent
+   * — calling it on an already-released wrapper is a no-op.
+   */
+  async release(): Promise<void> {
+    if (this.closed) return;
+    // If an open is in-flight, wait for it so we don't leave a stray instance behind.
+    if (this.opening) {
+      try {
+        await this.opening;
+      } catch {
+        // ensureOpen already cleared state on failure
+        return;
+      }
+    }
+    this.disposeHandles();
+  }
+
+  /** Permanently close. Subsequent queries throw. */
   async close(): Promise<void> {
     if (this.closed) return;
     this.closed = true;
-    this.conn.disconnectSync();
+    if (this.opening) {
+      try {
+        await this.opening;
+      } catch {
+        return;
+      }
+    }
+    this.disposeHandles();
+  }
+
+  private disposeHandles(): void {
+    if (this.conn) {
+      try {
+        this.conn.disconnectSync();
+      } catch {
+        // best effort
+      }
+      this.conn = null;
+    }
     if (this.instance) {
       try {
         this.instance.closeSync();
       } catch {
         // best effort
       }
+      this.instance = null;
+    }
+  }
+
+  private async ensureOpen(): Promise<DuckDBNativeConnection> {
+    if (this.closed) {
+      throw new Error(`DbConnection at ${this.path} has been closed`);
+    }
+    if (this.conn) return this.conn;
+    if (!this.opening) {
+      this.opening = this.openOnce().finally(() => {
+        this.opening = null;
+      });
+    }
+    await this.opening;
+    if (!this.conn) {
+      throw new Error(`DbConnection at ${this.path} failed to open`);
+    }
+    return this.conn;
+  }
+
+  private async openOnce(): Promise<void> {
+    const instance = await createInstanceWithRetry(this.path, this.retry);
+    try {
+      const conn = await instance.connect();
+      this.instance = instance;
+      this.conn = conn;
+      await applyMigrations(this);
+    } catch (err) {
+      // On any failure after instance creation, release the lock immediately.
+      try {
+        instance.closeSync();
+      } catch {
+        // best effort
+      }
+      this.instance = null;
+      this.conn = null;
+      throw err;
+    }
+  }
+}
+
+/** True if the error message looks like DuckDB's lock-conflict shape. */
+export function isLockConflictError(err: unknown): boolean {
+  const msg = err instanceof Error ? err.message : String(err ?? "");
+  return /could not set lock on file|conflicting lock|database is locked/i.test(msg);
+}
+
+/** Sleep helper. */
+function delay(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Run an open-the-DB factory with exponential backoff + jitter when the file
+ * lock is held by another process. Non-lock errors are re-thrown immediately
+ * (wrapped as `HelpfulError`) — only lock conflicts are retried. After
+ * exhausting attempts we throw a `HelpfulError` whose hint names the
+ * concurrent-process problem. Exposed (rather than inlined) so tests can
+ * verify the retry behavior with a fake factory.
+ */
+export async function withLockRetry<T>(
+  factory: () => Promise<T>,
+  path: string,
+  retry: LockRetryOptions = DEFAULT_LOCK_RETRY,
+): Promise<T> {
+  let lastErr: unknown;
+  for (let attempt = 1; attempt <= retry.maxAttempts; attempt++) {
+    try {
+      return await factory();
+    } catch (err) {
+      lastErr = err;
+      if (!isLockConflictError(err)) {
+        throw asHelpful(
+          err,
+          `while opening DuckDB at ${path}`,
+          `Check that ${path} is writable and not held open by another process. Delete the file to start fresh.`,
+          "internal_error",
+        );
+      }
+      if (attempt === retry.maxAttempts) break;
+      const backoff = Math.min(retry.maxDelayMs, retry.baseDelayMs * 2 ** (attempt - 1));
+      const jitter = Math.floor(Math.random() * Math.min(retry.baseDelayMs, backoff));
+      const wait = backoff + jitter;
+      logger.debug(`db: lock held on ${path}, retrying in ${wait}ms (attempt ${attempt}/${retry.maxAttempts})`);
+      await delay(wait);
     }
   }
+  throw asHelpful(
+    lastErr,
+    `while opening DuckDB at ${path} after ${retry.maxAttempts} attempts`,
+    `Another process is holding the database lock. Stop the conflicting process (check for a running 'membot serve' or open DuckDB CLI session) or delete ${path} to start fresh.`,
+    "internal_error",
+  );
+}
+
+/** Open a `DuckDBInstance` for `path`, retrying with backoff on lock conflicts. */
+export function createInstanceWithRetry(
+  path: string,
+  retry: LockRetryOptions = DEFAULT_LOCK_RETRY,
+): Promise<DuckDBInstance> {
+  return withLockRetry(() => DuckDBInstance.create(path), path, retry);
 }
 
 /** Type guard for the JS values DuckDB returns directly without further coercion. */
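With the defaults, `withLockRetry` waits `min(max_delay_ms, base_delay_ms * 2^(attempt-1))` plus up to `base_delay_ms` of random jitter between attempts. A jitter-free sketch of that schedule:

```ts
// Deterministic part of the backoff schedule for the first 8 attempts.
const baseDelayMs = 100;
const maxDelayMs = 2000;
const waits = Array.from({ length: 8 }, (_, i) => Math.min(maxDelayMs, baseDelayMs * 2 ** i));
console.log(waits); // [100, 200, 400, 800, 1600, 2000, 2000, 2000]
```

Across the default 30 attempts that is roughly 51 s of total waiting (excluding jitter) before the final `HelpfulError` fires.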
@@ -149,25 +316,17 @@ function flattenParams(params: SqlParam[]): unknown[] {
 }
 
 /**
- * Open a DuckDB-backed connection for the given file path. Runs all migrations
- * against the connection before returning. Pass `:memory:` for in-process tests.
+ * Construct a lazy DuckDB-backed connection for the given file path. The
+ * underlying DuckDB instance isn't opened until the first query call (which
+ * also runs migrations). To surface lock conflicts at the call site, callers
+ * may probe with `await db.exec("SELECT 1")` immediately after construction.
 */
-export async function openDb(path: string): Promise<DbConnection> {
-  let instance: DuckDBInstance;
-  try {
-    instance = await DuckDBInstance.create(path);
-  } catch (err) {
-    throw asHelpful(
-      err,
-      `while opening DuckDB at ${path}`,
-      `Check that ${path} is writable and not held open by another process. Delete the file to start fresh.`,
-      "internal_error",
-    );
-  }
-  const conn = await instance.connect();
-  const wrapper = new DbConnection(conn, instance, path);
-  await applyMigrations(wrapper);
-  return wrapper;
+export async function openDb(path: string, retry: LockRetryOptions = DEFAULT_LOCK_RETRY): Promise<DbConnection> {
+  const db = new DbConnection(path, retry);
+  // Eager probe so initial open errors (lock conflict, bad path, migration
+  // failure) surface here rather than at the first query in user code.
+  await db.exec("SELECT 1");
+  return db;
 }
 
 export { EMBEDDING_DIMENSION };
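A hedged usage sketch of the new lifecycle (path and queries are illustrative, not from the package):

```ts
const db = await openDb("/tmp/index.duckdb"); // eager "SELECT 1" probe; lock conflicts surface here
await db.queryAll("SELECT 42 AS n");          // runs on the already-open instance
await db.release();                           // drops the file lock; the wrapper stays usable
await db.queryGet("SELECT 1 AS ok");          // transparently reopens, retrying on lock conflicts
await db.close();                             // permanent — any further query throws
```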
@@ -16,12 +16,31 @@ export interface Migration {
 
 const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002];
 
+/**
+ * Process-level cache of paths whose migrations have been applied (or
+ * confirmed already-current) in this process. With lazy-claim DB connections,
+ * `applyMigrations` runs on every reopen — caching here keeps the DDL/SELECT
+ * traffic and "migration: applied" log lines off the hot reopen path.
+ * Cleared by `forgetMigrations` so tests can simulate a fresh process.
+ */
+const checkedPaths = new Set<string>();
+
+/** Reset the per-process migration cache. Test-only — production code never calls this. */
+export function forgetMigrations(path?: string): void {
+  if (path === undefined) checkedPaths.clear();
+  else checkedPaths.delete(path);
+}
+
 /**
  * Apply every unapplied migration in id order. Tracks applied ids in
  * `_migrations`. Each successful run is logged via the shared logger so a
- * user upgrading membot can see exactly what changed in their store.
+ * user upgrading membot can see exactly what changed in their store. The
+ * first call for a given DB path checks the table; subsequent calls in the
+ * same process short-circuit via `checkedPaths`.
 */
 export async function applyMigrations(db: DbConnection): Promise<void> {
+  if (checkedPaths.has(db.path)) return;
+
   await db.exec(`CREATE TABLE IF NOT EXISTS _migrations (
     id INTEGER PRIMARY KEY,
     name TEXT NOT NULL,
@@ -42,4 +61,6 @@ export async function applyMigrations(db: DbConnection): Promise<void> {
     await db.queryRun(`INSERT INTO _migrations(id, name) VALUES (?1, ?2)`, migration.id, migration.name);
     logger.info(`migration: applied ${String(migration.id).padStart(3, "0")}-${migration.name}`);
   }
+
+  checkedPaths.add(db.path);
 }
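A sketch of the test-side contract this enables (the cache is invisible in production):

```ts
// First open of a path runs/verifies migrations; a same-process reopen
// short-circuits via checkedPaths.
const db = await openDb("/tmp/index.duckdb");
await db.release();
await db.exec("SELECT 1"); // reopen: applyMigrations returns immediately

forgetMigrations("/tmp/index.duckdb"); // test-only: simulate a fresh process
await db.release();
await db.exec("SELECT 1"); // this reopen re-checks the _migrations table
```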
@@ -1,7 +1,7 @@
 import { existsSync } from "node:fs";
 import { join } from "node:path";
 import { env, type FeatureExtractionPipeline, pipeline } from "@huggingface/transformers";
-import { EMBEDDING_DIMENSION, EMBEDDING_MODEL } from "../constants.ts";
+import { EMBEDDING_BATCH_SIZE, EMBEDDING_DIMENSION, EMBEDDING_MODEL } from "../constants.ts";
 import { HelpfulError } from "../errors.ts";
 import { logger } from "../output/logger.ts";
 
@@ -67,20 +67,29 @@ async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
  * Embed an array of texts to L2-normalized vectors with the configured
  * model. Throws a HelpfulError when the model's dimension doesn't match
  * EMBEDDING_DIMENSION (the value baked into the DB schema).
+ *
+ * Inputs are sliced into windows of EMBEDDING_BATCH_SIZE so a single
+ * forward pass never has to allocate activations for arbitrarily many
+ * chunks — large files (hundreds of chunks) otherwise OOM the WASM heap.
 */
 export async function embed(texts: string[], model: string = EMBEDDING_MODEL): Promise<number[][]> {
   if (texts.length === 0) return [];
   const extractor = await getPipeline(model);
-  const output = await extractor(texts, { pooling: "mean", normalize: true });
-  const data = output.tolist() as number[][];
-  if (data[0] && data[0].length !== EMBEDDING_DIMENSION) {
-    throw new HelpfulError({
-      kind: "internal_error",
-      message: `embedding model ${model} returned ${data[0].length}-dim vectors, expected ${EMBEDDING_DIMENSION}`,
-      hint: `Set config.embedding_model to a ${EMBEDDING_DIMENSION}-dim model (default: ${EMBEDDING_MODEL}).`,
-    });
+  const out: number[][] = [];
+  for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
+    const slice = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
+    const output = await extractor(slice, { pooling: "mean", normalize: true });
+    const data = output.tolist() as number[][];
+    if (out.length === 0 && data[0] && data[0].length !== EMBEDDING_DIMENSION) {
+      throw new HelpfulError({
+        kind: "internal_error",
+        message: `embedding model ${model} returned ${data[0].length}-dim vectors, expected ${EMBEDDING_DIMENSION}`,
+        hint: `Set config.embedding_model to a ${EMBEDDING_DIMENSION}-dim model (default: ${EMBEDDING_MODEL}).`,
+      });
+    }
+    for (const vec of data) out.push(vec);
   }
-  return data;
+  return out;
 }
 
 /** Embed a single text — convenience wrapper for query-time embedding. */
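The windowing arithmetic as a standalone sketch — the 168-chunk repro from the constants comment becomes eleven bounded forward passes:

```ts
const EMBEDDING_BATCH_SIZE = 16;
const texts = Array.from({ length: 168 }, (_, i) => `chunk ${i}`);
const passes: number[] = [];
for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
  passes.push(texts.slice(i, i + EMBEDDING_BATCH_SIZE).length);
}
console.log(passes.length); // 11
console.log(passes);        // ten passes of 16, then one of 8 (168 = 10 × 16 + 8)
```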
@@ -44,12 +44,36 @@ export interface IngestResult {
   failed: number;
 }
 
+/**
+ * Per-entry hooks invoked while a resolved source is being ingested. Used by
+ * `add` to drive a single shared progress reporter across many sources
+ * without re-resolving anything. `onEntryStart` fires before the pipeline
+ * touches an entry; `onEntryComplete` fires after the result (ok / unchanged
+ * / failed) is known. Both are optional.
+ */
+export interface IngestCallbacks {
+  onEntryStart?: (label: string) => void;
+  onEntryComplete?: (entry: IngestEntryResult) => void;
+}
+
+/**
+ * Count how many per-entry results a `ResolvedSource` will produce. Used by
+ * `add` to size a shared progress bar before ingestion starts.
+ */
+export function countResolvedEntries(resolved: ResolvedSource): number {
+  if (resolved.kind === "local-files") return resolved.entries.length;
+  return 1;
+}
+
 /**
  * Top-level ingest orchestrator. Resolves the source arg, dispatches to the
  * right reader (local / remote / inline), runs the pipeline (convert →
  * describe → chunk → embed → write), and returns one entry per matched
  * file. Partial failures are reported per-entry; the entire call doesn't
- * abort because one URL or PDF is bad.
+ * abort because one URL or PDF is bad. Drives `ctx.progress` itself, so
+ * single-source SDK callers get a usable indicator out of the box. When
+ * orchestrating many sources at once (e.g. `add`), call `resolveSource` +
+ * `ingestResolved` directly so one shared progress spans every entry.
 */
 export async function ingest(input: IngestInput, ctx: AppContext): Promise<IngestResult> {
   const resolved = await resolveSource(input.source, {
@@ -57,17 +81,40 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
     exclude: input.exclude,
     followSymlinks: input.follow_symlinks ?? true,
   });
+  const total = countResolvedEntries(resolved);
+  ctx.progress.start(total, "ingest");
+  const callbacks: IngestCallbacks = {
+    onEntryStart: (label) => ctx.progress.tick(label),
+  };
+  const result = await ingestResolved(resolved, input, ctx, callbacks);
+  const okCount = result.ok;
+  const unchangedSuffix = result.unchanged > 0 ? ` (${result.unchanged} unchanged)` : "";
+  ctx.progress.done(`ingested ${okCount}/${result.total}${unchangedSuffix}`);
+  return result;
+}
 
+/**
+ * Run the ingest pipeline against a pre-resolved source. Same as `ingest`
+ * but skips the resolve step and delegates progress reporting to the caller
+ * via `callbacks`. This is the entry point used by multi-source orchestrators
+ * (`add`) so a single progress bar can span every entry across every source.
+ */
+export async function ingestResolved(
+  resolved: ResolvedSource,
+  input: IngestInput,
+  ctx: AppContext,
+  callbacks?: IngestCallbacks,
+): Promise<IngestResult> {
   const refreshSec = parseDuration(input.refresh_frequency);
   const force = input.force === true;
 
   if (resolved.kind === "inline") {
-    return ingestInline(resolved.text, input, ctx, refreshSec);
+    return ingestInline(resolved.text, input, ctx, refreshSec, callbacks);
   }
   if (resolved.kind === "url") {
-    return ingestUrl(resolved.url, input, ctx, refreshSec, force);
+    return ingestUrl(resolved.url, input, ctx, refreshSec, force, callbacks);
   }
-  return ingestLocalFiles(resolved, input, ctx, refreshSec, force);
+  return ingestLocalFiles(resolved, input, ctx, refreshSec, force, callbacks);
 }
 
 /** Ingest a single inline blob (source_type='inline'). */
@@ -76,8 +123,10 @@ async function ingestInline(
   input: IngestInput,
   ctx: AppContext,
   refreshSec: number | null,
+  callbacks?: IngestCallbacks,
 ): Promise<IngestResult> {
   const logicalPath = input.logical_path ?? defaultInlinePath();
+  callbacks?.onEntryStart?.(logicalPath);
   const bytes = new TextEncoder().encode(text);
   const sha = sha256Hex(bytes);
   const result: IngestEntryResult = {
@@ -113,6 +162,7 @@ async function ingestInline(
     result.status = "failed";
     result.error = errorMessage(err);
   }
+  callbacks?.onEntryComplete?.(result);
   return summarize([result]);
 }
 
@@ -123,6 +173,7 @@ async function ingestUrl(
   ctx: AppContext,
   refreshSec: number | null,
   force: boolean,
+  callbacks?: IngestCallbacks,
 ): Promise<IngestResult> {
   const mcpxAdapter = ctx.mcpx
     ? {
@@ -137,6 +188,7 @@ async function ingestUrl(
     : null;
 
   const logicalPath = input.logical_path ?? defaultLogicalForUrl(url);
+  callbacks?.onEntryStart?.(url);
   const result: IngestEntryResult = {
     source_path: url,
     logical_path: logicalPath,
@@ -160,6 +212,7 @@ async function ingestUrl(
     if (cur && cur.source_sha256 === fetched.sha256) {
       result.status = "unchanged";
       result.version_id = cur.version_id;
+      callbacks?.onEntryComplete?.(result);
       return summarize([result]);
     }
   }
@@ -185,6 +238,7 @@ async function ingestUrl(
     result.status = "failed";
     result.error = errorMessage(err);
   }
+  callbacks?.onEntryComplete?.(result);
   return summarize([result]);
 }
 
@@ -195,8 +249,16 @@ async function ingestLocalFiles(
   ctx: AppContext,
   refreshSec: number | null,
   force: boolean,
+  callbacks?: IngestCallbacks,
 ): Promise<IngestResult> {
   if (resolved.entries.length === 0) {
+    // `filtered: true` means the source resolved successfully but every
+    // entry was dropped by --exclude / --include / DEFAULT_EXCLUDES.
+    // Treat that as a silent no-op: shell-expanded globs commonly hand
+    // us individual files we should skip without aborting the batch.
+    if (resolved.filtered) {
+      return { ingested: [], total: 0, ok: 0, unchanged: 0, failed: 0 };
+    }
     throw new HelpfulError({
       kind: "input_error",
       message: `Glob/path matched 0 files`,
@@ -205,11 +267,10 @@ async function ingestLocalFiles(
   }
 
   const results: IngestEntryResult[] = [];
-  ctx.progress.start(resolved.entries.length, "ingest");
   const isMulti = resolved.entries.length > 1;
 
   for (const entry of resolved.entries) {
-    ctx.progress.tick(entry.relPathFromBase);
+    callbacks?.onEntryStart?.(entry.relPathFromBase);
     const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
     const result: IngestEntryResult = {
       source_path: entry.absPath,
@@ -233,6 +294,7 @@ async function ingestLocalFiles(
         result.status = "unchanged";
         result.version_id = cur.version_id;
         results.push(result);
+        callbacks?.onEntryComplete?.(result);
         continue;
       }
     }
@@ -257,13 +319,15 @@ async function ingestLocalFiles(
     } catch (err) {
       result.status = "failed";
       result.error = errorMessage(err);
+    } finally {
+      // Release the DB lock between files in a directory/glob walk so
+      // concurrent processes can wedge in mid-batch. The next entry's
+      // first DB call reopens (cheap — same-process reopen).
+      await ctx.db.release();
     }
     results.push(result);
+    callbacks?.onEntryComplete?.(result);
   }
-  const okCount = results.filter((r) => r.status === "ok").length;
-  const unchangedCount = results.filter((r) => r.status === "unchanged").length;
-  const suffix = unchangedCount > 0 ? ` (${unchangedCount} unchanged)` : "";
-  ctx.progress.done(`ingested ${okCount}/${results.length}${suffix}`);
 
   return summarize(results);
 }
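From the SDK side, the split the comments describe looks roughly like this (a hedged sketch; `ctx` is an `AppContext` from `buildContext`, and only the `source` field of `IngestInput` is shown):

```ts
// One source: ingest() drives ctx.progress by itself.
await ingest({ source: "./docs" }, ctx);

// Many sources: resolve first so one progress bar spans every entry.
const sources = ["./docs", "https://example.com/spec.html"];
const resolved = [];
for (const s of sources) resolved.push(await resolveSource(s));

const total = resolved.reduce((n, r) => n + countResolvedEntries(r), 0);
ctx.progress.start(total, "ingest");
for (let i = 0; i < resolved.length; i++) {
  await ingestResolved(resolved[i], { source: sources[i] }, ctx, {
    onEntryStart: (label) => ctx.progress.tick(label),
  });
}
ctx.progress.done();
```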
@@ -1,12 +1,26 @@
 import { realpath, stat } from "node:fs/promises";
+import { homedir } from "node:os";
 import { isAbsolute, join, relative, resolve, sep } from "node:path";
 import picomatch from "picomatch";
 import { asHelpful, HelpfulError } from "../errors.ts";
 
+/**
+ * Expand a leading `~` or `~/` to the user's home directory. The shell does
+ * this for us when the arg is unquoted, but `bun dev add "~/foo/*.md"` passes
+ * the literal `~` through, and `path.resolve("~/foo")` treats `~` as a
+ * regular directory name. We patch it up so quoted args work like users
+ * expect. Inline literals and URLs are caught earlier and never reach here.
+ */
+function expandHome(p: string): string {
+  if (p === "~") return homedir();
+  if (p.startsWith("~/") || p.startsWith(`~${sep}`)) return join(homedir(), p.slice(2));
+  return p;
+}
+
 export type ResolvedSource =
   | { kind: "inline"; text: string; logicalHint: string | null }
   | { kind: "url"; url: string; logicalHint: string | null }
-  | { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string };
+  | { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string; filtered?: boolean };
 
 export interface ResolvedLocalEntry {
   /** Absolute filesystem path (post-realpath). */
@@ -28,6 +42,45 @@ export interface ResolveOptions {
 
 const DEFAULT_EXCLUDES = ["**/node_modules/**", "**/.git/**", "**/.DS_Store", "**/dist/**", "**/.cache/**"];
 
+/**
+ * Expand a user-supplied include/exclude pattern into a small set of
+ * gitignore-ish equivalents so common spellings all do the intuitive thing.
+ * Examples (all exclude the whole subtree): a bare name like `node_modules`,
+ * a trailing-slash form like `node_modules/`, the shell-style `node_modules`
+ * followed by single-star, the canonical doublestar forms — every spelling
+ * a user would reasonably reach for ends up matching nested files.
+ * Patterns starting with `**`-slash, `/`, or `./` are considered anchored
+ * and are not given an any-depth variant. `DEFAULT_EXCLUDES` are already
+ * canonical and bypass this helper.
+ */
+export function expandUserPattern(p: string): string[] {
+  const out = new Set<string>([p]);
+  const anchored = p.startsWith("**/") || p.startsWith("/") || p.startsWith("./");
+  const hasSlash = p.includes("/");
+  const hasGlob = /[*?[\]{}!]/.test(p);
+  // Path-like patterns ("foo/bar", "node_modules/*") imply the user is
+  // thinking about a directory tree — match at any depth. Bare globs like
+  // "*.md" are left alone so they keep their anchored top-level meaning.
+  if (hasSlash && !anchored) out.add(`**/${p}`);
+  if (p.endsWith("/*") && !p.endsWith("/**/*")) {
+    const base = p.slice(0, -2);
+    out.add(`${base}/**`);
+    if (!anchored) out.add(`**/${base}/**`);
+  }
+  if (p.endsWith("/")) {
+    const base = p.slice(0, -1);
+    out.add(`${base}/**`);
+    if (!anchored) out.add(`**/${base}/**`);
+  }
+  // Bare name with no slashes and no glob chars (e.g. "node_modules",
+  // "dist") → treat as a directory match anywhere in the tree.
+  if (!hasSlash && !hasGlob) {
+    out.add(`**/${p}`);
+    out.add(`**/${p}/**`);
+  }
+  return [...out];
+}
+
 /**
  * Polymorphic source-arg expander. Accepts:
  * - "inline:<text>" → inline literal
@@ -48,20 +101,28 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
     return { kind: "url", url: source, logicalHint: null };
   }
 
+  source = expandHome(source);
+
   const followSymlinks = options.followSymlinks !== false;
-  const userIncludes = options.include
+  const userIncludesRaw = options.include
     ? options.include
         .split(",")
        .map((g) => g.trim())
        .filter(Boolean)
    : [];
-  const excludeMatchers = [
-    ...DEFAULT_EXCLUDES,
-    ...(options.exclude ?? "")
-      .split(",")
-      .map((g) => g.trim())
-      .filter(Boolean),
-  ];
+  const userExcludesRaw = (options.exclude ?? "")
+    .split(",")
+    .map((g) => g.trim())
+    .filter(Boolean);
+  const userIncludesExpanded = userIncludesRaw.flatMap(expandUserPattern);
+  const userExcludesExpanded = userExcludesRaw.flatMap(expandUserPattern);
+  const excludeMatchers = [...DEFAULT_EXCLUDES, ...userExcludesExpanded];
+  // Single-file matchers run against the absolute path so shell-expanded
+  // globs (where each file lands here individually) still honor excludes.
+  const isExcludeAbs = picomatch(excludeMatchers, { dot: false });
+  const isIncludeAbs = userIncludesExpanded.length
+    ? picomatch(userIncludesExpanded, { dot: false, nocase: false })
+    : null;
 
   if (isGlob(source)) {
     const base = globBase(source);
@@ -71,7 +132,7 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
     // Source glob acts as a hard filter; user includes (if any) further
     // narrow the result via AND. Pass them as a separate matcher so the
     // two sets aren't picomatch-OR'd together.
-    const extraIncludes = userIncludes.length > 0 ? [userIncludes] : [];
+    const extraIncludes = userIncludesExpanded.length > 0 ? [userIncludesExpanded] : [];
     return walk(realBase, [remainder], excludeMatchers, followSymlinks, extraIncludes);
   } catch (err) {
     throw asHelpful(
@@ -98,6 +159,16 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
 
   if (st.isFile()) {
     const real = await realpath(abs);
+    // Shell-expanded globs (e.g. zsh expanding `~/foo/**/*.md`) deliver
+    // each match here individually, so this branch must enforce both
+    // DEFAULT_EXCLUDES and the user's own --include/--exclude. Otherwise
+    // `node_modules` paths slip through whenever the shell expanded for us.
+    if (isExcludeAbs(real)) {
+      return { kind: "local-files", basePath: real, entries: [], filtered: true };
+    }
+    if (isIncludeAbs && !isIncludeAbs(real)) {
+      return { kind: "local-files", basePath: real, entries: [], filtered: true };
+    }
     return {
       kind: "local-files",
       basePath: real,
@@ -107,7 +178,7 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
 
   if (st.isDirectory()) {
     const realBase = await realpath(abs);
-    const dirIncludes = userIncludes.length > 0 ? userIncludes : ["**/*"];
+    const dirIncludes = userIncludesExpanded.length > 0 ? userIncludesExpanded : ["**/*"];
     return walk(realBase, dirIncludes, excludeMatchers, followSymlinks);
   }
 
@@ -170,6 +241,14 @@ async function walk(
   const isInclude = picomatch(includes, { dot: false, nocase: false });
   const extraMatchers = extraIncludeSets.map((set) => picomatch(set, { dot: false, nocase: false }));
   const isExclude = excludes.length ? picomatch(excludes, { dot: false }) : null;
+  // Directory-prune patterns: derived from excludes by stripping a trailing
+  // `/**` or `/*`. Without this we descend into massive subtrees (e.g.
+  // every `node_modules/` under a workspace) before discarding files one
+  // by one — which on real machines presents as a hang.
+  const dirPrunePatterns = excludes
+    .map((p) => (p.endsWith("/**") ? p.slice(0, -3) : p.endsWith("/*") ? p.slice(0, -2) : p))
+    .filter((p) => p.length > 0);
+  const isExcludeDir = dirPrunePatterns.length ? picomatch(dirPrunePatterns, { dot: false }) : null;
 
   const queue: string[] = [base];
   while (queue.length > 0) {
@@ -191,6 +270,8 @@ async function walk(
     }
     if (st.isSymbolicLink() && !followSymlinks) continue;
     if (st.isDirectory()) {
+      const rel = relative(base, real);
+      if (rel.length > 0 && isExcludeDir?.(rel)) continue;
       let names: string[];
       try {
         names = await readdir(real);
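Concretely, the expansions `expandUserPattern` produces for common spellings (Set insertion order shown):

```ts
expandUserPattern("node_modules");
// → ["node_modules", "**/node_modules", "**/node_modules/**"]

expandUserPattern("node_modules/*");
// → ["node_modules/*", "**/node_modules/*", "node_modules/**", "**/node_modules/**"]

expandUserPattern("*.md");
// → ["*.md"] — bare glob keeps its anchored top-level meaning

expandUserPattern("**/dist/**");
// → ["**/dist/**"] — already canonical, no any-depth variant added
```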
package/src/mount/mcp.ts CHANGED
@@ -48,6 +48,14 @@ export function mountAsMcpTool<I extends z.ZodObject, O extends z.ZodTypeAny>(
       };
     } catch (err) {
       return renderMcpError(err);
+    } finally {
+      // Drop the DuckDB lock between MCP tool calls so concurrent CLI
+      // or daemon callers can claim it. The next tool call reopens.
+      try {
+        await ctx.db.release();
+      } catch {
+        // best effort — never let release failures mask a tool result
+      }
     }
   },
 );
@@ -1,5 +1,12 @@
 import { z } from "zod";
-import { ingest } from "../ingest/ingest.ts";
+import {
+  countResolvedEntries,
+  type IngestCallbacks,
+  type IngestEntryResult,
+  type IngestResult,
+  ingestResolved,
+} from "../ingest/ingest.ts";
+import { type ResolvedSource, resolveSource } from "../ingest/source-resolver.ts";
 import { colors } from "../output/formatter.ts";
 import { defineOperation } from "./types.ts";
 
@@ -97,21 +104,122 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
   },
   handler: async (input, ctx) => {
     const { sources, ...rest } = input;
-    const aggregated = {
-      ingested: [] as Awaited<ReturnType<typeof ingest>>["ingested"],
+    const followSymlinks = rest.follow_symlinks ?? true;
+
+    // Phase 1: resolve every source upfront so the shared progress bar
+    // knows its total. A resolve failure (bad path, glob with no base) is
+    // captured per-source so one bad arg doesn't abort the whole batch.
+    type ResolveOutcome = { source: string; resolved: ResolvedSource } | { source: string; error: Error };
+    const outcomes: ResolveOutcome[] = [];
+    for (const source of sources) {
+      try {
+        const resolved = await resolveSource(source, {
+          include: rest.include,
+          exclude: rest.exclude,
+          followSymlinks,
+        });
+        outcomes.push({ source, resolved });
+      } catch (err) {
+        outcomes.push({ source, error: err instanceof Error ? err : new Error(String(err)) });
+      }
+    }
+
+    const total = outcomes.reduce((n, o) => ("error" in o ? n + 1 : n + countResolvedEntries(o.resolved)), 0);
+
+    const aggregated: IngestResult = {
+      ingested: [],
       total: 0,
       ok: 0,
       unchanged: 0,
       failed: 0,
     };
-    for (const source of sources) {
-      const r = await ingest({ ...rest, source }, ctx);
-      aggregated.ingested.push(...r.ingested);
-      aggregated.total += r.total;
-      aggregated.ok += r.ok;
-      aggregated.unchanged += r.unchanged;
-      aggregated.failed += r.failed;
+
+    ctx.progress.start(total, "ingest");
+    const callbacks: IngestCallbacks = {
+      onEntryStart: (label) => ctx.progress.tick(label),
+      onEntryComplete: (entry) => ctx.progress.entry(formatEntryLine(entry)),
+    };
+
+    for (const outcome of outcomes) {
+      if ("error" in outcome) {
+        const failed: IngestEntryResult = {
+          source_path: outcome.source,
+          logical_path: outcome.source,
+          version_id: null,
+          status: "failed",
+          error: outcome.error.message,
+          mime_type: null,
+          size_bytes: 0,
+          fetcher: "local",
+          source_sha256: "",
+        };
+        callbacks.onEntryStart?.(outcome.source);
+        callbacks.onEntryComplete?.(failed);
+        aggregated.ingested.push(failed);
+        aggregated.total += 1;
+        aggregated.failed += 1;
+        continue;
+      }
+
+      try {
+        const r = await ingestResolved(outcome.resolved, { ...rest, source: outcome.source }, ctx, callbacks);
+        aggregated.ingested.push(...r.ingested);
+        aggregated.total += r.total;
+        aggregated.ok += r.ok;
+        aggregated.unchanged += r.unchanged;
+        aggregated.failed += r.failed;
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        const failed: IngestEntryResult = {
+          source_path: outcome.source,
+          logical_path: outcome.source,
+          version_id: null,
+          status: "failed",
+          error: message,
+          mime_type: null,
+          size_bytes: 0,
+          fetcher: "local",
+          source_sha256: "",
+        };
+        callbacks.onEntryStart?.(outcome.source);
+        callbacks.onEntryComplete?.(failed);
+        aggregated.ingested.push(failed);
+        aggregated.total += 1;
+        aggregated.failed += 1;
+      } finally {
+        // Release the DB lock between sources so other consumers (a
+        // concurrent CLI call, the daemon, or a separate MCP server)
+        // can wedge in. The next source's first DB call reopens.
+        await ctx.db.release();
+      }
     }
+
+    const summary = formatSummary(aggregated);
+    ctx.progress.done(summary);
     return aggregated;
   },
 });
+
+/**
+ * Render the persistent stderr line shown for one completed entry. Mirrors
+ * the glyphs used by the final `console_formatter` so users see the same
+ * status indicators twice (once during ingest on stderr, once in the final
+ * stdout summary).
+ */
+function formatEntryLine(entry: IngestEntryResult): string {
+  if (entry.status === "ok") {
+    return `${colors.green("✓")} ${colors.cyan(entry.logical_path)} ${colors.dim(`(${entry.fetcher}, ${entry.size_bytes}B)`)}`;
+  }
+  if (entry.status === "unchanged") {
+    return `${colors.dim("≡")} ${colors.cyan(entry.logical_path)} ${colors.dim("(unchanged)")}`;
+  }
+  return `${colors.red("✗")} ${entry.source_path} ${colors.dim(entry.error ?? "")}`;
+}
+
+/** Compose the final spinner-success line summarising the whole batch. */
+function formatSummary(r: IngestResult): string {
+  const parts: string[] = [`added ${r.ok}/${r.total}`];
+  if (r.unchanged > 0) parts.push(`${r.unchanged} unchanged`);
+  if (r.failed > 0) parts.push(`${r.failed} failed`);
+  return parts.join(", ");
}
@@ -27,9 +27,19 @@ export const searchOperation = defineOperation({
       version_id: z.string(),
       chunk_index: z.number(),
       snippet: z.string(),
-      score: z.number(),
-      semantic_score: z.number().nullable(),
-      keyword_score: z.number().nullable(),
+      score: z
+        .number()
+        .describe(
+          "Normalized fusion score in [0,1]; 1.0 = chunk was top-1 on both semantic and keyword lists, ~0.5 = top-1 on one",
+        ),
+      semantic_score: z
+        .number()
+        .nullable()
+        .describe("Cosine similarity from the semantic side (0-1), or null if not matched"),
+      keyword_score: z
+        .number()
+        .nullable()
+        .describe("Raw BM25 score from the keyword side (unbounded), or null if not matched"),
     }),
   ),
   mode: z.string(),
@@ -40,7 +50,10 @@ export const searchOperation = defineOperation({
       return colors.dim(`(no hits in ${result.mode} mode)`);
     }
     const blocks = result.hits.map((h) => {
-      const head = `${colors.cyan(h.logical_path)} ${colors.dim(`v=${h.version_id}`)} ${colors.green(`score=${h.score.toFixed(3)}`)}`;
+      const parts = [`score=${h.score.toFixed(3)}`];
+      if (h.semantic_score !== null) parts.push(`sem=${h.semantic_score.toFixed(3)}`);
+      if (h.keyword_score !== null) parts.push(`bm25=${h.keyword_score.toFixed(2)}`);
+      const head = `${colors.cyan(h.logical_path)} ${colors.dim(`v=${h.version_id}`)} ${colors.green(parts.join(" "))}`;
       const snippet = h.snippet
         .split("\n")
         .map((l) => `  ${l}`)
@@ -2,21 +2,48 @@ import { logger } from "./logger.ts";
 import { isSilent, useSpinner } from "./tty.ts";
 
 /**
- * Minimal progress reporter for multi-entry operations (directory/glob ingest,
- * batch refresh). Operations call `start(total)`, then `tick(label)` for each
- * entry, then `done(summary)`.
+ * Progress reporter for multi-entry operations (directory/glob ingest, batch
+ * refresh, multi-source `add`). Operations call `start(total)`, then for each
+ * entry call `tick(label)` (advances the in-flight indicator) and optionally
+ * `entry(line)` (writes a persistent stderr line that survives the spinner),
+ * then `done(summary)`.
 *
- * Interactive: replaces a single spinner line as work happens.
- * Non-interactive: emits `info` lines per entry.
+ * Interactive: replaces a single spinner line as work happens, with an ASCII
+ * bar like `[████░░░░░░] 4/15 (26%) — relative/path.md`.
+ * Non-interactive: emits `info` lines per `tick` and per `entry`.
 */
 export interface Progress {
   start(total: number, label?: string): void;
   tick(label: string): void;
+  entry(line: string): void;
   done(summary?: string): void;
   fail(summary?: string): void;
   info(msg: string): void;
 }
 
+const BAR_WIDTH = 20;
+const LABEL_MAX = 60;
+
+/**
+ * Render a fixed-width ASCII progress bar. Uses block-drawing characters in
+ * interactive mode so the bar reads naturally next to other unicode glyphs.
+ */
+export function renderBar(count: number, total: number, width = BAR_WIDTH): string {
+  if (total <= 0) return `[${"░".repeat(width)}]`;
+  const ratio = Math.min(1, Math.max(0, count / total));
+  const filled = Math.round(ratio * width);
+  return `[${"█".repeat(filled)}${"░".repeat(width - filled)}]`;
+}
+
+/**
+ * Truncate a label from the left so the most-specific tail of a long path
+ * stays visible. Keeps the spinner line on a single terminal row.
+ */
+function truncateLabel(label: string, max = LABEL_MAX): string {
+  if (label.length <= max) return label;
+  return `…${label.slice(label.length - max + 1)}`;
+}
+
 /**
  * Build a `Progress` reporter whose mode is decided once, at call time, from
  * the current TTY state. Use one per multi-entry operation.
@@ -29,26 +56,38 @@ export function createProgress(): Progress {
   const interactive = useSpinner();
   const silent = isSilent();
 
+  const renderSpinnerText = (label: string): string => {
+    const bar = renderBar(count, total);
+    const pct = total > 0 ? Math.floor((count / total) * 100) : 0;
+    const tail = label ? ` — ${truncateLabel(label)}` : "";
+    return `${bar} ${count}/${total} (${pct}%)${tail}`;
+  };
+
   return {
     start(t: number, label?: string) {
       total = t;
       count = 0;
       if (silent) return;
       if (interactive) {
-        spinner = logger.startSpinner(label ? `${label} (0/${total})` : `0/${total}`);
+        const initial = renderSpinnerText(label ?? "");
+        spinner = logger.startSpinner(initial);
       } else if (label) {
-        logger.info(label);
+        logger.info(`${label}: 0/${total}`);
      }
    },
    tick(label: string) {
      count += 1;
      if (silent) return;
      if (interactive && spinner) {
-        spinner.update(`${count}/${total} — ${label}`);
+        spinner.update(renderSpinnerText(label));
      } else {
        logger.info(`[${count}/${total}] ${label}`);
      }
    },
+    entry(line: string) {
+      if (silent) return;
+      logger.info(line);
+    },
    done(summary?: string) {
      if (silent) return;
      if (interactive && spinner) {
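`renderBar` rounds the fill to the nearest cell, so partial progress reads intuitively. A few sample calls at width 10:

```ts
renderBar(0, 15, 10);  // "[░░░░░░░░░░]"
renderBar(4, 15, 10);  // "[███░░░░░░░]" — 4/15 ≈ 0.27, round(2.7) = 3 cells
renderBar(15, 15, 10); // "[██████████]"
renderBar(3, 0, 10);   // "[░░░░░░░░░░]" — total <= 0 renders an empty bar
```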
@@ -40,6 +40,14 @@ export function startDaemon(ctx: AppContext, tickSec: number): () => void {
       await runDueRefreshes(ctx);
     } catch (err) {
       logger.warn(`daemon: tick failed (${err instanceof Error ? err.message : String(err)})`);
+    } finally {
+      // Drop the DuckDB lock between ticks so the CLI / MCP server can
+      // run while the daemon is idle. Next tick reopens transparently.
+      try {
+        await ctx.db.release();
+      } catch {
+        // best effort
+      }
     }
     if (!stopped) setTimeout(loop, intervalMs);
   };
package/src/sdk.ts CHANGED
@@ -14,8 +14,10 @@ export { chunkDeterministic } from "./ingest/chunker.ts";
 export { embed, embedSingle } from "./ingest/embedder.ts";
 export type { FetchedRemote, FetchOptions } from "./ingest/fetcher.ts";
 export { fetchRemote } from "./ingest/fetcher.ts";
-export type { IngestEntryResult, IngestInput, IngestResult } from "./ingest/ingest.ts";
-export { ingest } from "./ingest/ingest.ts";
+export type { IngestCallbacks, IngestEntryResult, IngestInput, IngestResult } from "./ingest/ingest.ts";
+export { countResolvedEntries, ingest, ingestResolved } from "./ingest/ingest.ts";
+export type { ResolvedLocalEntry, ResolvedSource } from "./ingest/source-resolver.ts";
+export { resolveSource } from "./ingest/source-resolver.ts";
 export { buildMcpServer, startHttpServer, startStdioServer } from "./mcp/server.ts";
 export { OPERATIONS } from "./operations/index.ts";
 export type { CliMetadata, Operation } from "./operations/types.ts";
@@ -17,6 +17,11 @@ const SNIPPET_MAX = 300;
  * Reciprocal-rank fusion of semantic and keyword hit lists. Each result is
  * keyed by `(logical_path, version_id, chunk_index)` so the same chunk
  * appearing in both lists gets one fused score = sum of its RRF scores.
+ *
+ * The returned `score` is normalized to [0,1] by dividing by the theoretical
+ * max RRF (`2/(k+1)`, achieved when a chunk is rank-0 on both lists). This
+ * preserves ordering — division is monotonic — but makes the displayed value
+ * interpretable: 1.0 = top-1 on both signals, ~0.5 = top-1 on one.
 */
 export function fuseRRF(
   semantic: SemanticHit[],
@@ -24,6 +29,7 @@ export function fuseRRF(
   options: { k?: number; limit: number },
 ): FusedHit[] {
   const k = options.k ?? 60;
+  const maxRrf = 2 / (k + 1);
   const merged = new Map<
     string,
     {
@@ -89,7 +95,7 @@ export function fuseRRF(
     version_id: h.version_id,
     chunk_index: h.chunk_index,
     snippet: h.snippet,
-    score: round(h.rrf),
+    score: round(h.rrf / maxRrf),
     semantic_score: h.semantic_score,
     keyword_score: h.keyword_score,
   }));
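A worked check of the normalization, assuming the per-list RRF term is `1/(k + rank + 1)` with 0-based ranks (consistent with the comment's maximum of `2/(k+1)`):

```ts
const k = 60;
const rrfAt = (rank: number) => 1 / (k + rank + 1);
const maxRrf = 2 / (k + 1);

console.log((rrfAt(0) + rrfAt(0)) / maxRrf); // 1     — top-1 on both lists
console.log(rrfAt(0) / maxRrf);              // 0.5   — top-1 on one list only
console.log((rrfAt(4) + rrfAt(4)) / maxRrf); // ≈ 0.938 — rank 4 on both; ordering preserved
```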
@@ -1,5 +1,6 @@
 import { rebuildFts } from "../db/chunks.ts";
 import type { DbConnection } from "../db/connection.ts";
+import { asHelpful } from "../errors.ts";
 
 export interface KeywordHit {
   logical_path: string;
@@ -23,9 +24,10 @@ interface RawKeywordRow {
 
 /**
  * BM25 keyword search over `chunks.search_text` via the FTS extension.
- * Returns an empty list when FTS isn't available on this platform the
- * hybrid layer treats missing keyword hits as "no signal" and degrades
- * to semantic-only.
+ * Returns an empty list when FTS isn't available on this platform or the
+ * index is empty — the hybrid layer treats missing keyword hits as "no
+ * signal" and degrades to semantic-only. Genuine SQL/runtime errors are
+ * surfaced as HelpfulError so they don't get silently buried.
 */
 export async function searchKeyword(
   db: DbConnection,
@@ -36,15 +38,15 @@ export async function searchKeyword(
   if (result.kind !== "rebuilt") return [];
 
   const limit = options.limit ?? 50;
+  const sql = `SELECT row_key, logical_path, version_id, chunk_index,
+                      chunk_content, search_text,
+                      fts_main__current_chunks_fts.match_bm25(row_key, ?1) AS bm25_score
+               FROM _current_chunks_fts
+               WHERE fts_main__current_chunks_fts.match_bm25(row_key, ?1) IS NOT NULL
+               ${options.pathPrefix ? "AND logical_path LIKE ?2" : ""}
+               ORDER BY bm25_score DESC
+               LIMIT ${Number(limit)}`;
   try {
-    const sql = `SELECT row_key, logical_path, version_id, chunk_index,
-                        chunk_content, search_text,
-                        fts_main__current_chunks_fts.match_bm25(row_key, ?1) AS bm25_score
-                 FROM _current_chunks_fts
-                 WHERE fts_main__current_chunks_fts.match_bm25(row_key, ?1) IS NOT NULL
-                 ${options.pathPrefix ? "AND logical_path LIKE ?2" : ""}
-                 ORDER BY bm25_score DESC
-                 LIMIT ${Number(limit)}`;
     const rows: RawKeywordRow[] = options.pathPrefix
       ? await db.queryAll<RawKeywordRow>(sql, query, `${options.pathPrefix}%`)
       : await db.queryAll<RawKeywordRow>(sql, query);
@@ -56,7 +58,12 @@ export async function searchKeyword(
       search_text: r.search_text,
       score: Number(r.bm25_score),
     }));
-  } catch {
-    return [];
+  } catch (e) {
+    throw asHelpful(
+      e,
+      "while running BM25 keyword search",
+      "Run `membot reindex` to rebuild the FTS index, then retry the search.",
+      "internal_error",
+    );
   }
 }
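Downstream, the contract is now crisp: an empty list means "no keyword signal", while genuine failures escape as `HelpfulError`. A hedged sketch of a hybrid-layer caller (`semantic` is assumed to come from the vector side; the exact call site isn't shown in this diff):

```ts
const keyword = await searchKeyword(db, "quarterly report", { limit: 50 }); // [] ⇒ FTS unavailable or index empty
const hits = fuseRRF(semantic, keyword, { limit: 10 }); // degrades to semantic-only when keyword is []
```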