membot 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -5
- package/package.json +1 -1
- package/src/config/schemas.ts +7 -0
- package/src/context.ts +5 -1
- package/src/db/chunks.ts +2 -1
- package/src/db/connection.ts +190 -31
- package/src/db/migrations.ts +22 -1
- package/src/ingest/ingest.ts +74 -10
- package/src/ingest/source-resolver.ts +92 -11
- package/src/mount/mcp.ts +8 -0
- package/src/operations/add.ts +118 -10
- package/src/operations/search.ts +26 -4
- package/src/output/progress.ts +47 -8
- package/src/refresh/scheduler.ts +8 -0
- package/src/sdk.ts +4 -2
- package/src/search/hybrid.ts +7 -1
- package/src/search/keyword.ts +20 -13
package/README.md
CHANGED
|
@@ -2,8 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
> Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.
|
|
4
4
|
|
|
5
|
-
[](./LICENSE)
|
|
5
|
+
[](./LICENSE)
|
|
7
6
|
|
|
8
7
|
`membot` is a single-binary CLI and MCP server that gives AI agents a persistent, versioned, searchable context store. Files (markdown, PDFs, DOCX, HTML, URLs, agent-authored notes) are ingested, converted to markdown, chunked, embedded **locally** with `@huggingface/transformers` (WASM, no cloud calls), and indexed in DuckDB with hybrid search (semantic vector + BM25). Every change creates a new version — nothing is overwritten in place.
|
|
9
8
|
|
|
@@ -16,11 +15,9 @@
|
|
|
16
15
|
|
|
17
16
|
```bash
|
|
18
17
|
bun install -g membot
|
|
19
|
-
# or
|
|
20
|
-
npm install -g membot
|
|
21
18
|
```
|
|
22
19
|
|
|
23
|
-
This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global
|
|
20
|
+
This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global Bun install is the supported path.
|
|
24
21
|
|
|
25
22
|
## Quick start
|
|
26
23
|
|
package/package.json
CHANGED
package/src/config/schemas.ts
CHANGED
|
@@ -23,6 +23,12 @@ export const DaemonConfigSchema = z.object({
|
|
|
23
23
|
tick_interval_sec: z.number().int().positive().default(DEFAULTS.DAEMON_TICK_SEC),
|
|
24
24
|
});
|
|
25
25
|
|
|
26
|
+
/**
 * Config section controlling how opening the DuckDB file is retried when
 * another process holds its lock. Every field must be a positive integer.
 */
export const DbLockRetryConfigSchema = (() => {
  // All three knobs share the same validation; only the default differs.
  const positiveIntOr = (fallback: number) => z.number().int().positive().default(fallback);
  return z.object({
    max_attempts: positiveIntOr(30),
    base_delay_ms: positiveIntOr(100),
    max_delay_ms: positiveIntOr(2000),
  });
})();
|
|
31
|
+
|
|
26
32
|
export const MembotConfigSchema = z.object({
|
|
27
33
|
data_dir: z.string().default(defaultMembotHome()),
|
|
28
34
|
embedding_model: z.string().default(EMBEDDING_MODEL),
|
|
@@ -31,6 +37,7 @@ export const MembotConfigSchema = z.object({
|
|
|
31
37
|
llm: LlmConfigSchema.default(() => LlmConfigSchema.parse({})),
|
|
32
38
|
mcpx: McpxConfigSchema.default(() => McpxConfigSchema.parse({})),
|
|
33
39
|
daemon: DaemonConfigSchema.default(() => DaemonConfigSchema.parse({})),
|
|
40
|
+
db_lock_retry: DbLockRetryConfigSchema.default(() => DbLockRetryConfigSchema.parse({})),
|
|
34
41
|
default_refresh_frequency_sec: z.number().int().positive().nullable().default(null),
|
|
35
42
|
});
|
|
36
43
|
|
package/src/context.ts
CHANGED
|
@@ -39,7 +39,11 @@ export async function buildContext(options: BuildContextOptions = {}): Promise<A
|
|
|
39
39
|
|
|
40
40
|
const { config, dataDir, configPath } = await loadConfig({ configFlag: options.configFlag });
|
|
41
41
|
const dbPath = join(dataDir, FILES.INDEX_DUCKDB);
|
|
42
|
-
const db = await openDb(dbPath
|
|
42
|
+
const db = await openDb(dbPath, {
|
|
43
|
+
maxAttempts: config.db_lock_retry.max_attempts,
|
|
44
|
+
baseDelayMs: config.db_lock_retry.base_delay_ms,
|
|
45
|
+
maxDelayMs: config.db_lock_retry.max_delay_ms,
|
|
46
|
+
});
|
|
43
47
|
|
|
44
48
|
const mcpx = await maybeMcpx(config);
|
|
45
49
|
|
package/src/db/chunks.ts
CHANGED
|
@@ -140,7 +140,8 @@ export async function rebuildFts(db: DbConnection): Promise<RebuildFtsResult> {
|
|
|
140
140
|
await db.exec(
|
|
141
141
|
`CREATE TABLE _current_chunks_fts AS
|
|
142
142
|
SELECT (logical_path || '::' || CAST(version_id AS VARCHAR) || '::' || chunk_index) AS row_key,
|
|
143
|
-
logical_path, CAST(version_id AS VARCHAR) AS version_id, chunk_index,
|
|
143
|
+
logical_path, CAST(version_id AS VARCHAR) AS version_id, chunk_index,
|
|
144
|
+
chunk_content, search_text
|
|
144
145
|
FROM current_chunks`,
|
|
145
146
|
);
|
|
146
147
|
await db.exec(
|
package/src/db/connection.ts
CHANGED
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
|
|
9
9
|
import { EMBEDDING_DIMENSION } from "../constants.ts";
|
|
10
10
|
import { asHelpful } from "../errors.ts";
|
|
11
|
+
import { logger } from "../output/logger.ts";
|
|
11
12
|
import { applyMigrations } from "./migrations.ts";
|
|
12
13
|
|
|
13
14
|
/** Subset of @duckdb/node-api types we feed into / get out of queries. */
|
|
@@ -18,25 +19,51 @@ export interface RunResult {
|
|
|
18
19
|
changes: number;
|
|
19
20
|
}
|
|
20
21
|
|
|
22
|
+
/**
 * Knobs for retrying a `DuckDBInstance.create()` call while another process
 * holds the database file lock.
 */
export interface LockRetryOptions {
  /** Upper bound on open attempts. */
  maxAttempts: number;
  /** Starting backoff delay in milliseconds; doubled on each further attempt. */
  baseDelayMs: number;
  /** Cap applied to the exponential backoff delay, in milliseconds. */
  maxDelayMs: number;
}

/** Retry settings used whenever a caller does not supply its own. */
export const DEFAULT_LOCK_RETRY: LockRetryOptions = {
  maxAttempts: 30,
  baseDelayMs: 100,
  maxDelayMs: 2000,
};
|
|
34
|
+
|
|
21
35
|
/**
|
|
22
|
-
*
|
|
23
|
-
*
|
|
36
|
+
* Async wrapper around DuckDB with **lazy claim / release** semantics so
|
|
37
|
+
* concurrent membot processes don't deadlock on the file lock.
|
|
38
|
+
*
|
|
39
|
+
* Lifecycle:
|
|
40
|
+
* - construct with a path; nothing is opened yet
|
|
41
|
+
* - first query call (`exec`/`queryGet`/`queryAll`/`queryRun`) lazily opens
|
|
42
|
+
* DuckDB, retrying with backoff on lock conflicts, and runs migrations
|
|
43
|
+
* - `release()` closes the underlying DuckDB instance but leaves the
|
|
44
|
+
* wrapper reusable — the next query reopens transparently
|
|
45
|
+
* - `close()` is permanent: subsequent queries throw
|
|
46
|
+
*
|
|
47
|
+
* Long-running flows (MCP server, daemon, multi-file `add`) call `release()`
|
|
48
|
+
* between units of work so other consumers can grab the lock.
|
|
24
49
|
*/
|
|
25
50
|
export class DbConnection {
|
|
26
|
-
private readonly conn: DuckDBNativeConnection;
|
|
27
|
-
private readonly instance: DuckDBInstance | null;
|
|
28
51
|
readonly path: string;
|
|
52
|
+
private readonly retry: LockRetryOptions;
|
|
53
|
+
private conn: DuckDBNativeConnection | null = null;
|
|
54
|
+
private instance: DuckDBInstance | null = null;
|
|
29
55
|
private closed = false;
|
|
56
|
+
private opening: Promise<void> | null = null;
|
|
30
57
|
|
|
31
|
-
constructor(
|
|
32
|
-
this.conn = conn;
|
|
33
|
-
this.instance = instance;
|
|
58
|
+
constructor(path: string, retry: LockRetryOptions = DEFAULT_LOCK_RETRY) {
|
|
34
59
|
this.path = path;
|
|
60
|
+
this.retry = retry;
|
|
35
61
|
}
|
|
36
62
|
|
|
37
63
|
/**
 * Execute a SQL statement that takes no parameters (DDL, PRAGMA, batch SQL).
 * Obtains the connection through `ensureOpen()` first, so a not-yet-open
 * wrapper is opened on demand.
 */
async exec(sql: string): Promise<void> {
  const native = await this.ensureOpen();
  await native.run(sql);
}
|
|
41
68
|
|
|
42
69
|
/** Run a query and return the first row, or null. SQL uses `?N` placeholders. */
|
|
@@ -44,7 +71,8 @@ export class DbConnection {
|
|
|
44
71
|
sql: string,
|
|
45
72
|
...params: SqlParam[]
|
|
46
73
|
): Promise<T | null> {
|
|
47
|
-
const
|
|
74
|
+
const conn = await this.ensureOpen();
|
|
75
|
+
const result = await conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
|
|
48
76
|
const rows = (await result.getRowObjectsJS()) as Record<string, unknown>[];
|
|
49
77
|
if (!rows[0]) return null;
|
|
50
78
|
return convertRow(rows[0]) as T;
|
|
@@ -55,30 +83,169 @@ export class DbConnection {
|
|
|
55
83
|
sql: string,
|
|
56
84
|
...params: SqlParam[]
|
|
57
85
|
): Promise<T[]> {
|
|
58
|
-
const
|
|
86
|
+
const conn = await this.ensureOpen();
|
|
87
|
+
const result = await conn.runAndReadAll(translateParams(sql), flattenParams(params) as DuckDBValue[]);
|
|
59
88
|
const rows = (await result.getRowObjectsJS()) as Record<string, unknown>[];
|
|
60
89
|
return rows.map(convertRow) as T[];
|
|
61
90
|
}
|
|
62
91
|
|
|
63
92
|
/**
 * Execute a mutation (INSERT/UPDATE/DELETE) and report how many rows it
 * changed. SQL uses `?N` placeholders, which are rewritten by
 * `translateParams` before the statement is handed to DuckDB.
 */
async queryRun(sql: string, ...params: SqlParam[]): Promise<RunResult> {
  const native = await this.ensureOpen();
  const outcome = await native.run(translateParams(sql), flattenParams(params) as DuckDBValue[]);
  const changes = Number(outcome.rowsChanged);
  return { changes };
}
|
|
68
98
|
|
|
69
|
-
/**
|
|
99
|
+
/**
 * Drop the underlying DuckDB handles so other processes can claim the file
 * lock, while keeping this wrapper usable — the next query reopens.
 * A no-op on a closed wrapper, and harmless when nothing is currently open.
 */
async release(): Promise<void> {
  if (this.closed) return;
  const pendingOpen = this.opening;
  if (pendingOpen) {
    // Let an in-flight open settle first so no stray instance is left behind.
    try {
      await pendingOpen;
    } catch {
      // A failed open already cleared its own state; nothing to dispose.
      return;
    }
  }
  this.disposeHandles();
}
|
|
117
|
+
|
|
118
|
+
/**
 * Close the wrapper for good: the `closed` flag is set before anything else,
 * so every later query throws. Calling it again is a no-op.
 */
async close(): Promise<void> {
  if (this.closed) return;
  this.closed = true;
  const pendingOpen = this.opening;
  if (pendingOpen) {
    // Wait for an in-flight open so the handles it creates get disposed below.
    try {
      await pendingOpen;
    } catch {
      // The open failed and cleaned up after itself.
      return;
    }
  }
  this.disposeHandles();
}
|
|
131
|
+
|
|
132
|
+
/**
 * Synchronously tear down whatever native handles are currently held:
 * the connection first, then the instance. Errors from either teardown call
 * are deliberately ignored (best effort), and both fields end up `null`.
 */
private disposeHandles(): void {
  const liveConn = this.conn;
  if (liveConn) {
    try {
      liveConn.disconnectSync();
    } catch {
      // best effort
    }
    this.conn = null;
  }

  const liveInstance = this.instance;
  if (liveInstance) {
    try {
      liveInstance.closeSync();
    } catch {
      // best effort
    }
    this.instance = null;
  }
}
|
|
150
|
+
|
|
151
|
+
/**
 * Return the live native connection, opening DuckDB lazily if needed.
 *
 * Concurrent callers share a single in-flight open via `this.opening`,
 * which is reset to `null` once that open settles (success or failure).
 *
 * NOTE(review): `openOnce` assigns `this.conn` before migrations finish, so
 * a query arriving mid-migration takes the fast path below and may run
 * against a partially migrated schema — confirm whether that matters.
 *
 * @throws Error if the wrapper has been closed, including when `close()`
 *   was called while the open was still in flight.
 * @throws Error if the open settled without producing a connection.
 * @throws whatever `openOnce` rejects with.
 */
private async ensureOpen(): Promise<DuckDBNativeConnection> {
  if (this.closed) {
    throw new Error(`DbConnection at ${this.path} has been closed`);
  }
  if (this.conn) return this.conn;
  if (!this.opening) {
    this.opening = this.openOnce().finally(() => {
      this.opening = null;
    });
  }
  await this.opening;
  // close() may have flipped the flag while we were waiting. Its disposal runs
  // right after this continuation, so returning `this.conn` here would hand the
  // caller a connection that is about to be disconnected.
  if (this.closed) {
    throw new Error(`DbConnection at ${this.path} has been closed`);
  }
  if (!this.conn) {
    throw new Error(`DbConnection at ${this.path} failed to open`);
  }
  return this.conn;
}
|
|
167
|
+
|
|
168
|
+
/**
 * Perform one full open: create the DuckDB instance (retrying on lock
 * conflicts), connect, publish both handles on `this`, then run migrations.
 *
 * The handles are assigned before `applyMigrations(this)` so the migration
 * code can issue queries through this wrapper without re-entering the open.
 *
 * On any failure after the instance exists, both the connection (if one was
 * obtained) and the instance are torn down — in the same order
 * `disposeHandles` uses — and the original error is re-thrown.
 */
private async openOnce(): Promise<void> {
  const instance = await createInstanceWithRetry(this.path, this.retry);
  let conn: DuckDBNativeConnection | null = null;
  try {
    conn = await instance.connect();
    this.instance = instance;
    this.conn = conn;
    await applyMigrations(this);
  } catch (err) {
    // Release the lock immediately. Disconnect the connection as well as
    // closing the instance: a still-attached connection may keep the database
    // (and its file lock) alive.
    if (conn) {
      try {
        conn.disconnectSync();
      } catch {
        // best effort
      }
    }
    try {
      instance.closeSync();
    } catch {
      // best effort
    }
    this.instance = null;
    this.conn = null;
    throw err;
  }
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/**
 * True if the error message looks like DuckDB's lock-conflict shape.
 *
 * The message is taken from `err.message` whenever that is a string — this
 * covers `Error` instances as well as plain objects that merely carry a
 * `message` — and otherwise from `String(err)`. `null`/`undefined` are
 * treated as an empty message and therefore never match.
 *
 * @param err - Anything caught from a failed open attempt.
 * @returns Whether the message matches one of the known lock-conflict phrases
 *   (case-insensitive).
 */
export function isLockConflictError(err: unknown): boolean {
  // Property access is safe on every non-nullish value; `?.` covers null/undefined.
  const carried = (err as { message?: unknown } | null | undefined)?.message;
  const msg = typeof carried === "string" ? carried : String(err ?? "");
  return /could not set lock on file|conflicting lock|database is locked/i.test(msg);
}
|
|
194
|
+
|
|
195
|
+
/** Resolve after roughly `ms` milliseconds (thin promise wrapper over `setTimeout`). */
function delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
199
|
+
|
|
200
|
+
/**
 * Run an open-the-DB factory with exponential backoff + jitter when the file
 * lock is held by another process.
 *
 * - Non-lock errors are re-thrown immediately (wrapped via `asHelpful`); only
 *   lock conflicts are retried.
 * - Between attempts the wait doubles from `baseDelayMs`, is capped at
 *   `maxDelayMs`, and carries a random jitter below `baseDelayMs`. The final
 *   wait never exceeds `maxDelayMs`: once the cap is reached the jitter is
 *   subtracted instead of added, so waits stay spread out under the cap.
 * - After the last attempt fails, a helpful error naming the
 *   concurrent-process problem is thrown.
 *
 * Exposed (rather than inlined) so tests can verify the retry behavior with a
 * fake factory.
 *
 * @param factory - Produces the resource; invoked once per attempt.
 * @param path - Database path, used only in log and error text.
 * @param retry - Attempt count and delay bounds.
 * @returns Whatever `factory` resolves with on its first success.
 * @throws the `asHelpful`-wrapped error described above.
 */
export async function withLockRetry<T>(
  factory: () => Promise<T>,
  path: string,
  retry: LockRetryOptions = DEFAULT_LOCK_RETRY,
): Promise<T> {
  let lastErr: unknown;
  for (let attempt = 1; attempt <= retry.maxAttempts; attempt++) {
    try {
      return await factory();
    } catch (err) {
      lastErr = err;
      if (!isLockConflictError(err)) {
        throw asHelpful(
          err,
          `while opening DuckDB at ${path}`,
          `Check that ${path} is writable and not held open by another process. Delete the file to start fresh.`,
          "internal_error",
        );
      }
      if (attempt === retry.maxAttempts) break;
      const backoff = Math.min(retry.maxDelayMs, retry.baseDelayMs * 2 ** (attempt - 1));
      const jitter = Math.floor(Math.random() * Math.min(retry.baseDelayMs, backoff));
      // jitter < backoff <= maxDelayMs, so the subtracted form stays positive.
      const wait = backoff + jitter > retry.maxDelayMs ? retry.maxDelayMs - jitter : backoff + jitter;
      logger.debug(`db: lock held on ${path}, retrying in ${wait}ms (attempt ${attempt}/${retry.maxAttempts})`);
      await delay(wait);
    }
  }
  throw asHelpful(
    // With maxAttempts < 1 the loop never runs; don't wrap `undefined`.
    lastErr ?? new Error(`no open attempts were made (maxAttempts=${retry.maxAttempts})`),
    `while opening DuckDB at ${path} after ${retry.maxAttempts} attempts`,
    `Another process is holding the database lock. Stop the conflicting process (check for a running 'membot serve' or open DuckDB CLI session) or delete ${path} to start fresh.`,
    "internal_error",
  );
}
|
|
242
|
+
|
|
243
|
+
/**
 * Create a `DuckDBInstance` for `path`, delegating to `withLockRetry` so that
 * lock conflicts are retried with backoff.
 */
export function createInstanceWithRetry(
  path: string,
  retry: LockRetryOptions = DEFAULT_LOCK_RETRY,
): Promise<DuckDBInstance> {
  const openInstance = (): Promise<DuckDBInstance> => DuckDBInstance.create(path);
  return withLockRetry(openInstance, path, retry);
}
|
|
83
250
|
|
|
84
251
|
/** Type guard for the JS values DuckDB returns directly without further coercion. */
|
|
@@ -149,25 +316,17 @@ function flattenParams(params: SqlParam[]): unknown[] {
|
|
|
149
316
|
}
|
|
150
317
|
|
|
151
318
|
/**
 * Build a `DbConnection` for the given file path and probe it right away.
 *
 * The wrapper itself opens lazily, but this function issues a `SELECT 1`
 * before returning so that initial open errors (lock conflict, bad path,
 * migration failure) surface here rather than at the first query in user
 * code. The connection is returned without being released.
 *
 * @param path - Location of the DuckDB database file.
 * @param retry - Lock-retry settings handed to the connection.
 * @returns The probed connection.
 */
export async function openDb(path: string, retry: LockRetryOptions = DEFAULT_LOCK_RETRY): Promise<DbConnection> {
  const connection = new DbConnection(path, retry);
  await connection.exec("SELECT 1");
  return connection;
}
|
|
172
331
|
|
|
173
332
|
export { EMBEDDING_DIMENSION };
|
package/src/db/migrations.ts
CHANGED
|
@@ -16,12 +16,31 @@ export interface Migration {
|
|
|
16
16
|
|
|
17
17
|
const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002];
|
|
18
18
|
|
|
19
|
+
/**
 * Per-process record of DB paths whose migrations have already been applied
 * (or confirmed current). Lazy-claim connections call `applyMigrations` on
 * every reopen; consulting this set keeps the DDL/SELECT traffic and the
 * "migration: applied" log lines off that hot path.
 */
const checkedPaths = new Set<string>();

/**
 * Drop entries from the per-process migration cache so a fresh process can be
 * simulated. Test-only — production code never calls this.
 *
 * @param path - Forget only this path; omit to forget every path.
 */
export function forgetMigrations(path?: string): void {
  if (path !== undefined) {
    checkedPaths.delete(path);
    return;
  }
  checkedPaths.clear();
}
|
|
33
|
+
|
|
19
34
|
/**
|
|
20
35
|
* Apply every unapplied migration in id order. Tracks applied ids in
|
|
21
36
|
* `_migrations`. Each successful run is logged via the shared logger so a
|
|
22
|
-
* user upgrading membot can see exactly what changed in their store.
|
|
37
|
+
* user upgrading membot can see exactly what changed in their store. The
|
|
38
|
+
* first call for a given DB path checks the table; subsequent calls in the
|
|
39
|
+
* same process short-circuit via `checkedPaths`.
|
|
23
40
|
*/
|
|
24
41
|
export async function applyMigrations(db: DbConnection): Promise<void> {
|
|
42
|
+
if (checkedPaths.has(db.path)) return;
|
|
43
|
+
|
|
25
44
|
await db.exec(`CREATE TABLE IF NOT EXISTS _migrations (
|
|
26
45
|
id INTEGER PRIMARY KEY,
|
|
27
46
|
name TEXT NOT NULL,
|
|
@@ -42,4 +61,6 @@ export async function applyMigrations(db: DbConnection): Promise<void> {
|
|
|
42
61
|
await db.queryRun(`INSERT INTO _migrations(id, name) VALUES (?1, ?2)`, migration.id, migration.name);
|
|
43
62
|
logger.info(`migration: applied ${String(migration.id).padStart(3, "0")}-${migration.name}`);
|
|
44
63
|
}
|
|
64
|
+
|
|
65
|
+
checkedPaths.add(db.path);
|
|
45
66
|
}
|
package/src/ingest/ingest.ts
CHANGED
|
@@ -44,12 +44,36 @@ export interface IngestResult {
|
|
|
44
44
|
failed: number;
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
+
/**
 * Optional per-entry hooks fired while a resolved source is ingested. They
 * let an orchestrator such as `add` drive one shared progress reporter across
 * many sources without resolving anything twice.
 */
export interface IngestCallbacks {
  /** Fires before the pipeline touches an entry; receives a display label for it. */
  onEntryStart?: (label: string) => void;
  /** Fires once the entry's result (ok / unchanged / failed) is known. */
  onEntryComplete?: (entry: IngestEntryResult) => void;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Number of per-entry results a `ResolvedSource` will yield: one per entry
 * for a local-files source, and exactly one for any other kind. `add` uses
 * this to size a shared progress bar before ingestion begins.
 */
export function countResolvedEntries(resolved: ResolvedSource): number {
  return resolved.kind === "local-files" ? resolved.entries.length : 1;
}
|
|
67
|
+
|
|
47
68
|
/**
|
|
48
69
|
* Top-level ingest orchestrator. Resolves the source arg, dispatches to the
|
|
49
70
|
* right reader (local / remote / inline), runs the pipeline (convert →
|
|
50
71
|
* describe → chunk → embed → write), and returns one entry per matched
|
|
51
72
|
* file. Partial failures are reported per-entry; the entire call doesn't
|
|
52
|
-
* abort because one URL or PDF is bad.
|
|
73
|
+
* abort because one URL or PDF is bad. Drives `ctx.progress` itself, so
|
|
74
|
+
* single-source SDK callers get a usable indicator out of the box. When
|
|
75
|
+
* orchestrating many sources at once (e.g. `add`), call `resolveSource` +
|
|
76
|
+
* `ingestResolved` directly so one shared progress spans every entry.
|
|
53
77
|
*/
|
|
54
78
|
export async function ingest(input: IngestInput, ctx: AppContext): Promise<IngestResult> {
|
|
55
79
|
const resolved = await resolveSource(input.source, {
|
|
@@ -57,17 +81,40 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
|
|
|
57
81
|
exclude: input.exclude,
|
|
58
82
|
followSymlinks: input.follow_symlinks ?? true,
|
|
59
83
|
});
|
|
84
|
+
const total = countResolvedEntries(resolved);
|
|
85
|
+
ctx.progress.start(total, "ingest");
|
|
86
|
+
const callbacks: IngestCallbacks = {
|
|
87
|
+
onEntryStart: (label) => ctx.progress.tick(label),
|
|
88
|
+
};
|
|
89
|
+
const result = await ingestResolved(resolved, input, ctx, callbacks);
|
|
90
|
+
const okCount = result.ok;
|
|
91
|
+
const unchangedSuffix = result.unchanged > 0 ? ` (${result.unchanged} unchanged)` : "";
|
|
92
|
+
ctx.progress.done(`ingested ${okCount}/${result.total}${unchangedSuffix}`);
|
|
93
|
+
return result;
|
|
94
|
+
}
|
|
60
95
|
|
|
96
|
+
/**
 * Ingest a source that has already been resolved. Equivalent to `ingest`
 * minus the resolve step, with progress reporting left to the caller through
 * `callbacks`. Multi-source orchestrators (`add`) enter here so a single
 * progress bar can span every entry of every source.
 *
 * @param resolved - Output of `resolveSource`; its `kind` selects the reader.
 * @param input - Original ingest request (refresh frequency, force flag, ...).
 * @param ctx - Application context passed through to the reader.
 * @param callbacks - Optional per-entry hooks.
 */
export async function ingestResolved(
  resolved: ResolvedSource,
  input: IngestInput,
  ctx: AppContext,
  callbacks?: IngestCallbacks,
): Promise<IngestResult> {
  const refreshSec = parseDuration(input.refresh_frequency);
  const force = input.force === true;

  switch (resolved.kind) {
    case "inline":
      // Inline text has no upstream to compare against, so `force` is not passed.
      return ingestInline(resolved.text, input, ctx, refreshSec, callbacks);
    case "url":
      return ingestUrl(resolved.url, input, ctx, refreshSec, force, callbacks);
    default:
      return ingestLocalFiles(resolved, input, ctx, refreshSec, force, callbacks);
  }
}
|
|
72
119
|
|
|
73
120
|
/** Ingest a single inline blob (source_type='inline'). */
|
|
@@ -76,8 +123,10 @@ async function ingestInline(
|
|
|
76
123
|
input: IngestInput,
|
|
77
124
|
ctx: AppContext,
|
|
78
125
|
refreshSec: number | null,
|
|
126
|
+
callbacks?: IngestCallbacks,
|
|
79
127
|
): Promise<IngestResult> {
|
|
80
128
|
const logicalPath = input.logical_path ?? defaultInlinePath();
|
|
129
|
+
callbacks?.onEntryStart?.(logicalPath);
|
|
81
130
|
const bytes = new TextEncoder().encode(text);
|
|
82
131
|
const sha = sha256Hex(bytes);
|
|
83
132
|
const result: IngestEntryResult = {
|
|
@@ -113,6 +162,7 @@ async function ingestInline(
|
|
|
113
162
|
result.status = "failed";
|
|
114
163
|
result.error = errorMessage(err);
|
|
115
164
|
}
|
|
165
|
+
callbacks?.onEntryComplete?.(result);
|
|
116
166
|
return summarize([result]);
|
|
117
167
|
}
|
|
118
168
|
|
|
@@ -123,6 +173,7 @@ async function ingestUrl(
|
|
|
123
173
|
ctx: AppContext,
|
|
124
174
|
refreshSec: number | null,
|
|
125
175
|
force: boolean,
|
|
176
|
+
callbacks?: IngestCallbacks,
|
|
126
177
|
): Promise<IngestResult> {
|
|
127
178
|
const mcpxAdapter = ctx.mcpx
|
|
128
179
|
? {
|
|
@@ -137,6 +188,7 @@ async function ingestUrl(
|
|
|
137
188
|
: null;
|
|
138
189
|
|
|
139
190
|
const logicalPath = input.logical_path ?? defaultLogicalForUrl(url);
|
|
191
|
+
callbacks?.onEntryStart?.(url);
|
|
140
192
|
const result: IngestEntryResult = {
|
|
141
193
|
source_path: url,
|
|
142
194
|
logical_path: logicalPath,
|
|
@@ -160,6 +212,7 @@ async function ingestUrl(
|
|
|
160
212
|
if (cur && cur.source_sha256 === fetched.sha256) {
|
|
161
213
|
result.status = "unchanged";
|
|
162
214
|
result.version_id = cur.version_id;
|
|
215
|
+
callbacks?.onEntryComplete?.(result);
|
|
163
216
|
return summarize([result]);
|
|
164
217
|
}
|
|
165
218
|
}
|
|
@@ -185,6 +238,7 @@ async function ingestUrl(
|
|
|
185
238
|
result.status = "failed";
|
|
186
239
|
result.error = errorMessage(err);
|
|
187
240
|
}
|
|
241
|
+
callbacks?.onEntryComplete?.(result);
|
|
188
242
|
return summarize([result]);
|
|
189
243
|
}
|
|
190
244
|
|
|
@@ -195,8 +249,16 @@ async function ingestLocalFiles(
|
|
|
195
249
|
ctx: AppContext,
|
|
196
250
|
refreshSec: number | null,
|
|
197
251
|
force: boolean,
|
|
252
|
+
callbacks?: IngestCallbacks,
|
|
198
253
|
): Promise<IngestResult> {
|
|
199
254
|
if (resolved.entries.length === 0) {
|
|
255
|
+
// `filtered: true` means the source resolved successfully but every
|
|
256
|
+
// entry was dropped by --exclude / --include / DEFAULT_EXCLUDES.
|
|
257
|
+
// Treat that as a silent no-op: shell-expanded globs commonly hand
|
|
258
|
+
// us individual files we should skip without aborting the batch.
|
|
259
|
+
if (resolved.filtered) {
|
|
260
|
+
return { ingested: [], total: 0, ok: 0, unchanged: 0, failed: 0 };
|
|
261
|
+
}
|
|
200
262
|
throw new HelpfulError({
|
|
201
263
|
kind: "input_error",
|
|
202
264
|
message: `Glob/path matched 0 files`,
|
|
@@ -205,11 +267,10 @@ async function ingestLocalFiles(
|
|
|
205
267
|
}
|
|
206
268
|
|
|
207
269
|
const results: IngestEntryResult[] = [];
|
|
208
|
-
ctx.progress.start(resolved.entries.length, "ingest");
|
|
209
270
|
const isMulti = resolved.entries.length > 1;
|
|
210
271
|
|
|
211
272
|
for (const entry of resolved.entries) {
|
|
212
|
-
|
|
273
|
+
callbacks?.onEntryStart?.(entry.relPathFromBase);
|
|
213
274
|
const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
|
|
214
275
|
const result: IngestEntryResult = {
|
|
215
276
|
source_path: entry.absPath,
|
|
@@ -233,6 +294,7 @@ async function ingestLocalFiles(
|
|
|
233
294
|
result.status = "unchanged";
|
|
234
295
|
result.version_id = cur.version_id;
|
|
235
296
|
results.push(result);
|
|
297
|
+
callbacks?.onEntryComplete?.(result);
|
|
236
298
|
continue;
|
|
237
299
|
}
|
|
238
300
|
}
|
|
@@ -257,13 +319,15 @@ async function ingestLocalFiles(
|
|
|
257
319
|
} catch (err) {
|
|
258
320
|
result.status = "failed";
|
|
259
321
|
result.error = errorMessage(err);
|
|
322
|
+
} finally {
|
|
323
|
+
// Release the DB lock between files in a directory/glob walk so
|
|
324
|
+
// concurrent processes can wedge in mid-batch. The next entry's
|
|
325
|
+
// first DB call reopens (cheap — same-process reopen).
|
|
326
|
+
await ctx.db.release();
|
|
260
327
|
}
|
|
261
328
|
results.push(result);
|
|
329
|
+
callbacks?.onEntryComplete?.(result);
|
|
262
330
|
}
|
|
263
|
-
const okCount = results.filter((r) => r.status === "ok").length;
|
|
264
|
-
const unchangedCount = results.filter((r) => r.status === "unchanged").length;
|
|
265
|
-
const suffix = unchangedCount > 0 ? ` (${unchangedCount} unchanged)` : "";
|
|
266
|
-
ctx.progress.done(`ingested ${okCount}/${results.length}${suffix}`);
|
|
267
331
|
|
|
268
332
|
return summarize(results);
|
|
269
333
|
}
|
|
@@ -1,12 +1,26 @@
|
|
|
1
1
|
import { realpath, stat } from "node:fs/promises";
|
|
2
|
+
import { homedir } from "node:os";
|
|
2
3
|
import { isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
3
4
|
import picomatch from "picomatch";
|
|
4
5
|
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
5
6
|
|
|
7
|
+
/**
 * Replace a leading `~` or `~/` with the user's home directory.
 *
 * An unquoted shell argument arrives already expanded, but a quoted one
 * (`bun dev add "~/foo/*.md"`) reaches us with the literal `~`, which
 * `path.resolve` would treat as an ordinary directory name. Anything that
 * does not start with `~` followed by a separator is returned untouched.
 */
function expandHome(p: string): string {
  if (p === "~") return homedir();
  const startsInHome = p.startsWith("~/") || p.startsWith(`~${sep}`);
  // Drop the two-character "~<sep>" prefix before joining onto the home dir.
  return startsInHome ? join(homedir(), p.slice(2)) : p;
}
|
|
19
|
+
|
|
6
20
|
export type ResolvedSource =
|
|
7
21
|
| { kind: "inline"; text: string; logicalHint: string | null }
|
|
8
22
|
| { kind: "url"; url: string; logicalHint: string | null }
|
|
9
|
-
| { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string };
|
|
23
|
+
| { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string; filtered?: boolean };
|
|
10
24
|
|
|
11
25
|
export interface ResolvedLocalEntry {
|
|
12
26
|
/** Absolute filesystem path (post-realpath). */
|
|
@@ -28,6 +42,45 @@ export interface ResolveOptions {
|
|
|
28
42
|
|
|
29
43
|
const DEFAULT_EXCLUDES = ["**/node_modules/**", "**/.git/**", "**/.DS_Store", "**/dist/**", "**/.cache/**"];
|
|
30
44
|
|
|
45
|
+
/**
 * Turn one user-supplied include/exclude pattern into a small list of
 * gitignore-like equivalents, so the common spellings all behave intuitively.
 * The original pattern is always first in the result; variants follow in the
 * order they are derived, without duplicates.
 *
 * - A path-like pattern (contains `/`) that is not anchored also gets a
 *   `**`-prefixed any-depth variant.
 * - A trailing `/*` (but not `/**` + `/*`) or a trailing `/` additionally
 *   matches the whole subtree below that directory.
 * - A bare name with no slash and no glob characters (`node_modules`, `dist`)
 *   matches that name, and everything under it, anywhere in the tree.
 * - Bare globs such as `*.md` are returned unchanged.
 *
 * Patterns starting with `**` + `/`, `/`, or `./` count as anchored and never
 * receive an any-depth variant.
 */
export function expandUserPattern(p: string): string[] {
  const variants = new Set<string>([p]);
  const isAnchored = ["**/", "/", "./"].some((prefix) => p.startsWith(prefix));
  const containsSlash = p.includes("/");
  const containsGlob = /[*?[\]{}!]/.test(p);

  if (containsSlash && !isAnchored) variants.add(`**/${p}`);

  // The two subtree spellings are mutually exclusive, so resolve them to a
  // single directory base up front.
  let subtreeBase: string | null = null;
  if (p.endsWith("/*") && !p.endsWith("/**/*")) {
    subtreeBase = p.slice(0, -2);
  } else if (p.endsWith("/")) {
    subtreeBase = p.slice(0, -1);
  }
  if (subtreeBase !== null) {
    variants.add(`${subtreeBase}/**`);
    if (!isAnchored) variants.add(`**/${subtreeBase}/**`);
  }

  if (!containsSlash && !containsGlob) {
    variants.add(`**/${p}`);
    variants.add(`**/${p}/**`);
  }

  return Array.from(variants);
}
|
|
83
|
+
|
|
31
84
|
/**
|
|
32
85
|
* Polymorphic source-arg expander. Accepts:
|
|
33
86
|
* - "inline:<text>" → inline literal
|
|
@@ -48,20 +101,28 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
|
|
|
48
101
|
return { kind: "url", url: source, logicalHint: null };
|
|
49
102
|
}
|
|
50
103
|
|
|
104
|
+
source = expandHome(source);
|
|
105
|
+
|
|
51
106
|
const followSymlinks = options.followSymlinks !== false;
|
|
52
|
-
const
|
|
107
|
+
const userIncludesRaw = options.include
|
|
53
108
|
? options.include
|
|
54
109
|
.split(",")
|
|
55
110
|
.map((g) => g.trim())
|
|
56
111
|
.filter(Boolean)
|
|
57
112
|
: [];
|
|
58
|
-
const
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
];
|
|
113
|
+
const userExcludesRaw = (options.exclude ?? "")
|
|
114
|
+
.split(",")
|
|
115
|
+
.map((g) => g.trim())
|
|
116
|
+
.filter(Boolean);
|
|
117
|
+
const userIncludesExpanded = userIncludesRaw.flatMap(expandUserPattern);
|
|
118
|
+
const userExcludesExpanded = userExcludesRaw.flatMap(expandUserPattern);
|
|
119
|
+
const excludeMatchers = [...DEFAULT_EXCLUDES, ...userExcludesExpanded];
|
|
120
|
+
// Single-file matchers run against the absolute path so shell-expanded
|
|
121
|
+
// globs (where each file lands here individually) still honor excludes.
|
|
122
|
+
const isExcludeAbs = picomatch(excludeMatchers, { dot: false });
|
|
123
|
+
const isIncludeAbs = userIncludesExpanded.length
|
|
124
|
+
? picomatch(userIncludesExpanded, { dot: false, nocase: false })
|
|
125
|
+
: null;
|
|
65
126
|
|
|
66
127
|
if (isGlob(source)) {
|
|
67
128
|
const base = globBase(source);
|
|
@@ -71,7 +132,7 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
|
|
|
71
132
|
// Source glob acts as a hard filter; user includes (if any) further
|
|
72
133
|
// narrow the result via AND. Pass them as a separate matcher so the
|
|
73
134
|
// two sets aren't picomatch-OR'd together.
|
|
74
|
-
const extraIncludes =
|
|
135
|
+
const extraIncludes = userIncludesExpanded.length > 0 ? [userIncludesExpanded] : [];
|
|
75
136
|
return walk(realBase, [remainder], excludeMatchers, followSymlinks, extraIncludes);
|
|
76
137
|
} catch (err) {
|
|
77
138
|
throw asHelpful(
|
|
@@ -98,6 +159,16 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
|
|
|
98
159
|
|
|
99
160
|
if (st.isFile()) {
|
|
100
161
|
const real = await realpath(abs);
|
|
162
|
+
// Shell-expanded globs (e.g. zsh expanding `~/foo/**/*.md`) deliver
|
|
163
|
+
// each match here individually, so this branch must enforce both
|
|
164
|
+
// DEFAULT_EXCLUDES and the user's own --include/--exclude. Otherwise
|
|
165
|
+
// `node_modules` paths slip through whenever the shell expanded for us.
|
|
166
|
+
if (isExcludeAbs(real)) {
|
|
167
|
+
return { kind: "local-files", basePath: real, entries: [], filtered: true };
|
|
168
|
+
}
|
|
169
|
+
if (isIncludeAbs && !isIncludeAbs(real)) {
|
|
170
|
+
return { kind: "local-files", basePath: real, entries: [], filtered: true };
|
|
171
|
+
}
|
|
101
172
|
return {
|
|
102
173
|
kind: "local-files",
|
|
103
174
|
basePath: real,
|
|
@@ -107,7 +178,7 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
|
|
|
107
178
|
|
|
108
179
|
if (st.isDirectory()) {
|
|
109
180
|
const realBase = await realpath(abs);
|
|
110
|
-
const dirIncludes =
|
|
181
|
+
const dirIncludes = userIncludesExpanded.length > 0 ? userIncludesExpanded : ["**/*"];
|
|
111
182
|
return walk(realBase, dirIncludes, excludeMatchers, followSymlinks);
|
|
112
183
|
}
|
|
113
184
|
|
|
@@ -170,6 +241,14 @@ async function walk(
|
|
|
170
241
|
const isInclude = picomatch(includes, { dot: false, nocase: false });
|
|
171
242
|
const extraMatchers = extraIncludeSets.map((set) => picomatch(set, { dot: false, nocase: false }));
|
|
172
243
|
const isExclude = excludes.length ? picomatch(excludes, { dot: false }) : null;
|
|
244
|
+
// Directory-prune patterns: derived from excludes by stripping a trailing
|
|
245
|
+
// `/**` or `/*`. Without this we descend into massive subtrees (e.g.
|
|
246
|
+
// every `node_modules/` under a workspace) before discarding files one
|
|
247
|
+
// by one — which on real machines presents as a hang.
|
|
248
|
+
const dirPrunePatterns = excludes
|
|
249
|
+
.map((p) => (p.endsWith("/**") ? p.slice(0, -3) : p.endsWith("/*") ? p.slice(0, -2) : p))
|
|
250
|
+
.filter((p) => p.length > 0);
|
|
251
|
+
const isExcludeDir = dirPrunePatterns.length ? picomatch(dirPrunePatterns, { dot: false }) : null;
|
|
173
252
|
|
|
174
253
|
const queue: string[] = [base];
|
|
175
254
|
while (queue.length > 0) {
|
|
@@ -191,6 +270,8 @@ async function walk(
|
|
|
191
270
|
}
|
|
192
271
|
if (st.isSymbolicLink() && !followSymlinks) continue;
|
|
193
272
|
if (st.isDirectory()) {
|
|
273
|
+
const rel = relative(base, real);
|
|
274
|
+
if (rel.length > 0 && isExcludeDir?.(rel)) continue;
|
|
194
275
|
let names: string[];
|
|
195
276
|
try {
|
|
196
277
|
names = await readdir(real);
|
package/src/mount/mcp.ts
CHANGED
|
@@ -48,6 +48,14 @@ export function mountAsMcpTool<I extends z.ZodObject, O extends z.ZodTypeAny>(
|
|
|
48
48
|
};
|
|
49
49
|
} catch (err) {
|
|
50
50
|
return renderMcpError(err);
|
|
51
|
+
} finally {
|
|
52
|
+
// Drop the DuckDB lock between MCP tool calls so concurrent CLI
|
|
53
|
+
// or daemon callers can claim it. The next tool call reopens.
|
|
54
|
+
try {
|
|
55
|
+
await ctx.db.release();
|
|
56
|
+
} catch {
|
|
57
|
+
// best effort — never let release failures mask a tool result
|
|
58
|
+
}
|
|
51
59
|
}
|
|
52
60
|
},
|
|
53
61
|
);
|
package/src/operations/add.ts
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
-
import {
|
|
2
|
+
import {
|
|
3
|
+
countResolvedEntries,
|
|
4
|
+
type IngestCallbacks,
|
|
5
|
+
type IngestEntryResult,
|
|
6
|
+
type IngestResult,
|
|
7
|
+
ingestResolved,
|
|
8
|
+
} from "../ingest/ingest.ts";
|
|
9
|
+
import { type ResolvedSource, resolveSource } from "../ingest/source-resolver.ts";
|
|
3
10
|
import { colors } from "../output/formatter.ts";
|
|
4
11
|
import { defineOperation } from "./types.ts";
|
|
5
12
|
|
|
@@ -97,21 +104,122 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
97
104
|
},
|
|
98
105
|
handler: async (input, ctx) => {
  const { sources, ...rest } = input;
  const followSymlinks = rest.follow_symlinks ?? true;

  // Phase 1: resolve every source upfront so the shared progress bar
  // knows its total. A resolve failure (bad path, glob with no base) is
  // captured per-source so one bad arg doesn't abort the whole batch.
  type ResolveOutcome = { source: string; resolved: ResolvedSource } | { source: string; error: Error };
  const outcomes: ResolveOutcome[] = [];
  for (const source of sources) {
    try {
      const resolved = await resolveSource(source, {
        include: rest.include,
        exclude: rest.exclude,
        followSymlinks,
      });
      outcomes.push({ source, resolved });
    } catch (err) {
      outcomes.push({ source, error: err instanceof Error ? err : new Error(String(err)) });
    }
  }

  // A source that failed to resolve still occupies one slot on the bar.
  const total = outcomes.reduce((n, o) => ("error" in o ? n + 1 : n + countResolvedEntries(o.resolved)), 0);

  const aggregated: IngestResult = {
    ingested: [],
    total: 0,
    ok: 0,
    unchanged: 0,
    failed: 0,
  };

  ctx.progress.start(total, "ingest");
  const callbacks: IngestCallbacks = {
    onEntryStart: (label) => ctx.progress.tick(label),
    onEntryComplete: (entry) => ctx.progress.entry(formatEntryLine(entry)),
  };

  // Record a whole-source failure: report it through the same callbacks a
  // real entry would use, then fold it into the aggregate counters.
  const recordFailure = (source: string, message: string): void => {
    const failed: IngestEntryResult = {
      source_path: source,
      logical_path: source,
      version_id: null,
      status: "failed",
      error: message,
      mime_type: null,
      size_bytes: 0,
      fetcher: "local",
      source_sha256: "",
    };
    callbacks.onEntryStart?.(source);
    callbacks.onEntryComplete?.(failed);
    aggregated.ingested.push(failed);
    aggregated.total += 1;
    aggregated.failed += 1;
  };

  // Phase 2: ingest each resolved source, accumulating per-entry results.
  for (const outcome of outcomes) {
    if ("error" in outcome) {
      recordFailure(outcome.source, outcome.error.message);
      continue;
    }

    try {
      const r = await ingestResolved(outcome.resolved, { ...rest, source: outcome.source }, ctx, callbacks);
      aggregated.ingested.push(...r.ingested);
      aggregated.total += r.total;
      aggregated.ok += r.ok;
      aggregated.unchanged += r.unchanged;
      aggregated.failed += r.failed;
    } catch (err) {
      recordFailure(outcome.source, err instanceof Error ? err.message : String(err));
    } finally {
      // Release the DB lock between sources so other consumers (a
      // concurrent CLI call, the daemon, or a separate MCP server)
      // can wedge in. The next source's first DB call reopens.
      try {
        await ctx.db.release();
      } catch {
        // Best effort: a failed release must not abort the remaining
        // sources, discard the results gathered so far, or leave the
        // progress reporter without its final done() call.
      }
    }
  }

  const summary = formatSummary(aggregated);
  ctx.progress.done(summary);
  return aggregated;
},
|
|
117
201
|
});
|
|
202
|
+
|
|
203
|
+
/**
 * Build the persistent stderr line for a single finished entry. The glyphs
 * mirror those of the final `console_formatter`, so the same status
 * indicators appear during ingest (stderr) and in the closing stdout summary.
 *
 * @param entry - Result record for one ingested (or failed) entry.
 * @returns A colorized one-line status string.
 */
function formatEntryLine(entry: IngestEntryResult): string {
  switch (entry.status) {
    case "ok": {
      const glyph = colors.green("✓");
      const path = colors.cyan(entry.logical_path);
      const detail = colors.dim(`(${entry.fetcher}, ${entry.size_bytes}B)`);
      return `${glyph} ${path} ${detail}`;
    }
    case "unchanged": {
      const glyph = colors.dim("≡");
      const path = colors.cyan(entry.logical_path);
      return `${glyph} ${path} ${colors.dim("(unchanged)")}`;
    }
    default: {
      // Any other status is rendered as a failure: show the source path
      // and whatever error text was recorded (empty when absent).
      const glyph = colors.red("✗");
      return `${glyph} ${entry.source_path} ${colors.dim(entry.error ?? "")}`;
    }
  }
}
|
|
218
|
+
|
|
219
|
+
/**
 * Build the one-line batch summary shown when the spinner finishes.
 *
 * Always starts with `added <ok>/<total>`; the unchanged and failed counts
 * are appended only when they are greater than zero.
 */
function formatSummary(r: IngestResult): string {
  const optional = [
    r.unchanged > 0 ? `${r.unchanged} unchanged` : null,
    r.failed > 0 ? `${r.failed} failed` : null,
  ].filter((segment): segment is string => segment !== null);
  return [`added ${r.ok}/${r.total}`, ...optional].join(", ");
}
|
package/src/operations/search.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
+
import { HelpfulError } from "../errors.ts";
|
|
2
3
|
import { embedSingle } from "../ingest/embedder.ts";
|
|
3
4
|
import { colors } from "../output/formatter.ts";
|
|
4
5
|
import { fuseRRF } from "../search/hybrid.ts";
|
|
@@ -26,9 +27,19 @@ export const searchOperation = defineOperation({
|
|
|
26
27
|
version_id: z.string(),
|
|
27
28
|
chunk_index: z.number(),
|
|
28
29
|
snippet: z.string(),
|
|
29
|
-
score: z
|
|
30
|
-
|
|
31
|
-
|
|
30
|
+
score: z
|
|
31
|
+
.number()
|
|
32
|
+
.describe(
|
|
33
|
+
"Normalized fusion score in [0,1]; 1.0 = chunk was top-1 on both semantic and keyword lists, ~0.5 = top-1 on one",
|
|
34
|
+
),
|
|
35
|
+
semantic_score: z
|
|
36
|
+
.number()
|
|
37
|
+
.nullable()
|
|
38
|
+
.describe("Cosine similarity from the semantic side (0-1), or null if not matched"),
|
|
39
|
+
keyword_score: z
|
|
40
|
+
.number()
|
|
41
|
+
.nullable()
|
|
42
|
+
.describe("Raw BM25 score from the keyword side (unbounded), or null if not matched"),
|
|
32
43
|
}),
|
|
33
44
|
),
|
|
34
45
|
mode: z.string(),
|
|
@@ -39,7 +50,10 @@ export const searchOperation = defineOperation({
|
|
|
39
50
|
return colors.dim(`(no hits in ${result.mode} mode)`);
|
|
40
51
|
}
|
|
41
52
|
const blocks = result.hits.map((h) => {
|
|
42
|
-
const
|
|
53
|
+
const parts = [`score=${h.score.toFixed(3)}`];
|
|
54
|
+
if (h.semantic_score !== null) parts.push(`sem=${h.semantic_score.toFixed(3)}`);
|
|
55
|
+
if (h.keyword_score !== null) parts.push(`bm25=${h.keyword_score.toFixed(2)}`);
|
|
56
|
+
const head = `${colors.cyan(h.logical_path)} ${colors.dim(`v=${h.version_id}`)} ${colors.green(parts.join(" "))}`;
|
|
43
57
|
const snippet = h.snippet
|
|
44
58
|
.split("\n")
|
|
45
59
|
.map((l) => ` ${l}`)
|
|
@@ -52,6 +66,14 @@ export const searchOperation = defineOperation({
|
|
|
52
66
|
const query = input.query ?? input.pattern ?? "";
|
|
53
67
|
const pattern = input.pattern ?? input.query ?? "";
|
|
54
68
|
|
|
69
|
+
if (!query.trim() && !pattern.trim()) {
|
|
70
|
+
throw new HelpfulError({
|
|
71
|
+
kind: "input_error",
|
|
72
|
+
message: "search requires a query or pattern",
|
|
73
|
+
hint: 'Pass a natural-language query (e.g. `membot search "oauth flow"`) or a keyword pattern (e.g. `membot search --pattern OAuth`).',
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
|
|
55
77
|
const semanticHits =
|
|
56
78
|
input.mode === "keyword" || !query.trim()
|
|
57
79
|
? []
|
package/src/output/progress.ts
CHANGED
|
@@ -2,21 +2,48 @@ import { logger } from "./logger.ts";
|
|
|
2
2
|
import { isSilent, useSpinner } from "./tty.ts";
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* entry
|
|
5
|
+
* Progress reporter for multi-entry operations (directory/glob ingest, batch
|
|
6
|
+
* refresh, multi-source `add`). Operations call `start(total)`, then for each
|
|
7
|
+
* entry call `tick(label)` (advances the in-flight indicator) and optionally
|
|
8
|
+
* `entry(line)` (writes a persistent stderr line that survives the spinner),
|
|
9
|
+
* then `done(summary)`.
|
|
8
10
|
*
|
|
9
|
-
* Interactive: replaces a single spinner line as work happens
|
|
10
|
-
*
|
|
11
|
+
* Interactive: replaces a single spinner line as work happens, with an ASCII
|
|
12
|
+
* bar like `[████░░░░░░] 4/15 (26%) — relative/path.md`.
|
|
13
|
+
* Non-interactive: emits `info` lines per `tick` and per `entry`.
|
|
11
14
|
*/
|
|
12
15
|
export interface Progress {
|
|
13
16
|
start(total: number, label?: string): void;
|
|
14
17
|
tick(label: string): void;
|
|
18
|
+
entry(line: string): void;
|
|
15
19
|
done(summary?: string): void;
|
|
16
20
|
fail(summary?: string): void;
|
|
17
21
|
info(msg: string): void;
|
|
18
22
|
}
|
|
19
23
|
|
|
24
|
+
const BAR_WIDTH = 20;
|
|
25
|
+
const LABEL_MAX = 60;
|
|
26
|
+
|
|
27
|
+
/**
 * Draw a fixed-width text progress bar such as `[████░░░░░░]`, using
 * block-drawing characters so it sits naturally beside other unicode glyphs.
 *
 * @param count - Units completed so far.
 * @param total - Units expected; a value of zero or less yields an empty bar.
 * @param width - Number of cells between the brackets.
 * @returns The bracketed bar, with `count / total` clamped to [0, 1].
 */
export function renderBar(count: number, total: number, width = BAR_WIDTH): string {
  const draw = (filledCells: number): string =>
    "[" + "█".repeat(filledCells) + "░".repeat(width - filledCells) + "]";

  if (total <= 0) return draw(0);

  // Clamp so an over- or under-shooting count cannot overflow the bar.
  const fraction = Math.min(1, Math.max(0, count / total));
  return draw(Math.round(fraction * width));
}
|
|
37
|
+
|
|
38
|
+
/**
 * Shorten a label by cutting from the left, so the most specific tail of a
 * long path remains visible and the spinner line stays on one terminal row.
 *
 * @param label - Text to display.
 * @param max - Maximum length of the result, including the leading ellipsis.
 * @returns `label` unchanged when it fits, otherwise `…` plus its tail.
 */
function truncateLabel(label: string, max = LABEL_MAX): string {
  const overflow = label.length - max;
  if (overflow <= 0) return label;
  // Drop one extra character to make room for the ellipsis itself.
  return "…" + label.slice(overflow + 1);
}
|
|
46
|
+
|
|
20
47
|
/**
|
|
21
48
|
* Build a `Progress` reporter whose mode is decided once, at call time, from
|
|
22
49
|
* the current TTY state. Use one per multi-entry operation.
|
|
@@ -29,26 +56,38 @@ export function createProgress(): Progress {
|
|
|
29
56
|
const interactive = useSpinner();
|
|
30
57
|
const silent = isSilent();
|
|
31
58
|
|
|
59
|
+
const renderSpinnerText = (label: string): string => {
|
|
60
|
+
const bar = renderBar(count, total);
|
|
61
|
+
const pct = total > 0 ? Math.floor((count / total) * 100) : 0;
|
|
62
|
+
const tail = label ? ` — ${truncateLabel(label)}` : "";
|
|
63
|
+
return `${bar} ${count}/${total} (${pct}%)${tail}`;
|
|
64
|
+
};
|
|
65
|
+
|
|
32
66
|
return {
|
|
33
67
|
start(t: number, label?: string) {
|
|
34
68
|
total = t;
|
|
35
69
|
count = 0;
|
|
36
70
|
if (silent) return;
|
|
37
71
|
if (interactive) {
|
|
38
|
-
|
|
72
|
+
const initial = renderSpinnerText(label ?? "");
|
|
73
|
+
spinner = logger.startSpinner(initial);
|
|
39
74
|
} else if (label) {
|
|
40
|
-
logger.info(label);
|
|
75
|
+
logger.info(`${label}: 0/${total}`);
|
|
41
76
|
}
|
|
42
77
|
},
|
|
43
78
|
tick(label: string) {
|
|
44
79
|
count += 1;
|
|
45
80
|
if (silent) return;
|
|
46
81
|
if (interactive && spinner) {
|
|
47
|
-
spinner.update(
|
|
82
|
+
spinner.update(renderSpinnerText(label));
|
|
48
83
|
} else {
|
|
49
84
|
logger.info(`[${count}/${total}] ${label}`);
|
|
50
85
|
}
|
|
51
86
|
},
|
|
87
|
+
entry(line: string) {
|
|
88
|
+
if (silent) return;
|
|
89
|
+
logger.info(line);
|
|
90
|
+
},
|
|
52
91
|
done(summary?: string) {
|
|
53
92
|
if (silent) return;
|
|
54
93
|
if (interactive && spinner) {
|
package/src/refresh/scheduler.ts
CHANGED
|
@@ -40,6 +40,14 @@ export function startDaemon(ctx: AppContext, tickSec: number): () => void {
|
|
|
40
40
|
await runDueRefreshes(ctx);
|
|
41
41
|
} catch (err) {
|
|
42
42
|
logger.warn(`daemon: tick failed (${err instanceof Error ? err.message : String(err)})`);
|
|
43
|
+
} finally {
|
|
44
|
+
// Drop the DuckDB lock between ticks so the CLI / MCP server can
|
|
45
|
+
// run while the daemon is idle. Next tick reopens transparently.
|
|
46
|
+
try {
|
|
47
|
+
await ctx.db.release();
|
|
48
|
+
} catch {
|
|
49
|
+
// best effort
|
|
50
|
+
}
|
|
43
51
|
}
|
|
44
52
|
if (!stopped) setTimeout(loop, intervalMs);
|
|
45
53
|
};
|
package/src/sdk.ts
CHANGED
|
@@ -14,8 +14,10 @@ export { chunkDeterministic } from "./ingest/chunker.ts";
|
|
|
14
14
|
export { embed, embedSingle } from "./ingest/embedder.ts";
|
|
15
15
|
export type { FetchedRemote, FetchOptions } from "./ingest/fetcher.ts";
|
|
16
16
|
export { fetchRemote } from "./ingest/fetcher.ts";
|
|
17
|
-
export type { IngestEntryResult, IngestInput, IngestResult } from "./ingest/ingest.ts";
|
|
18
|
-
export { ingest } from "./ingest/ingest.ts";
|
|
17
|
+
export type { IngestCallbacks, IngestEntryResult, IngestInput, IngestResult } from "./ingest/ingest.ts";
|
|
18
|
+
export { countResolvedEntries, ingest, ingestResolved } from "./ingest/ingest.ts";
|
|
19
|
+
export type { ResolvedLocalEntry, ResolvedSource } from "./ingest/source-resolver.ts";
|
|
20
|
+
export { resolveSource } from "./ingest/source-resolver.ts";
|
|
19
21
|
export { buildMcpServer, startHttpServer, startStdioServer } from "./mcp/server.ts";
|
|
20
22
|
export { OPERATIONS } from "./operations/index.ts";
|
|
21
23
|
export type { CliMetadata, Operation } from "./operations/types.ts";
|
package/src/search/hybrid.ts
CHANGED
|
@@ -17,6 +17,11 @@ const SNIPPET_MAX = 300;
|
|
|
17
17
|
* Reciprocal-rank fusion of semantic and keyword hit lists. Each result is
|
|
18
18
|
* keyed by `(logical_path, version_id, chunk_index)` so the same chunk
|
|
19
19
|
* appearing in both lists gets one fused score = sum of its RRF scores.
|
|
20
|
+
*
|
|
21
|
+
* The returned `score` is normalized to [0,1] by dividing by the theoretical
|
|
22
|
+
* max RRF (`2/(k+1)`, achieved when a chunk is rank-0 on both lists). This
|
|
23
|
+
* preserves ordering — division is monotonic — but makes the displayed value
|
|
24
|
+
* interpretable: 1.0 = top-1 on both signals, ~0.5 = top-1 on one.
|
|
20
25
|
*/
|
|
21
26
|
export function fuseRRF(
|
|
22
27
|
semantic: SemanticHit[],
|
|
@@ -24,6 +29,7 @@ export function fuseRRF(
|
|
|
24
29
|
options: { k?: number; limit: number },
|
|
25
30
|
): FusedHit[] {
|
|
26
31
|
const k = options.k ?? 60;
|
|
32
|
+
const maxRrf = 2 / (k + 1);
|
|
27
33
|
const merged = new Map<
|
|
28
34
|
string,
|
|
29
35
|
{
|
|
@@ -89,7 +95,7 @@ export function fuseRRF(
|
|
|
89
95
|
version_id: h.version_id,
|
|
90
96
|
chunk_index: h.chunk_index,
|
|
91
97
|
snippet: h.snippet,
|
|
92
|
-
score: round(h.rrf),
|
|
98
|
+
score: round(h.rrf / maxRrf),
|
|
93
99
|
semantic_score: h.semantic_score,
|
|
94
100
|
keyword_score: h.keyword_score,
|
|
95
101
|
}));
|
package/src/search/keyword.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { rebuildFts } from "../db/chunks.ts";
|
|
2
2
|
import type { DbConnection } from "../db/connection.ts";
|
|
3
|
+
import { asHelpful } from "../errors.ts";
|
|
3
4
|
|
|
4
5
|
export interface KeywordHit {
|
|
5
6
|
logical_path: string;
|
|
@@ -23,9 +24,10 @@ interface RawKeywordRow {
|
|
|
23
24
|
|
|
24
25
|
/**
|
|
25
26
|
* BM25 keyword search over `chunks.search_text` via the FTS extension.
|
|
26
|
-
* Returns an empty list when FTS isn't available on this platform
|
|
27
|
-
* hybrid layer treats missing keyword hits as "no
|
|
28
|
-
* to semantic-only.
|
|
27
|
+
* Returns an empty list when FTS isn't available on this platform or the
|
|
28
|
+
* index is empty — the hybrid layer treats missing keyword hits as "no
|
|
29
|
+
* signal" and degrades to semantic-only. Genuine SQL/runtime errors are
|
|
30
|
+
* surfaced as HelpfulError so they don't get silently buried.
|
|
29
31
|
*/
|
|
30
32
|
export async function searchKeyword(
|
|
31
33
|
db: DbConnection,
|
|
@@ -36,15 +38,15 @@ export async function searchKeyword(
|
|
|
36
38
|
if (result.kind !== "rebuilt") return [];
|
|
37
39
|
|
|
38
40
|
const limit = options.limit ?? 50;
|
|
41
|
+
const sql = `SELECT row_key, logical_path, version_id, chunk_index,
|
|
42
|
+
chunk_content, search_text,
|
|
43
|
+
fts_main__current_chunks_fts.match_bm25(row_key, ?1) AS bm25_score
|
|
44
|
+
FROM _current_chunks_fts
|
|
45
|
+
WHERE fts_main__current_chunks_fts.match_bm25(row_key, ?1) IS NOT NULL
|
|
46
|
+
${options.pathPrefix ? "AND logical_path LIKE ?2" : ""}
|
|
47
|
+
ORDER BY bm25_score DESC
|
|
48
|
+
LIMIT ${Number(limit)}`;
|
|
39
49
|
try {
|
|
40
|
-
const sql = `SELECT row_key, logical_path, version_id, chunk_index,
|
|
41
|
-
chunk_content, search_text,
|
|
42
|
-
fts_main__current_chunks_fts.match_bm25(row_key, ?1) AS bm25_score
|
|
43
|
-
FROM _current_chunks_fts
|
|
44
|
-
WHERE fts_main__current_chunks_fts.match_bm25(row_key, ?1) IS NOT NULL
|
|
45
|
-
${options.pathPrefix ? "AND logical_path LIKE ?2" : ""}
|
|
46
|
-
ORDER BY bm25_score DESC
|
|
47
|
-
LIMIT ${Number(limit)}`;
|
|
48
50
|
const rows: RawKeywordRow[] = options.pathPrefix
|
|
49
51
|
? await db.queryAll<RawKeywordRow>(sql, query, `${options.pathPrefix}%`)
|
|
50
52
|
: await db.queryAll<RawKeywordRow>(sql, query);
|
|
@@ -56,7 +58,12 @@ export async function searchKeyword(
|
|
|
56
58
|
search_text: r.search_text,
|
|
57
59
|
score: Number(r.bm25_score),
|
|
58
60
|
}));
|
|
59
|
-
} catch {
|
|
60
|
-
|
|
61
|
+
} catch (e) {
|
|
62
|
+
throw asHelpful(
|
|
63
|
+
e,
|
|
64
|
+
"while running BM25 keyword search",
|
|
65
|
+
"Run `membot reindex` to rebuild the FTS index, then retry the search.",
|
|
66
|
+
"internal_error",
|
|
67
|
+
);
|
|
61
68
|
}
|
|
62
69
|
}
|