membot 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,15 @@ membot add ./docs --refresh-frequency 24h # auto-refresh every day
36
36
 
37
37
  Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
38
38
 
39
+ The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
40
+
41
+ - Local file → absolute path with leading `/` stripped (e.g. `/Users/me/projA/README.md` → `Users/me/projA/README.md`).
42
+ - Local directory or glob → each entry's absolute path under the same shape.
43
+ - URL → `remotes/{host}/{path}` with `/`'s preserved (e.g. `https://github.com/userA/projA/blob/main/README.md` → `remotes/github.com/userA/projA/blob/main/README.md`). Query strings and fragments are dropped from the logical_path (the full URL is still stored for refresh).
44
+ - `inline:<text>` → `inline/{timestamp}.md`.
45
+
46
+ Pass `-p <path>` (or `--logical-path`) to override. On a directory walk it's treated as a *prefix* — entries land at `{prefix}/{path-relative-to-walk-base}`. Re-running `membot add` on the same source reuses the same logical_path: if the source bytes changed, a new version is created; if they are unchanged, the add is skipped (status `unchanged`) unless you pass `--force`.
47
+
39
48
  ## 3. Read
40
49
 
41
50
  ```bash
@@ -75,7 +84,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
75
84
 
76
85
  - Defaults always operate on the current, non-tombstoned version.
77
86
  - Pass an explicit `--version <timestamp>` (from `membot versions`) to read or diff history.
78
- - `membot_add`, refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved.
87
+ - `membot_add` (when source bytes have changed), refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved. Re-running `membot_add` against an unchanged source is a no-op (status `unchanged`, same `version_id`); pass `force=true` to force a new version.
79
88
  - Mutating an existing version is not possible — corrections are new versions.
80
89
 
81
90
  ## When to use this skill
@@ -99,7 +108,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
99
108
 
100
109
  | Command | Purpose |
101
110
  | ------------------------------------- | ------------------------------------------------------------------------------ |
102
- | `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>` (one new version each) |
111
+ | `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>`. Skips unchanged sources; pass `--force` to re-ingest |
103
112
  | `membot ls [prefix]` | List current files (size, mime, refresh status) |
104
113
  | `membot tree [prefix]` | Render the synthesised logical-path tree |
105
114
  | `membot read <path>` | Read current markdown surrogate (or `--bytes` for original) |
@@ -36,6 +36,15 @@ membot add ./docs --refresh-frequency 24h # auto-refresh every day
36
36
 
37
37
  Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
38
38
 
39
+ The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
40
+
41
+ - Local file → absolute path with leading `/` stripped (e.g. `/Users/me/projA/README.md` → `Users/me/projA/README.md`).
42
+ - Local directory or glob → each entry's absolute path under the same shape.
43
+ - URL → `remotes/{host}/{path}` with `/`'s preserved (e.g. `https://github.com/userA/projA/blob/main/README.md` → `remotes/github.com/userA/projA/blob/main/README.md`). Query strings and fragments are dropped from the logical_path (the full URL is still stored for refresh).
44
+ - `inline:<text>` → `inline/{timestamp}.md`.
45
+
46
+ Pass `-p <path>` (or `--logical-path`) to override. On a directory walk it's treated as a *prefix* — entries land at `{prefix}/{path-relative-to-walk-base}`. Re-running `membot add` on the same source reuses the same logical_path: if the source bytes changed, a new version is created; if they are unchanged, the add is skipped (status `unchanged`) unless you pass `--force`.
47
+
39
48
  ## 3. Read
40
49
 
41
50
  ```bash
@@ -75,7 +84,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
75
84
 
76
85
  - Defaults always operate on the current, non-tombstoned version.
77
86
  - Pass an explicit `--version <timestamp>` (from `membot versions`) to read or diff history.
78
- - `membot_add`, refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved.
87
+ - `membot_add` (when source bytes have changed), refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved. Re-running `membot_add` against an unchanged source is a no-op (status `unchanged`, same `version_id`); pass `force=true` to force a new version.
79
88
  - Mutating an existing version is not possible — corrections are new versions.
80
89
 
81
90
  ## When to use this rule
@@ -99,7 +108,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
99
108
 
100
109
  | Command | Purpose |
101
110
  | ------------------------------------- | ------------------------------------------------------------------------------ |
102
- | `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>` (one new version each) |
111
+ | `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>`. Skips unchanged sources; pass `--force` to re-ingest |
103
112
  | `membot ls [prefix]` | List current files (size, mime, refresh status) |
104
113
  | `membot tree [prefix]` | Render the synthesised logical-path tree |
105
114
  | `membot read <path>` | Read current markdown surrogate (or `--bytes` for original) |
package/README.md CHANGED
@@ -15,18 +15,13 @@
15
15
  ## Install
16
16
 
17
17
  ```bash
18
- # macOS / Linux — pre-built binary
19
- curl -fsSL https://raw.githubusercontent.com/evantahler/membot/main/install.sh | bash
20
-
21
- # Windows — PowerShell
22
- iwr -useb https://raw.githubusercontent.com/evantahler/membot/main/install.ps1 | iex
23
-
24
- # From npm (requires Bun or Node)
25
- bun add -g membot
18
+ bun install -g membot
26
19
  # or
27
20
  npm install -g membot
28
21
  ```
29
22
 
23
+ This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global npm/bun install is the supported path.
24
+
30
25
  ## Quick start
31
26
 
32
27
  ```bash
@@ -55,7 +50,7 @@ The skill files describe the discover → ingest → search → read → write w
55
50
 
56
51
  | Command | Description |
57
52
  | ------------------------------- | --------------------------------------------------------------------------------- |
58
- | `membot add <source>` | Ingest a file, directory, glob, URL, or `inline:<text>`. Each match new version |
53
+ | `membot add <source>` | Ingest a file, directory, glob, URL, or `inline:<text>`. Default `logical_path` mirrors the source (absolute path for local files, `remotes/{host}/{path}` for URLs) so files with the same basename in different projects don't collide. Pass `-p <path>` to override or, on a directory walk, to set a prefix. Skips on unchanged source bytes; pass `--force` to re-ingest. |
59
54
  | `membot ls [prefix]` | List current files (size, mime, refresh status) |
60
55
  | `membot tree [prefix]` | Render the synthesised logical-path tree |
61
56
  | `membot read <path>` | Read the markdown surrogate (or `--bytes` for original bytes, base64) |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "membot",
3
- "version": "0.1.1",
3
+ "version": "0.2.0",
4
4
  "description": "Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.",
5
5
  "type": "module",
6
6
  "exports": {
@@ -26,7 +26,7 @@
26
26
  "test": "bun test",
27
27
  "lint": "biome ci . && tsc --noEmit",
28
28
  "format": "biome check --write .",
29
- "prebuild": "bash scripts/apply-transformers-patch.sh",
29
+ "prebuild": "bash scripts/apply-patches.sh",
30
30
  "build": "bun build --compile --minify --sourcemap --external '@duckdb/*' ./src/cli.ts --outfile dist/membot"
31
31
  },
32
32
  "keywords": [
@@ -0,0 +1,44 @@
1
+ diff --git a/src/search/onnx-wasm-paths.ts b/src/search/onnx-wasm-paths.ts
2
+ --- a/src/search/onnx-wasm-paths.ts
3
+ +++ b/src/search/onnx-wasm-paths.ts
4
+ @@ -1,31 +1,9 @@
5
+ -// Embed the onnxruntime-web WASM runtime files into the compiled binary
6
+ -// (`bun build --compile`) so they survive in a single-binary distribution
7
+ -// where the user has no node_modules.
8
+ -//
9
+ -// This file is loaded **dynamically** by semantic.ts. The relative paths
10
+ -// only resolve in the local repo / compiled binary; for npm/bun-installed
11
+ -// mcpx the parent directory layout is different (deps are hoisted), the
12
+ -// dynamic import throws, and we fall back to letting transformers.js
13
+ -// load WASM via its default mechanism — which works fine because in
14
+ -// that environment node_modules exists and onnxruntime-web is reachable
15
+ -// through normal module resolution.
16
+ -
17
+ -// The relative `../../node_modules/...` paths only resolve from the local repo
18
+ -// layout (and inside `bun build --compile`). When this file is shipped via npm,
19
+ -// deps are hoisted, so consumer `tsc` runs hit TS2307. The `ts-ignore` directive
20
+ -// below silences that for consumers; we avoid the stricter `expect-error` form
21
+ -// because in the local repo the path resolves fine and there would be no error
22
+ -// to expect. At runtime the dynamic import in semantic.ts is wrapped in
23
+ -// try/catch and falls back to transformers.js's default WASM loader (issue #85).
24
+ -// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore per comment above
25
+ -// @ts-ignore - dynamic-only import
26
+ -import wasmMjsPath from "../../node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.mjs" with {
27
+ - type: "file",
28
+ -};
29
+ -// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore per comment above
30
+ -// @ts-ignore - dynamic-only import
31
+ -import wasmBinPath from "../../node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.wasm" with {
32
+ - type: "file",
33
+ -};
34
+ -
35
+ -export { wasmBinPath, wasmMjsPath };
36
+ +// PATCHED (membot): upstream mcpx ships static `with { type: "file" }` imports
37
+ +// of onnxruntime-web WASM assets via `../../node_modules/...`, which only
38
+ +// resolves when mcpx is built standalone. When consumed as an npm dep those
39
+ +// paths are unreachable and `bun build --compile` fails at build time. membot
40
+ +// never invokes mcpx's semantic search (only `mcpx.exec()` for URL fetching),
41
+ +// so we stub the exports — semantic.ts wraps the dynamic import in try/catch
42
+ +// and falls back to transformers.js's default WASM loader.
43
+ +export const wasmMjsPath = "";
44
+ +export const wasmBinPath = "";
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # Apply node_modules patches imperatively. We don't use package.json's
5
+ # `patchedDependencies` field because that field, when present in a published
6
+ # package, breaks `bun install` from a tarball.
7
+ #
8
+ # Each patch is gated by a marker file inside its target so reruns are no-ops.
9
+
10
+ apply_patch() {
11
+ local patch="$1" target="$2" marker_name="$3"
12
+ local marker="$target/$marker_name"
13
+
14
+ if [ ! -d "$target" ]; then
15
+ echo "error: $target not found — run \`bun install\` first" >&2
16
+ exit 1
17
+ fi
18
+ if [ ! -f "$patch" ]; then
19
+ echo "error: $patch not found" >&2
20
+ exit 1
21
+ fi
22
+ if [ -f "$marker" ]; then
23
+ echo "patch $patch already applied — skipping"
24
+ return 0
25
+ fi
26
+
27
+ echo "Applying $patch to $target..."
28
+ git apply --directory="$target" "$patch"
29
+ touch "$marker"
30
+ }
31
+
32
+ # @huggingface/transformers — replace static `import 'onnxruntime-node'` with a
33
+ # stub so `bun build --compile` produces a binary using the WASM backend
34
+ # (onnxruntime-web) instead of onnxruntime-node, whose native bindings can't be
35
+ # bundled into a single-binary distribution.
36
+ apply_patch \
37
+ "patches/@huggingface%2Ftransformers@4.2.0.patch" \
38
+ "node_modules/@huggingface/transformers" \
39
+ ".membot-transformers-patch-applied"
40
+
41
+ # @evantahler/mcpx — stub `src/search/onnx-wasm-paths.ts` whose static
42
+ # `with { type: "file" }` imports use a relative path that only resolves in
43
+ # mcpx's own repo layout. When mcpx is consumed as an npm dep those paths are
44
+ # unreachable and `bun build --compile` fails at build time. membot never
45
+ # invokes mcpx's semantic search, so the stubbed exports are safe.
46
+ apply_patch \
47
+ "patches/@evantahler%2Fmcpx@0.21.4.patch" \
48
+ "node_modules/@evantahler/mcpx" \
49
+ ".membot-mcpx-patch-applied"
@@ -21,13 +21,14 @@ export interface IngestInput {
21
21
  refresh_frequency?: string;
22
22
  fetcher_hint?: string;
23
23
  change_note?: string;
24
+ force?: boolean;
24
25
  }
25
26
 
26
27
  export interface IngestEntryResult {
27
28
  source_path: string;
28
29
  logical_path: string;
29
30
  version_id: string | null;
30
- status: "ok" | "failed";
31
+ status: "ok" | "unchanged" | "failed";
31
32
  error?: string;
32
33
  mime_type: string | null;
33
34
  size_bytes: number;
@@ -39,6 +40,7 @@ export interface IngestResult {
39
40
  ingested: IngestEntryResult[];
40
41
  total: number;
41
42
  ok: number;
43
+ unchanged: number;
42
44
  failed: number;
43
45
  }
44
46
 
@@ -57,14 +59,15 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
57
59
  });
58
60
 
59
61
  const refreshSec = parseDuration(input.refresh_frequency);
62
+ const force = input.force === true;
60
63
 
61
64
  if (resolved.kind === "inline") {
62
65
  return ingestInline(resolved.text, input, ctx, refreshSec);
63
66
  }
64
67
  if (resolved.kind === "url") {
65
- return ingestUrl(resolved.url, input, ctx, refreshSec);
68
+ return ingestUrl(resolved.url, input, ctx, refreshSec, force);
66
69
  }
67
- return ingestLocalFiles(resolved, input, ctx, refreshSec);
70
+ return ingestLocalFiles(resolved, input, ctx, refreshSec, force);
68
71
  }
69
72
 
70
73
  /** Ingest a single inline blob (source_type='inline'). */
@@ -119,6 +122,7 @@ async function ingestUrl(
119
122
  input: IngestInput,
120
123
  ctx: AppContext,
121
124
  refreshSec: number | null,
125
+ force: boolean,
122
126
  ): Promise<IngestResult> {
123
127
  const mcpxAdapter = ctx.mcpx
124
128
  ? {
@@ -151,6 +155,15 @@ async function ingestUrl(
151
155
  result.fetcher = fetched.fetcher;
152
156
  result.source_sha256 = fetched.sha256;
153
157
 
158
+ if (!force) {
159
+ const cur = await getCurrent(ctx.db, logicalPath);
160
+ if (cur && cur.source_sha256 === fetched.sha256) {
161
+ result.status = "unchanged";
162
+ result.version_id = cur.version_id;
163
+ return summarize([result]);
164
+ }
165
+ }
166
+
154
167
  const versionId = await pipelineForBytes(ctx, {
155
168
  logicalPath,
156
169
  bytes: fetched.bytes,
@@ -181,6 +194,7 @@ async function ingestLocalFiles(
181
194
  input: IngestInput,
182
195
  ctx: AppContext,
183
196
  refreshSec: number | null,
197
+ force: boolean,
184
198
  ): Promise<IngestResult> {
185
199
  if (resolved.entries.length === 0) {
186
200
  throw new HelpfulError({
@@ -195,7 +209,7 @@ async function ingestLocalFiles(
195
209
  const isMulti = resolved.entries.length > 1;
196
210
 
197
211
  for (const entry of resolved.entries) {
198
- ctx.progress.tick(entry.relPath);
212
+ ctx.progress.tick(entry.relPathFromBase);
199
213
  const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
200
214
  const result: IngestEntryResult = {
201
215
  source_path: entry.absPath,
@@ -213,6 +227,16 @@ async function ingestLocalFiles(
213
227
  result.size_bytes = local.sizeBytes;
214
228
  result.source_sha256 = local.sha256;
215
229
 
230
+ if (!force) {
231
+ const cur = await getCurrent(ctx.db, logicalPath);
232
+ if (cur && cur.source_sha256 === local.sha256) {
233
+ result.status = "unchanged";
234
+ result.version_id = cur.version_id;
235
+ results.push(result);
236
+ continue;
237
+ }
238
+ }
239
+
216
240
  const versionId = await pipelineForBytes(ctx, {
217
241
  logicalPath,
218
242
  bytes: local.bytes,
@@ -236,7 +260,10 @@ async function ingestLocalFiles(
236
260
  }
237
261
  results.push(result);
238
262
  }
239
- ctx.progress.done(`ingested ${results.filter((r) => r.status === "ok").length}/${results.length}`);
263
+ const okCount = results.filter((r) => r.status === "ok").length;
264
+ const unchangedCount = results.filter((r) => r.status === "unchanged").length;
265
+ const suffix = unchangedCount > 0 ? ` (${unchangedCount} unchanged)` : "";
266
+ ctx.progress.done(`ingested ${okCount}/${results.length}${suffix}`);
240
267
 
241
268
  return summarize(results);
242
269
  }
@@ -377,26 +404,47 @@ async function persistVersion(ctx: AppContext, p: PersistParams): Promise<string
377
404
  }
378
405
 
379
406
  /**
380
- * Pick the logical path for a single matched entry. For a single-file
381
- * ingest with explicit `logical_path`, use it as-is. For multi-entry
382
- * ingests with `logical_path` set, treat it as a *prefix* under which
383
- * each entry's relative path is placed.
407
+ * Pick the logical path for a single matched entry.
408
+ *
409
+ * - Default (no explicit logical_path): use the entry's absolute filesystem
410
+ * path with `\` normalized to `/` and the leading `/` stripped. This
411
+ * keeps `~/projA/README.md` and `~/projB/README.md` from colliding under
412
+ * a shared `README.md`. Two adds of the same absolute path produce the
413
+ * same logical_path, so the second add lands on the same entry (a new version when the bytes changed, otherwise reported as `unchanged`).
414
+ * - Single-source with explicit logical_path: use it verbatim.
415
+ * - Multi-entry (directory/glob) with explicit logical_path: treat as a
416
+ * prefix and append each entry's path relative to the walk base.
384
417
  */
385
- function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
386
- if (!explicit) return entry.relPath.replaceAll("\\", "/");
418
+ export function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
419
+ if (!explicit) return normalizeAbs(entry.absPath);
387
420
  if (!isMulti) return explicit;
388
421
  const prefix = explicit.endsWith("/") ? explicit.slice(0, -1) : explicit;
389
- return `${prefix}/${entry.relPath.replaceAll("\\", "/")}`;
422
+ return `${prefix}/${entry.relPathFromBase.replaceAll("\\", "/")}`;
390
423
  }
391
424
 
392
- /** Default logical path for an ingested URL — host + path, sanitized. */
393
- function defaultLogicalForUrl(url: string): string {
425
+ /**
426
+ * Normalize an absolute filesystem path into a logical_path:
427
+ * `\` → `/`, leading `/` stripped. Drive letters (Windows `C:`) are kept
428
+ * as the first path segment.
429
+ */
430
+ export function normalizeAbs(absPath: string): string {
431
+ return absPath.replaceAll("\\", "/").replace(/^\/+/, "");
432
+ }
433
+
434
+ /**
435
+ * Default logical path for an ingested URL: `remotes/{host}/{pathname}`
436
+ * with slashes preserved so two projects on the same host (e.g.,
437
+ * github.com) don't collide. Query string and fragment are dropped from
438
+ * the logical_path for stable identity — the full URL is still preserved
439
+ * on the row in `source_path` and used for refresh.
440
+ */
441
+ export function defaultLogicalForUrl(url: string): string {
394
442
  try {
395
443
  const u = new URL(url);
396
- const tail = u.pathname.replace(/^\/+/, "").replaceAll("/", "_") || "root";
397
- return `urls/${u.hostname}/${tail || "root"}`;
444
+ const tail = u.pathname.replace(/^\/+/, "").replace(/\/+$/, "") || "index";
445
+ return `remotes/${u.hostname}/${tail}`;
398
446
  } catch {
399
- return `urls/${url.replace(/[^a-z0-9.-]/gi, "_")}`;
447
+ return `remotes/${url.replace(/[^a-z0-9.-]/gi, "_")}`;
400
448
  }
401
449
  }
402
450
 
@@ -428,12 +476,14 @@ export function parseDuration(input: string | null | undefined): number | null {
428
476
  /** Roll a list of per-entry results into the top-level summary shape. */
429
477
  function summarize(entries: IngestEntryResult[]): IngestResult {
430
478
  let ok = 0;
479
+ let unchanged = 0;
431
480
  let failed = 0;
432
481
  for (const e of entries) {
433
482
  if (e.status === "ok") ok += 1;
483
+ else if (e.status === "unchanged") unchanged += 1;
434
484
  else failed += 1;
435
485
  }
436
- return { ingested: entries, total: entries.length, ok, failed };
486
+ return { ingested: entries, total: entries.length, ok, unchanged, failed };
437
487
  }
438
488
 
439
489
  function errorMessage(err: unknown): string {
@@ -9,9 +9,15 @@ export type ResolvedSource =
9
9
  | { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string };
10
10
 
11
11
  export interface ResolvedLocalEntry {
12
+ /** Absolute filesystem path (post-realpath). */
12
13
  absPath: string;
13
- /** Path relative to the base; used to derive a default logical_path. */
14
- relPath: string;
14
+ /**
15
+ * Path relative to the walk base. Used when the caller passes an
16
+ * explicit `logical_path` *prefix* (directory/glob mode) — entries land
17
+ * at `{prefix}/{relPathFromBase}`. For default logical_paths we use
18
+ * `absPath` directly so paths from different filesystems don't collide.
19
+ */
20
+ relPathFromBase: string;
15
21
  }
16
22
 
17
23
  export interface ResolveOptions {
@@ -43,10 +49,12 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
43
49
  }
44
50
 
45
51
  const followSymlinks = options.followSymlinks !== false;
46
- const includeMatchers = (options.include ?? "**/*")
47
- .split(",")
48
- .map((g) => g.trim())
49
- .filter(Boolean);
52
+ const userIncludes = options.include
53
+ ? options.include
54
+ .split(",")
55
+ .map((g) => g.trim())
56
+ .filter(Boolean)
57
+ : [];
50
58
  const excludeMatchers = [
51
59
  ...DEFAULT_EXCLUDES,
52
60
  ...(options.exclude ?? "")
@@ -57,9 +65,14 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
57
65
 
58
66
  if (isGlob(source)) {
59
67
  const base = globBase(source);
68
+ const remainder = globRemainder(source);
60
69
  try {
61
70
  const realBase = await realpath(base);
62
- return walk(realBase, [source, ...includeMatchers], excludeMatchers, followSymlinks);
71
+ // Source glob acts as a hard filter; user includes (if any) further
72
+ // narrow the result via AND. Pass them as a separate matcher so the
73
+ // two sets aren't picomatch-OR'd together.
74
+ const extraIncludes = userIncludes.length > 0 ? [userIncludes] : [];
75
+ return walk(realBase, [remainder], excludeMatchers, followSymlinks, extraIncludes);
63
76
  } catch (err) {
64
77
  throw asHelpful(
65
78
  err,
@@ -84,16 +97,18 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
84
97
  }
85
98
 
86
99
  if (st.isFile()) {
100
+ const real = await realpath(abs);
87
101
  return {
88
102
  kind: "local-files",
89
- basePath: abs,
90
- entries: [{ absPath: abs, relPath: source.split(sep).pop() ?? source }],
103
+ basePath: real,
104
+ entries: [{ absPath: real, relPathFromBase: real.split(sep).pop() ?? real }],
91
105
  };
92
106
  }
93
107
 
94
108
  if (st.isDirectory()) {
95
109
  const realBase = await realpath(abs);
96
- return walk(realBase, includeMatchers, excludeMatchers, followSymlinks);
110
+ const dirIncludes = userIncludes.length > 0 ? userIncludes : ["**/*"];
111
+ return walk(realBase, dirIncludes, excludeMatchers, followSymlinks);
97
112
  }
98
113
 
99
114
  throw new HelpfulError({
@@ -120,22 +135,40 @@ export function globBase(glob: string): string {
120
135
  return base.length === 0 || !isAbsolute(base) ? resolve(base || ".") : base;
121
136
  }
122
137
 
138
+ /**
139
+ * Take the wildcard portion of a glob — everything from the first segment
140
+ * containing a wildcard onward. We strip the static prefix so the matcher
141
+ * runs against entry paths relative to `globBase`. Without this, a glob like
142
+ * `docs/star-star/star.md` never matches anything under base=`docs/`, since
143
+ * walk() exposes `sub/file.md` to picomatch, not `docs/sub/file.md`.
144
+ */
145
+ export function globRemainder(glob: string): string {
146
+ const parts = glob.split(sep);
147
+ const wildcardIdx = parts.findIndex((p) => /[*?[\]{}!]/.test(p));
148
+ if (wildcardIdx === -1) return glob;
149
+ return parts.slice(wildcardIdx).join(sep);
150
+ }
151
+
123
152
  /**
124
153
  * Recursively walk `base`, returning files matched by `includes` and not
125
154
  * matched by `excludes`. Both globsets match against the entry's path
126
155
  * relative to `base`. Symlinks are followed when `followSymlinks` is true,
127
- * with cycles detected via a realpath cache.
156
+ * with cycles detected via a realpath cache. `extraIncludeSets` is a list
157
+ * of additional include groups, each ANDed onto the primary `includes` —
158
+ * use it when two filters must both match (e.g. source glob + --include).
128
159
  */
129
160
  async function walk(
130
161
  base: string,
131
162
  includes: string[],
132
163
  excludes: string[],
133
164
  followSymlinks: boolean,
165
+ extraIncludeSets: string[][] = [],
134
166
  ): Promise<ResolvedSource> {
135
167
  const seen = new Set<string>();
136
168
  const entries: ResolvedLocalEntry[] = [];
137
169
 
138
170
  const isInclude = picomatch(includes, { dot: false, nocase: false });
171
+ const extraMatchers = extraIncludeSets.map((set) => picomatch(set, { dot: false, nocase: false }));
139
172
  const isExclude = excludes.length ? picomatch(excludes, { dot: false }) : null;
140
173
 
141
174
  const queue: string[] = [base];
@@ -174,7 +207,8 @@ async function walk(
174
207
  const relForMatch = rel.length === 0 ? (cur.split(sep).pop() ?? cur) : rel;
175
208
  if (isExclude?.(relForMatch)) continue;
176
209
  if (!isInclude(relForMatch)) continue;
177
- entries.push({ absPath: real, relPath: relForMatch });
210
+ if (extraMatchers.some((m) => !m(relForMatch))) continue;
211
+ entries.push({ absPath: real, relPathFromBase: relForMatch });
178
212
  }
179
213
 
180
214
  return { kind: "local-files", basePath: base, entries };
@@ -14,11 +14,23 @@ export const addOperation = defineOperation({
14
14
  - a glob pattern (e.g. "docs/**/*.md")
15
15
  - a URL (fetched via mcpx if configured, otherwise plain HTTP)
16
16
  - "inline:<text>" literal
17
- PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. Each ingested file becomes a NEW version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.`,
17
+ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. By default, re-ingesting an unchanged source (same source_sha256 as the current version) is a no-op and reports \`status: "unchanged"\`; pass \`force=true\` to always create a new version. Each newly-ingested file becomes a new version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.
18
+
19
+ When \`logical_path\` is omitted, it is derived from the source so files with the same basename in different projects do not collide:
20
+ - Local sources use the entry's absolute filesystem path with the leading "/" stripped (e.g. "/Users/me/projA/README.md" → "Users/me/projA/README.md").
21
+ - URLs use "remotes/{host}/{path}" with slashes preserved (e.g. "https://github.com/u/p/blob/main/README.md" → "remotes/github.com/u/p/blob/main/README.md"). Query strings and fragments are dropped from the logical_path; the full URL is still stored on the row for refresh.
22
+ - "inline:<text>" defaults to "inline/{timestamp}.md".
23
+
24
+ Pass \`logical_path\` to override. For a directory or glob walk it is treated as a PREFIX — each entry is placed at "{prefix}/{path-relative-to-walk-base}". Re-running \`membot_add\` on the same source resolves to the same logical_path; if bytes are unchanged the call is a no-op (status \`unchanged\`), otherwise a new version is created.`,
18
25
  inputSchema: z.object({
19
26
  source: z.string().describe("Local path, directory, glob, URL, or `inline:<text>` literal"),
20
27
  logical_path: z.string().optional().describe("Destination logical_path (single source) or prefix (directory/glob)"),
21
- include: z.string().optional().describe("Glob include filter (comma-separated for multiple); default `**/*`"),
28
+ include: z
29
+ .string()
30
+ .optional()
31
+ .describe(
32
+ "Glob include filter (comma-separated for multiple). Defaults to `**/*` for directory sources, or the source pattern itself when source is a glob.",
33
+ ),
22
34
  exclude: z.string().optional().describe("Glob exclude filter (comma-separated for multiple)"),
23
35
  follow_symlinks: z
24
36
  .boolean()
@@ -30,6 +42,10 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
30
42
  .optional()
31
43
  .describe("Free-form hint passed to mcpx tool search (e.g. 'firecrawl', 'github', 'google docs', 'http')"),
32
44
  change_note: z.string().optional().describe("Free-text note attached to the new version"),
45
+ force: z
46
+ .boolean()
47
+ .optional()
48
+ .describe("Re-ingest even when source bytes are unchanged. Default skips and reports `unchanged`."),
33
49
  }),
34
50
  outputSchema: z.object({
35
51
  ingested: z.array(
@@ -37,7 +53,7 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
37
53
  source_path: z.string(),
38
54
  logical_path: z.string(),
39
55
  version_id: z.string().nullable(),
40
- status: z.enum(["ok", "failed"]),
56
+ status: z.enum(["ok", "unchanged", "failed"]),
41
57
  error: z.string().optional(),
42
58
  mime_type: z.string().nullable(),
43
59
  size_bytes: z.number(),
@@ -47,23 +63,27 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
47
63
  ),
48
64
  total: z.number(),
49
65
  ok: z.number(),
66
+ unchanged: z.number(),
50
67
  failed: z.number(),
51
68
  }),
52
69
  cli: {
53
70
  positional: ["source"],
54
- aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m" },
71
+ aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m", force: "-f" },
55
72
  },
56
73
  console_formatter: (result) => {
57
74
  const lines = result.ingested.map((e) => {
58
75
  if (e.status === "ok") {
59
76
  return `${colors.green("✓")} ${colors.cyan(e.logical_path)} ${colors.dim(`(${e.fetcher}, ${e.size_bytes}B)`)}`;
60
77
  }
78
+ if (e.status === "unchanged") {
79
+ return `${colors.dim("≡")} ${colors.cyan(e.logical_path)} ${colors.dim("(unchanged)")}`;
80
+ }
61
81
  return `${colors.red("✗")} ${e.source_path} ${colors.dim(e.error ?? "")}`;
62
82
  });
63
- const summary = result.failed
64
- ? `${colors.green(`added ${result.ok}`)}, ${colors.red(`failed ${result.failed}`)}`
65
- : colors.green(`added ${result.ok}`);
66
- return `${lines.join("\n")}\n${summary}`;
83
+ const parts: string[] = [colors.green(`added ${result.ok}`)];
84
+ if (result.unchanged > 0) parts.push(colors.dim(`unchanged ${result.unchanged}`));
85
+ if (result.failed > 0) parts.push(colors.red(`failed ${result.failed}`));
86
+ return `${lines.join("\n")}\n${parts.join(", ")}`;
67
87
  },
68
88
  handler: async (input, ctx) => ingest(input, ctx),
69
89
  });
@@ -47,42 +47,47 @@ export const treeOperation = defineOperation({
47
47
 
48
48
  /**
49
49
  * Build a tree of TreeNode objects from a flat list of `/`-delimited paths.
50
- * Splits each path into segments and groups by common prefix; nodes deeper
51
- * than `maxDepth` are folded into their parent's `children` summary count.
50
+ * Splits each path into segments and groups by common prefix. Segments
51
+ * deeper than `maxDepth` are folded into the deepest visible ancestor;
52
+ * that ancestor is marked `is_file=true` so the renderer surfaces it as a
53
+ * leaf even though longer paths exist underneath.
52
54
  */
53
55
  function buildTree(paths: string[], maxDepth: number): TreeNode[] {
54
- const root: Map<string, TreeNode> = new Map();
56
+ interface MutableNode {
57
+ name: string;
58
+ full_path: string;
59
+ is_file: boolean;
60
+ children: Map<string, MutableNode>;
61
+ }
62
+ const root = new Map<string, MutableNode>();
55
63
  for (const path of paths) {
56
64
  const segs = path.split("/").filter(Boolean);
65
+ if (segs.length === 0) continue;
57
66
  let level = root;
58
67
  const trail: string[] = [];
59
- for (let i = 0; i < segs.length && i < maxDepth; i++) {
68
+ const stop = Math.min(segs.length, maxDepth);
69
+ for (let i = 0; i < stop; i++) {
60
70
  const seg = segs[i]!;
61
71
  trail.push(seg);
62
- const fullPath = trail.join("/");
63
72
  let node = level.get(seg);
64
73
  if (!node) {
65
- node = { name: seg, full_path: fullPath, is_file: i === segs.length - 1 };
74
+ node = { name: seg, full_path: trail.join("/"), is_file: false, children: new Map() };
66
75
  level.set(seg, node);
67
- } else if (i === segs.length - 1) {
68
- node.is_file = true;
69
- }
70
- if (i < segs.length - 1) {
71
- if (!node.children) node.children = [];
72
- const childMap = new Map(node.children.map((c) => [c.name, c] as const));
73
- node.children = [...childMap.values()];
74
- level = childMap;
75
- if (childMap.size === 0) {
76
- level = new Map();
77
- node.children = [];
78
- } else {
79
- // rebuild level pointer
80
- level = new Map(node.children.map((c) => [c.name, c] as const));
81
- }
82
76
  }
77
+ const isTerminal = i === segs.length - 1 || i === maxDepth - 1;
78
+ if (isTerminal) node.is_file = true;
79
+ level = node.children;
83
80
  }
84
81
  }
85
- return [...root.values()].sort((a, b) => a.name.localeCompare(b.name));
82
+ const finalize = (m: Map<string, MutableNode>): TreeNode[] => {
83
+ const arr = [...m.values()].sort((a, b) => a.name.localeCompare(b.name));
84
+ return arr.map((n) => {
85
+ const out: TreeNode = { name: n.name, full_path: n.full_path, is_file: n.is_file };
86
+ if (n.children.size > 0) out.children = finalize(n.children);
87
+ return out;
88
+ });
89
+ };
90
+ return finalize(root);
86
91
  }
87
92
 
88
93
  /**
@@ -1,35 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- # Apply the @huggingface/transformers patch to node_modules so that
5
- # `bun build --compile` produces a binary using the WASM backend
6
- # (onnxruntime-web) instead of onnxruntime-node, whose native bindings
7
- # can't be bundled into a single-binary distribution.
8
- #
9
- # We apply the patch imperatively (rather than via package.json
10
- # `patchedDependencies`) because that field, when present in a
11
- # published package, breaks `bun install` from a tarball.
12
-
13
- PATCH="patches/@huggingface%2Ftransformers@4.2.0.patch"
14
- TARGET="node_modules/@huggingface/transformers"
15
- MARKER="$TARGET/.membot-transformers-patch-applied"
16
-
17
- if [ ! -d "$TARGET" ]; then
18
- echo "error: $TARGET not found — run \`bun install\` first" >&2
19
- exit 1
20
- fi
21
-
22
- if [ ! -f "$PATCH" ]; then
23
- echo "error: $PATCH not found" >&2
24
- exit 1
25
- fi
26
-
27
- if [ -f "$MARKER" ]; then
28
- echo "transformers patch already applied — skipping"
29
- exit 0
30
- fi
31
-
32
- echo "Applying transformers patch ($PATCH) to $TARGET..."
33
- git apply --directory="$TARGET" "$PATCH"
34
- touch "$MARKER"
35
- echo "Patch applied."