npm - membot - Versions diffs - 0.1.1 → 0.2.0 - Mend

membot 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/.claude/skills/membot.md +11 -2
package/.cursor/rules/membot.mdc +11 -2
package/README.md +4 -9
package/package.json +2 -2
package/patches/@evantahler%2Fmcpx@0.21.4.patch +44 -0
package/scripts/apply-patches.sh +49 -0
package/src/ingest/ingest.ts +68 -18
package/src/ingest/source-resolver.ts +46 -12
package/src/operations/add.ts +28 -8
package/src/operations/tree.ts +27 -22
package/scripts/apply-transformers-patch.sh +0 -35

package/.claude/skills/membot.md CHANGED Viewed

@@ -36,6 +36,15 @@ membot add ./docs --refresh-frequency 24h         # auto-refresh every day
 Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
+The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
+- Local file → absolute path with leading `/` stripped (e.g. `/Users/me/projA/README.md` → `Users/me/projA/README.md`).
+- Local directory or glob → each entry's absolute path under the same shape.
+- URL → `remotes/{host}/{path}` with `/`'s preserved (e.g. `https://github.com/userA/projA/blob/main/README.md` → `remotes/github.com/userA/projA/blob/main/README.md`). Query strings and fragments are dropped from the logical_path (the full URL is still stored for refresh).
+- `inline:<text>` → `inline/{timestamp}.md`.
+Pass `-p <path>` (or `--logical-path`) to override. On a directory walk it's treated as a *prefix* — entries land at `{prefix}/{path-relative-to-walk-base}`. Re-running `membot add` on the same source reuses the same logical_path: if the source bytes are unchanged the call is a no-op (status `unchanged`), otherwise a new version is created. Pass `--force` to create a new version regardless.
 ## 3. Read
 ```bash
@@ -75,7 +84,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 - Defaults always operate on the current, non-tombstoned version.
 - Pass an explicit `--version <timestamp>` (from `membot versions`) to read or diff history.
-- `membot_add`, refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved.
+- `membot_add` (when source bytes have changed), refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved. Re-running `membot_add` against an unchanged source is a no-op (status `unchanged`, same `version_id`); pass `force=true` to force a new version.
 - Mutating an existing version is not possible — corrections are new versions.
 ## When to use this skill
@@ -99,7 +108,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 | Command                               | Purpose                                                                        |
 | ------------------------------------- | ------------------------------------------------------------------------------ |
-| `membot add <source>`                 | Ingest file, directory, glob, URL, or `inline:<text>` (one new version each)   |
+| `membot add <source>`                 | Ingest file, directory, glob, URL, or `inline:<text>`. Skips unchanged sources; pass `--force` to re-ingest |
 | `membot ls [prefix]`                  | List current files (size, mime, refresh status)                                |
 | `membot tree [prefix]`                | Render the synthesised logical-path tree                                       |
 | `membot read <path>`                  | Read current markdown surrogate (or `--bytes` for original)                    |

package/.cursor/rules/membot.mdc CHANGED Viewed

@@ -36,6 +36,15 @@ membot add ./docs --refresh-frequency 24h         # auto-refresh every day
 Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
+The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
+- Local file → absolute path with leading `/` stripped (e.g. `/Users/me/projA/README.md` → `Users/me/projA/README.md`).
+- Local directory or glob → each entry's absolute path under the same shape.
+- URL → `remotes/{host}/{path}` with `/`'s preserved (e.g. `https://github.com/userA/projA/blob/main/README.md` → `remotes/github.com/userA/projA/blob/main/README.md`). Query strings and fragments are dropped from the logical_path (the full URL is still stored for refresh).
+- `inline:<text>` → `inline/{timestamp}.md`.
+Pass `-p <path>` (or `--logical-path`) to override. On a directory walk it's treated as a *prefix* — entries land at `{prefix}/{path-relative-to-walk-base}`. Re-running `membot add` on the same source reuses the same logical_path: if the source bytes are unchanged the call is a no-op (status `unchanged`), otherwise a new version is created. Pass `--force` to create a new version regardless.
 ## 3. Read
 ```bash
@@ -75,7 +84,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 - Defaults always operate on the current, non-tombstoned version.
 - Pass an explicit `--version <timestamp>` (from `membot versions`) to read or diff history.
-- `membot_add`, refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved.
+- `membot_add` (when source bytes have changed), refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved. Re-running `membot_add` against an unchanged source is a no-op (status `unchanged`, same `version_id`); pass `force=true` to force a new version.
 - Mutating an existing version is not possible — corrections are new versions.
 ## When to use this rule
@@ -99,7 +108,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 | Command                               | Purpose                                                                        |
 | ------------------------------------- | ------------------------------------------------------------------------------ |
-| `membot add <source>`                 | Ingest file, directory, glob, URL, or `inline:<text>` (one new version each)   |
+| `membot add <source>`                 | Ingest file, directory, glob, URL, or `inline:<text>`. Skips unchanged sources; pass `--force` to re-ingest |
 | `membot ls [prefix]`                  | List current files (size, mime, refresh status)                                |
 | `membot tree [prefix]`                | Render the synthesised logical-path tree                                       |
 | `membot read <path>`                  | Read current markdown surrogate (or `--bytes` for original)                    |

package/README.md CHANGED Viewed

@@ -15,18 +15,13 @@
 ## Install
 ```bash
-# macOS / Linux — pre-built binary
-curl -fsSL https://raw.githubusercontent.com/evantahler/membot/main/install.sh | bash
-# Windows — PowerShell
-iwr -useb https://raw.githubusercontent.com/evantahler/membot/main/install.ps1 | iex
-# From npm (requires Bun or Node)
-bun add -g membot
+bun install -g membot
 # or
 npm install -g membot
 ```
+This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global npm/bun install is the supported path.
 ## Quick start
 ```bash
@@ -55,7 +50,7 @@ The skill files describe the discover → ingest → search → read → write w
 | Command                         | Description                                                                       |
 | ------------------------------- | --------------------------------------------------------------------------------- |
-| `membot add <source>`           | Ingest a file, directory, glob, URL, or `inline:<text>`. Each match → new version |
+| `membot add <source>`           | Ingest a file, directory, glob, URL, or `inline:<text>`. Default `logical_path` mirrors the source (absolute path for local files, `remotes/{host}/{path}` for URLs) so files with the same basename in different projects don't collide. Pass `-p <path>` to override or, on a directory walk, to set a prefix. Skips on unchanged source bytes; pass `--force` to re-ingest. |
 | `membot ls [prefix]`            | List current files (size, mime, refresh status)                                   |
 | `membot tree [prefix]`          | Render the synthesised logical-path tree                                          |
 | `membot read <path>`            | Read the markdown surrogate (or `--bytes` for original bytes, base64)             |

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "membot",
-	"version": "0.1.1",
+	"version": "0.2.0",
 	"description": "Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.",
 	"type": "module",
 	"exports": {
@@ -26,7 +26,7 @@
 		"test": "bun test",
 		"lint": "biome ci . && tsc --noEmit",
 		"format": "biome check --write .",
-		"prebuild": "bash scripts/apply-transformers-patch.sh",
+		"prebuild": "bash scripts/apply-patches.sh",
 		"build": "bun build --compile --minify --sourcemap --external '@duckdb/*' ./src/cli.ts --outfile dist/membot"
 	},
 	"keywords": [

package/patches/@evantahler%2Fmcpx@0.21.4.patch ADDED Viewed

@@ -0,0 +1,44 @@
+diff --git a/src/search/onnx-wasm-paths.ts b/src/search/onnx-wasm-paths.ts
+--- a/src/search/onnx-wasm-paths.ts
++++ b/src/search/onnx-wasm-paths.ts
+@@ -1,31 +1,9 @@
+-// Embed the onnxruntime-web WASM runtime files into the compiled binary
+-// (`bun build --compile`) so they survive in a single-binary distribution
+-// where the user has no node_modules.
+-//
+-// This file is loaded **dynamically** by semantic.ts. The relative paths
+-// only resolve in the local repo / compiled binary; for npm/bun-installed
+-// mcpx the parent directory layout is different (deps are hoisted), the
+-// dynamic import throws, and we fall back to letting transformers.js
+-// load WASM via its default mechanism — which works fine because in
+-// that environment node_modules exists and onnxruntime-web is reachable
+-// through normal module resolution.
+-
+-// The relative `../../node_modules/...` paths only resolve from the local repo
+-// layout (and inside `bun build --compile`). When this file is shipped via npm,
+-// deps are hoisted, so consumer `tsc` runs hit TS2307. The `ts-ignore` directive
+-// below silences that for consumers; we avoid the stricter `expect-error` form
+-// because in the local repo the path resolves fine and there would be no error
+-// to expect. At runtime the dynamic import in semantic.ts is wrapped in
+-// try/catch and falls back to transformers.js's default WASM loader (issue #85).
+-// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore per comment above
+-// @ts-ignore - dynamic-only import
+-import wasmMjsPath from "../../node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.mjs" with {
+-	type: "file",
+-};
+-// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore per comment above
+-// @ts-ignore - dynamic-only import
+-import wasmBinPath from "../../node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.wasm" with {
+-	type: "file",
+-};
+-
+-export { wasmBinPath, wasmMjsPath };
++// PATCHED (membot): upstream mcpx ships static `with { type: "file" }` imports
++// of onnxruntime-web WASM assets via `../../node_modules/...`, which only
++// resolves when mcpx is built standalone. When consumed as an npm dep those
++// paths are unreachable and `bun build --compile` fails at build time. membot
++// never invokes mcpx's semantic search (only `mcpx.exec()` for URL fetching),
++// so we stub the exports — semantic.ts wraps the dynamic import in try/catch
++// and falls back to transformers.js's default WASM loader.
++export const wasmMjsPath = "";
++export const wasmBinPath = "";

package/scripts/apply-patches.sh ADDED Viewed

@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Apply node_modules patches imperatively. We don't use package.json's
+# `patchedDependencies` field because that field, when present in a published
+# package, breaks `bun install` from a tarball.
+#
+# Each patch is gated by a marker file inside its target so reruns are no-ops.
+apply_patch() {
+	local patch="$1" target="$2" marker_name="$3"
+	local marker="$target/$marker_name"
+	if [ ! -d "$target" ]; then
+		echo "error: $target not found — run \`bun install\` first" >&2
+		exit 1
+	fi
+	if [ ! -f "$patch" ]; then
+		echo "error: $patch not found" >&2
+		exit 1
+	fi
+	if [ -f "$marker" ]; then
+		echo "patch $patch already applied — skipping"
+		return 0
+	fi
+	echo "Applying $patch to $target..."
+	git apply --directory="$target" "$patch"
+	touch "$marker"
+}
+# @huggingface/transformers — replace static `import 'onnxruntime-node'` with a
+# stub so `bun build --compile` produces a binary using the WASM backend
+# (onnxruntime-web) instead of onnxruntime-node, whose native bindings can't be
+# bundled into a single-binary distribution.
+apply_patch \
+	"patches/@huggingface%2Ftransformers@4.2.0.patch" \
+	"node_modules/@huggingface/transformers" \
+	".membot-transformers-patch-applied"
+# @evantahler/mcpx — stub `src/search/onnx-wasm-paths.ts` whose static
+# `with { type: "file" }` imports use a relative path that only resolves in
+# mcpx's own repo layout. When mcpx is consumed as an npm dep those paths are
+# unreachable and `bun build --compile` fails at build time. membot never
+# invokes mcpx's semantic search, so the stubbed exports are safe.
+apply_patch \
+	"patches/@evantahler%2Fmcpx@0.21.4.patch" \
+	"node_modules/@evantahler/mcpx" \
+	".membot-mcpx-patch-applied"

package/src/ingest/ingest.ts CHANGED Viewed

@@ -21,13 +21,14 @@ export interface IngestInput {
 	refresh_frequency?: string;
 	fetcher_hint?: string;
 	change_note?: string;
+	force?: boolean;
 }
 export interface IngestEntryResult {
 	source_path: string;
 	logical_path: string;
 	version_id: string | null;
-	status: "ok" | "failed";
+	status: "ok" | "unchanged" | "failed";
 	error?: string;
 	mime_type: string | null;
 	size_bytes: number;
@@ -39,6 +40,7 @@ export interface IngestResult {
 	ingested: IngestEntryResult[];
 	total: number;
 	ok: number;
+	unchanged: number;
 	failed: number;
 }
@@ -57,14 +59,15 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
 	});
 	const refreshSec = parseDuration(input.refresh_frequency);
+	const force = input.force === true;
 	if (resolved.kind === "inline") {
 		return ingestInline(resolved.text, input, ctx, refreshSec);
 	}
 	if (resolved.kind === "url") {
-		return ingestUrl(resolved.url, input, ctx, refreshSec);
+		return ingestUrl(resolved.url, input, ctx, refreshSec, force);
 	}
-	return ingestLocalFiles(resolved, input, ctx, refreshSec);
+	return ingestLocalFiles(resolved, input, ctx, refreshSec, force);
 }
 /** Ingest a single inline blob (source_type='inline'). */
@@ -119,6 +122,7 @@ async function ingestUrl(
 	input: IngestInput,
 	ctx: AppContext,
 	refreshSec: number | null,
+	force: boolean,
 ): Promise<IngestResult> {
 	const mcpxAdapter = ctx.mcpx
 		? {
@@ -151,6 +155,15 @@ async function ingestUrl(
 		result.fetcher = fetched.fetcher;
 		result.source_sha256 = fetched.sha256;
+		if (!force) {
+			const cur = await getCurrent(ctx.db, logicalPath);
+			if (cur && cur.source_sha256 === fetched.sha256) {
+				result.status = "unchanged";
+				result.version_id = cur.version_id;
+				return summarize([result]);
+			}
+		}
 		const versionId = await pipelineForBytes(ctx, {
 			logicalPath,
 			bytes: fetched.bytes,
@@ -181,6 +194,7 @@ async function ingestLocalFiles(
 	input: IngestInput,
 	ctx: AppContext,
 	refreshSec: number | null,
+	force: boolean,
 ): Promise<IngestResult> {
 	if (resolved.entries.length === 0) {
 		throw new HelpfulError({
@@ -195,7 +209,7 @@ async function ingestLocalFiles(
 	const isMulti = resolved.entries.length > 1;
 	for (const entry of resolved.entries) {
-		ctx.progress.tick(entry.relPath);
+		ctx.progress.tick(entry.relPathFromBase);
 		const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
 		const result: IngestEntryResult = {
 			source_path: entry.absPath,
@@ -213,6 +227,16 @@ async function ingestLocalFiles(
 			result.size_bytes = local.sizeBytes;
 			result.source_sha256 = local.sha256;
+			if (!force) {
+				const cur = await getCurrent(ctx.db, logicalPath);
+				if (cur && cur.source_sha256 === local.sha256) {
+					result.status = "unchanged";
+					result.version_id = cur.version_id;
+					results.push(result);
+					continue;
+				}
+			}
 			const versionId = await pipelineForBytes(ctx, {
 				logicalPath,
 				bytes: local.bytes,
@@ -236,7 +260,10 @@ async function ingestLocalFiles(
 		}
 		results.push(result);
 	}
-	ctx.progress.done(`ingested ${results.filter((r) => r.status === "ok").length}/${results.length}`);
+	const okCount = results.filter((r) => r.status === "ok").length;
+	const unchangedCount = results.filter((r) => r.status === "unchanged").length;
+	const suffix = unchangedCount > 0 ? ` (${unchangedCount} unchanged)` : "";
+	ctx.progress.done(`ingested ${okCount}/${results.length}${suffix}`);
 	return summarize(results);
 }
@@ -377,26 +404,47 @@ async function persistVersion(ctx: AppContext, p: PersistParams): Promise<string
 }
 /**
- * Pick the logical path for a single matched entry. For a single-file
- * ingest with explicit `logical_path`, use it as-is. For multi-entry
- * ingests with `logical_path` set, treat it as a *prefix* under which
- * each entry's relative path is placed.
+ * Pick the logical path for a single matched entry.
+ *
+ * - Default (no explicit logical_path): use the entry's absolute filesystem
+ *   path with `\` normalized to `/` and the leading `/` stripped. This
+ *   keeps `~/projA/README.md` and `~/projB/README.md` from colliding under
+ *   a shared `README.md`. Two adds of the same absolute path produce the
+ *   same logical_path, so the second add lands on the same entry: a new
+ *   version when the bytes changed, `unchanged` otherwise (unless forced).
+ * - Single-source with explicit logical_path: use it verbatim.
+ * - Multi-entry (directory/glob) with explicit logical_path: treat as a
+ *   prefix and append each entry's path relative to the walk base.
  */
-function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
-	if (!explicit) return entry.relPath.replaceAll("\\", "/");
+export function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
+	if (!explicit) return normalizeAbs(entry.absPath);
 	if (!isMulti) return explicit;
 	const prefix = explicit.endsWith("/") ? explicit.slice(0, -1) : explicit;
-	return `${prefix}/${entry.relPath.replaceAll("\\", "/")}`;
+	return `${prefix}/${entry.relPathFromBase.replaceAll("\\", "/")}`;
 }
-/** Default logical path for an ingested URL — host + path, sanitized. */
-function defaultLogicalForUrl(url: string): string {
+/**
+ * Normalize an absolute filesystem path into a logical_path:
+ * `\` → `/`, leading `/` stripped. Drive letters (Windows `C:`) are kept
+ * as the first path segment.
+ */
+export function normalizeAbs(absPath: string): string {
+	return absPath.replaceAll("\\", "/").replace(/^\/+/, "");
+}
+/**
+ * Default logical path for an ingested URL: `remotes/{host}/{pathname}`
+ * with slashes preserved so two projects on the same host (e.g.,
+ * github.com) don't collide. Query string and fragment are dropped from
+ * the logical_path for stable identity — the full URL is still preserved
+ * on the row in `source_path` and used for refresh.
+ */
+export function defaultLogicalForUrl(url: string): string {
 	try {
 		const u = new URL(url);
-		const tail = u.pathname.replace(/^\/+/, "").replaceAll("/", "_") || "root";
-		return `urls/${u.hostname}/${tail || "root"}`;
+		const tail = u.pathname.replace(/^\/+/, "").replace(/\/+$/, "") || "index";
+		return `remotes/${u.hostname}/${tail}`;
 	} catch {
-		return `urls/${url.replace(/[^a-z0-9.-]/gi, "_")}`;
+		return `remotes/${url.replace(/[^a-z0-9.-]/gi, "_")}`;
 	}
 }
@@ -428,12 +476,14 @@ export function parseDuration(input: string | null | undefined): number | null {
 /** Roll a list of per-entry results into the top-level summary shape. */
 function summarize(entries: IngestEntryResult[]): IngestResult {
 	let ok = 0;
+	let unchanged = 0;
 	let failed = 0;
 	for (const e of entries) {
 		if (e.status === "ok") ok += 1;
+		else if (e.status === "unchanged") unchanged += 1;
 		else failed += 1;
 	}
-	return { ingested: entries, total: entries.length, ok, failed };
+	return { ingested: entries, total: entries.length, ok, unchanged, failed };
 }
 function errorMessage(err: unknown): string {

package/src/ingest/source-resolver.ts CHANGED Viewed

@@ -9,9 +9,15 @@ export type ResolvedSource =
 	| { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string };
 export interface ResolvedLocalEntry {
+	/** Absolute filesystem path (post-realpath). */
 	absPath: string;
-	/** Path relative to the base; used to derive a default logical_path. */
-	relPath: string;
+	/**
+	 * Path relative to the walk base. Used when the caller passes an
+	 * explicit `logical_path` *prefix* (directory/glob mode) — entries land
+	 * at `{prefix}/{relPathFromBase}`. For default logical_paths we use
+	 * `absPath` directly so paths from different filesystems don't collide.
+	 */
+	relPathFromBase: string;
 }
 export interface ResolveOptions {
@@ -43,10 +49,12 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
 	}
 	const followSymlinks = options.followSymlinks !== false;
-	const includeMatchers = (options.include ?? "**/*")
-		.split(",")
-		.map((g) => g.trim())
-		.filter(Boolean);
+	const userIncludes = options.include
+		? options.include
+				.split(",")
+				.map((g) => g.trim())
+				.filter(Boolean)
+		: [];
 	const excludeMatchers = [
 		...DEFAULT_EXCLUDES,
 		...(options.exclude ?? "")
@@ -57,9 +65,14 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
 	if (isGlob(source)) {
 		const base = globBase(source);
+		const remainder = globRemainder(source);
 		try {
 			const realBase = await realpath(base);
-			return walk(realBase, [source, ...includeMatchers], excludeMatchers, followSymlinks);
+			// Source glob acts as a hard filter; user includes (if any) further
+			// narrow the result via AND. Pass them as a separate matcher so the
+			// two sets aren't picomatch-OR'd together.
+			const extraIncludes = userIncludes.length > 0 ? [userIncludes] : [];
+			return walk(realBase, [remainder], excludeMatchers, followSymlinks, extraIncludes);
 		} catch (err) {
 			throw asHelpful(
 				err,
@@ -84,16 +97,18 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
 	}
 	if (st.isFile()) {
+		const real = await realpath(abs);
 		return {
 			kind: "local-files",
-			basePath: abs,
-			entries: [{ absPath: abs, relPath: source.split(sep).pop() ?? source }],
+			basePath: real,
+			entries: [{ absPath: real, relPathFromBase: real.split(sep).pop() ?? real }],
 		};
 	}
 	if (st.isDirectory()) {
 		const realBase = await realpath(abs);
-		return walk(realBase, includeMatchers, excludeMatchers, followSymlinks);
+		const dirIncludes = userIncludes.length > 0 ? userIncludes : ["**/*"];
+		return walk(realBase, dirIncludes, excludeMatchers, followSymlinks);
 	}
 	throw new HelpfulError({
@@ -120,22 +135,40 @@ export function globBase(glob: string): string {
 	return base.length === 0 || !isAbsolute(base) ? resolve(base || ".") : base;
 }
+/**
+ * Take the wildcard portion of a glob — everything from the first segment
+ * containing a wildcard onward. We strip the static prefix so the matcher
+ * runs against entry paths relative to `globBase`. Without this, a glob like
+ * `docs/star-star/star.md` never matches anything under base=`docs/`, since
+ * walk() exposes `sub/file.md` to picomatch, not `docs/sub/file.md`.
+ */
+export function globRemainder(glob: string): string {
+	const parts = glob.split(sep);
+	const wildcardIdx = parts.findIndex((p) => /[*?[\]{}!]/.test(p));
+	if (wildcardIdx === -1) return glob;
+	return parts.slice(wildcardIdx).join(sep);
+}
 /**
  * Recursively walk `base`, returning files matched by `includes` and not
  * matched by `excludes`. Both globsets match against the entry's path
  * relative to `base`. Symlinks are followed when `followSymlinks` is true,
- * with cycles detected via a realpath cache.
+ * with cycles detected via a realpath cache. `extraIncludeSets` is a list
+ * of additional include groups, each ANDed onto the primary `includes` —
+ * use it when two filters must both match (e.g. source glob + --include).
  */
 async function walk(
 	base: string,
 	includes: string[],
 	excludes: string[],
 	followSymlinks: boolean,
+	extraIncludeSets: string[][] = [],
 ): Promise<ResolvedSource> {
 	const seen = new Set<string>();
 	const entries: ResolvedLocalEntry[] = [];
 	const isInclude = picomatch(includes, { dot: false, nocase: false });
+	const extraMatchers = extraIncludeSets.map((set) => picomatch(set, { dot: false, nocase: false }));
 	const isExclude = excludes.length ? picomatch(excludes, { dot: false }) : null;
 	const queue: string[] = [base];
@@ -174,7 +207,8 @@ async function walk(
 		const relForMatch = rel.length === 0 ? (cur.split(sep).pop() ?? cur) : rel;
 		if (isExclude?.(relForMatch)) continue;
 		if (!isInclude(relForMatch)) continue;
-		entries.push({ absPath: real, relPath: relForMatch });
+		if (extraMatchers.some((m) => !m(relForMatch))) continue;
+		entries.push({ absPath: real, relPathFromBase: relForMatch });
 	}
 	return { kind: "local-files", basePath: base, entries };

package/src/operations/add.ts CHANGED Viewed

@@ -14,11 +14,23 @@ export const addOperation = defineOperation({
   - a glob pattern (e.g. "docs/**/*.md")
   - a URL (fetched via mcpx if configured, otherwise plain HTTP)
   - "inline:<text>" literal
-PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. Each ingested file becomes a NEW version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.`,
+PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. By default, re-ingesting an unchanged source (same source_sha256 as the current version) is a no-op and reports \`status: "unchanged"\`; pass \`force=true\` to always create a new version. Each newly-ingested file becomes a new version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.
+When \`logical_path\` is omitted, it is derived from the source so files with the same basename in different projects do not collide:
+  - Local sources use the entry's absolute filesystem path with the leading "/" stripped (e.g. "/Users/me/projA/README.md" → "Users/me/projA/README.md").
+  - URLs use "remotes/{host}/{path}" with slashes preserved (e.g. "https://github.com/u/p/blob/main/README.md" → "remotes/github.com/u/p/blob/main/README.md"). Query strings and fragments are dropped from the logical_path; the full URL is still stored on the row for refresh.
+  - "inline:<text>" defaults to "inline/{timestamp}.md".
+Pass \`logical_path\` to override. For a directory or glob walk it is treated as a PREFIX — each entry is placed at "{prefix}/{path-relative-to-walk-base}". Re-running \`membot_add\` on the same source resolves to the same logical_path; if bytes are unchanged the call is a no-op (status \`unchanged\`), otherwise a new version is created.`,
 	inputSchema: z.object({
 		source: z.string().describe("Local path, directory, glob, URL, or `inline:<text>` literal"),
 		logical_path: z.string().optional().describe("Destination logical_path (single source) or prefix (directory/glob)"),
-		include: z.string().optional().describe("Glob include filter (comma-separated for multiple); default `**/*`"),
+		include: z
+			.string()
+			.optional()
+			.describe(
+				"Glob include filter (comma-separated for multiple). Defaults to `**/*` for directory sources, or the source pattern itself when source is a glob.",
+			),
 		exclude: z.string().optional().describe("Glob exclude filter (comma-separated for multiple)"),
 		follow_symlinks: z
 			.boolean()
@@ -30,6 +42,10 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
 			.optional()
 			.describe("Free-form hint passed to mcpx tool search (e.g. 'firecrawl', 'github', 'google docs', 'http')"),
 		change_note: z.string().optional().describe("Free-text note attached to the new version"),
+		force: z
+			.boolean()
+			.optional()
+			.describe("Re-ingest even when source bytes are unchanged. Default skips and reports `unchanged`."),
 	}),
 	outputSchema: z.object({
 		ingested: z.array(
@@ -37,7 +53,7 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
 				source_path: z.string(),
 				logical_path: z.string(),
 				version_id: z.string().nullable(),
-				status: z.enum(["ok", "failed"]),
+				status: z.enum(["ok", "unchanged", "failed"]),
 				error: z.string().optional(),
 				mime_type: z.string().nullable(),
 				size_bytes: z.number(),
@@ -47,23 +63,27 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
 		),
 		total: z.number(),
 		ok: z.number(),
+		unchanged: z.number(),
 		failed: z.number(),
 	}),
 	cli: {
 		positional: ["source"],
-		aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m" },
+		aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m", force: "-f" },
 	},
 	console_formatter: (result) => {
 		const lines = result.ingested.map((e) => {
 			if (e.status === "ok") {
 				return `${colors.green("✓")} ${colors.cyan(e.logical_path)} ${colors.dim(`(${e.fetcher}, ${e.size_bytes}B)`)}`;
 			}
+			if (e.status === "unchanged") {
+				return `${colors.dim("≡")} ${colors.cyan(e.logical_path)} ${colors.dim("(unchanged)")}`;
+			}
 			return `${colors.red("✗")} ${e.source_path} ${colors.dim(e.error ?? "")}`;
 		});
-		const summary = result.failed
-			? `${colors.green(`added ${result.ok}`)}, ${colors.red(`failed ${result.failed}`)}`
-			: colors.green(`added ${result.ok}`);
-		return `${lines.join("\n")}\n${summary}`;
+		const parts: string[] = [colors.green(`added ${result.ok}`)];
+		if (result.unchanged > 0) parts.push(colors.dim(`unchanged ${result.unchanged}`));
+		if (result.failed > 0) parts.push(colors.red(`failed ${result.failed}`));
+		return `${lines.join("\n")}\n${parts.join(", ")}`;
 	},
 	handler: async (input, ctx) => ingest(input, ctx),
 });

package/src/operations/tree.ts CHANGED Viewed

@@ -47,42 +47,47 @@ export const treeOperation = defineOperation({
 /**
  * Build a tree of TreeNode objects from a flat list of `/`-delimited paths.
- * Splits each path into segments and groups by common prefix; nodes deeper
- * than `maxDepth` are folded into their parent's `children` summary count.
+ * Splits each path into segments and groups by common prefix. Segments
+ * deeper than `maxDepth` are folded into the deepest visible ancestor —
+ * that ancestor is marked `is_file=true` so the renderer surfaces it as a
+ * leaf even though longer paths exist underneath.
  */
 function buildTree(paths: string[], maxDepth: number): TreeNode[] {
-	const root: Map<string, TreeNode> = new Map();
+	interface MutableNode {
+		name: string;
+		full_path: string;
+		is_file: boolean;
+		children: Map<string, MutableNode>;
+	}
+	const root = new Map<string, MutableNode>();
 	for (const path of paths) {
 		const segs = path.split("/").filter(Boolean);
+		if (segs.length === 0) continue;
 		let level = root;
 		const trail: string[] = [];
-		for (let i = 0; i < segs.length && i < maxDepth; i++) {
+		const stop = Math.min(segs.length, maxDepth);
+		for (let i = 0; i < stop; i++) {
 			const seg = segs[i]!;
 			trail.push(seg);
-			const fullPath = trail.join("/");
 			let node = level.get(seg);
 			if (!node) {
-				node = { name: seg, full_path: fullPath, is_file: i === segs.length - 1 };
+				node = { name: seg, full_path: trail.join("/"), is_file: false, children: new Map() };
 				level.set(seg, node);
-			} else if (i === segs.length - 1) {
-				node.is_file = true;
-			}
-			if (i < segs.length - 1) {
-				if (!node.children) node.children = [];
-				const childMap = new Map(node.children.map((c) => [c.name, c] as const));
-				node.children = [...childMap.values()];
-				level = childMap;
-				if (childMap.size === 0) {
-					level = new Map();
-					node.children = [];
-				} else {
-					// rebuild level pointer
-					level = new Map(node.children.map((c) => [c.name, c] as const));
-				}
 			}
+			const isTerminal = i === segs.length - 1 || i === maxDepth - 1;
+			if (isTerminal) node.is_file = true;
+			level = node.children;
 		}
 	}
-	return [...root.values()].sort((a, b) => a.name.localeCompare(b.name));
+	const finalize = (m: Map<string, MutableNode>): TreeNode[] => {
+		const arr = [...m.values()].sort((a, b) => a.name.localeCompare(b.name));
+		return arr.map((n) => {
+			const out: TreeNode = { name: n.name, full_path: n.full_path, is_file: n.is_file };
+			if (n.children.size > 0) out.children = finalize(n.children);
+			return out;
+		});
+	};
+	return finalize(root);
 }
 /**

package/scripts/apply-transformers-patch.sh DELETED Viewed

@@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-# Apply the @huggingface/transformers patch to node_modules so that
-# `bun build --compile` produces a binary using the WASM backend
-# (onnxruntime-web) instead of onnxruntime-node, whose native bindings
-# can't be bundled into a single-binary distribution.
-#
-# We apply the patch imperatively (rather than via package.json
-# `patchedDependencies`) because that field, when present in a
-# published package, breaks `bun install` from a tarball.
-PATCH="patches/@huggingface%2Ftransformers@4.2.0.patch"
-TARGET="node_modules/@huggingface/transformers"
-MARKER="$TARGET/.membot-transformers-patch-applied"
-if [ ! -d "$TARGET" ]; then
-	echo "error: $TARGET not found — run \`bun install\` first" >&2
-	exit 1
-fi
-if [ ! -f "$PATCH" ]; then
-	echo "error: $PATCH not found" >&2
-	exit 1
-fi
-if [ -f "$MARKER" ]; then
-	echo "transformers patch already applied — skipping"
-	exit 0
-fi
-echo "Applying transformers patch ($PATCH) to $TARGET..."
-git apply --directory="$TARGET" "$PATCH"
-touch "$MARKER"
-echo "Patch applied."