membot 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +11 -2
- package/.cursor/rules/membot.mdc +11 -2
- package/README.md +4 -9
- package/package.json +2 -2
- package/patches/@evantahler%2Fmcpx@0.21.4.patch +44 -0
- package/scripts/apply-patches.sh +49 -0
- package/src/ingest/ingest.ts +68 -18
- package/src/ingest/source-resolver.ts +46 -12
- package/src/operations/add.ts +28 -8
- package/src/operations/tree.ts +27 -22
- package/scripts/apply-transformers-patch.sh +0 -35
package/.claude/skills/membot.md
CHANGED
|
@@ -36,6 +36,15 @@ membot add ./docs --refresh-frequency 24h # auto-refresh every day
|
|
|
36
36
|
|
|
37
37
|
Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
|
|
38
38
|
|
|
39
|
+
The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
|
|
40
|
+
|
|
41
|
+
- Local file → absolute path with leading `/` stripped (e.g. `/Users/me/projA/README.md` → `Users/me/projA/README.md`).
|
|
42
|
+
- Local directory or glob → each entry's absolute path under the same shape.
|
|
43
|
+
- URL → `remotes/{host}/{path}` with `/`'s preserved (e.g. `https://github.com/userA/projA/blob/main/README.md` → `remotes/github.com/userA/projA/blob/main/README.md`). Query strings and fragments are dropped from the logical_path (the full URL is still stored for refresh).
|
|
44
|
+
- `inline:<text>` → `inline/{timestamp}.md`.
|
|
45
|
+
|
|
46
|
+
Pass `-p <path>` (or `--logical-path`) to override. On a directory walk it's treated as a *prefix* — entries land at `{prefix}/{path-relative-to-walk-base}`. Re-running `membot add` on the same source resolves to the same logical_path: if the source bytes changed, a new version is created; if they are unchanged, the call is a no-op (status `unchanged`) unless `--force` is passed.
|
|
47
|
+
|
|
39
48
|
## 3. Read
|
|
40
49
|
|
|
41
50
|
```bash
|
|
@@ -75,7 +84,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
75
84
|
|
|
76
85
|
- Defaults always operate on the current, non-tombstoned version.
|
|
77
86
|
- Pass an explicit `--version <timestamp>` (from `membot versions`) to read or diff history.
|
|
78
|
-
- `membot_add
|
|
87
|
+
- `membot_add` (when source bytes have changed), refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved. Re-running `membot_add` against an unchanged source is a no-op (status `unchanged`, same `version_id`); pass `force=true` to force a new version.
|
|
79
88
|
- Mutating an existing version is not possible — corrections are new versions.
|
|
80
89
|
|
|
81
90
|
## When to use this skill
|
|
@@ -99,7 +108,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
99
108
|
|
|
100
109
|
| Command | Purpose |
|
|
101
110
|
| ------------------------------------- | ------------------------------------------------------------------------------ |
|
|
102
|
-
| `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>` |
|
|
111
|
+
| `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>`. Skips unchanged sources; pass `--force` to re-ingest |
|
|
103
112
|
| `membot ls [prefix]` | List current files (size, mime, refresh status) |
|
|
104
113
|
| `membot tree [prefix]` | Render the synthesised logical-path tree |
|
|
105
114
|
| `membot read <path>` | Read current markdown surrogate (or `--bytes` for original) |
|
package/.cursor/rules/membot.mdc
CHANGED
|
@@ -36,6 +36,15 @@ membot add ./docs --refresh-frequency 24h # auto-refresh every day
|
|
|
36
36
|
|
|
37
37
|
Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
|
|
38
38
|
|
|
39
|
+
The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
|
|
40
|
+
|
|
41
|
+
- Local file → absolute path with leading `/` stripped (e.g. `/Users/me/projA/README.md` → `Users/me/projA/README.md`).
|
|
42
|
+
- Local directory or glob → each entry's absolute path under the same shape.
|
|
43
|
+
- URL → `remotes/{host}/{path}` with `/`'s preserved (e.g. `https://github.com/userA/projA/blob/main/README.md` → `remotes/github.com/userA/projA/blob/main/README.md`). Query strings and fragments are dropped from the logical_path (the full URL is still stored for refresh).
|
|
44
|
+
- `inline:<text>` → `inline/{timestamp}.md`.
|
|
45
|
+
|
|
46
|
+
Pass `-p <path>` (or `--logical-path`) to override. On a directory walk it's treated as a *prefix* — entries land at `{prefix}/{path-relative-to-walk-base}`. Re-running `membot add` on the same source resolves to the same logical_path: if the source bytes changed, a new version is created; if they are unchanged, the call is a no-op (status `unchanged`) unless `--force` is passed.
|
|
47
|
+
|
|
39
48
|
## 3. Read
|
|
40
49
|
|
|
41
50
|
```bash
|
|
@@ -75,7 +84,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
75
84
|
|
|
76
85
|
- Defaults always operate on the current, non-tombstoned version.
|
|
77
86
|
- Pass an explicit `--version <timestamp>` (from `membot versions`) to read or diff history.
|
|
78
|
-
- `membot_add
|
|
87
|
+
- `membot_add` (when source bytes have changed), refresh-with-changes, `write`, and `mv` each create a new version. The previous version is preserved. Re-running `membot_add` against an unchanged source is a no-op (status `unchanged`, same `version_id`); pass `force=true` to force a new version.
|
|
79
88
|
- Mutating an existing version is not possible — corrections are new versions.
|
|
80
89
|
|
|
81
90
|
## When to use this rule
|
|
@@ -99,7 +108,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
99
108
|
|
|
100
109
|
| Command | Purpose |
|
|
101
110
|
| ------------------------------------- | ------------------------------------------------------------------------------ |
|
|
102
|
-
| `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>` |
|
|
111
|
+
| `membot add <source>` | Ingest file, directory, glob, URL, or `inline:<text>`. Skips unchanged sources; pass `--force` to re-ingest |
|
|
103
112
|
| `membot ls [prefix]` | List current files (size, mime, refresh status) |
|
|
104
113
|
| `membot tree [prefix]` | Render the synthesised logical-path tree |
|
|
105
114
|
| `membot read <path>` | Read current markdown surrogate (or `--bytes` for original) |
|
package/README.md
CHANGED
|
@@ -15,18 +15,13 @@
|
|
|
15
15
|
## Install
|
|
16
16
|
|
|
17
17
|
```bash
|
|
18
|
-
|
|
19
|
-
curl -fsSL https://raw.githubusercontent.com/evantahler/membot/main/install.sh | bash
|
|
20
|
-
|
|
21
|
-
# Windows — PowerShell
|
|
22
|
-
iwr -useb https://raw.githubusercontent.com/evantahler/membot/main/install.ps1 | iex
|
|
23
|
-
|
|
24
|
-
# From npm (requires Bun or Node)
|
|
25
|
-
bun add -g membot
|
|
18
|
+
bun install -g membot
|
|
26
19
|
# or
|
|
27
20
|
npm install -g membot
|
|
28
21
|
```
|
|
29
22
|
|
|
23
|
+
This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global npm/bun install is the supported path.
|
|
24
|
+
|
|
30
25
|
## Quick start
|
|
31
26
|
|
|
32
27
|
```bash
|
|
@@ -55,7 +50,7 @@ The skill files describe the discover → ingest → search → read → write w
|
|
|
55
50
|
|
|
56
51
|
| Command | Description |
|
|
57
52
|
| ------------------------------- | --------------------------------------------------------------------------------- |
|
|
58
|
-
| `membot add <source>` | Ingest a file, directory, glob, URL, or `inline:<text>`.
|
|
53
|
+
| `membot add <source>` | Ingest a file, directory, glob, URL, or `inline:<text>`. Default `logical_path` mirrors the source (absolute path for local files, `remotes/{host}/{path}` for URLs) so files with the same basename in different projects don't collide. Pass `-p <path>` to override or, on a directory walk, to set a prefix. Skips on unchanged source bytes; pass `--force` to re-ingest. |
|
|
59
54
|
| `membot ls [prefix]` | List current files (size, mime, refresh status) |
|
|
60
55
|
| `membot tree [prefix]` | Render the synthesised logical-path tree |
|
|
61
56
|
| `membot read <path>` | Read the markdown surrogate (or `--bytes` for original bytes, base64) |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "membot",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
"test": "bun test",
|
|
27
27
|
"lint": "biome ci . && tsc --noEmit",
|
|
28
28
|
"format": "biome check --write .",
|
|
29
|
-
"prebuild": "bash scripts/apply-transformers-patch.sh",
|
|
29
|
+
"prebuild": "bash scripts/apply-patches.sh",
|
|
30
30
|
"build": "bun build --compile --minify --sourcemap --external '@duckdb/*' ./src/cli.ts --outfile dist/membot"
|
|
31
31
|
},
|
|
32
32
|
"keywords": [
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
diff --git a/src/search/onnx-wasm-paths.ts b/src/search/onnx-wasm-paths.ts
|
|
2
|
+
--- a/src/search/onnx-wasm-paths.ts
|
|
3
|
+
+++ b/src/search/onnx-wasm-paths.ts
|
|
4
|
+
@@ -1,31 +1,9 @@
|
|
5
|
+
-// Embed the onnxruntime-web WASM runtime files into the compiled binary
|
|
6
|
+
-// (`bun build --compile`) so they survive in a single-binary distribution
|
|
7
|
+
-// where the user has no node_modules.
|
|
8
|
+
-//
|
|
9
|
+
-// This file is loaded **dynamically** by semantic.ts. The relative paths
|
|
10
|
+
-// only resolve in the local repo / compiled binary; for npm/bun-installed
|
|
11
|
+
-// mcpx the parent directory layout is different (deps are hoisted), the
|
|
12
|
+
-// dynamic import throws, and we fall back to letting transformers.js
|
|
13
|
+
-// load WASM via its default mechanism — which works fine because in
|
|
14
|
+
-// that environment node_modules exists and onnxruntime-web is reachable
|
|
15
|
+
-// through normal module resolution.
|
|
16
|
+
-
|
|
17
|
+
-// The relative `../../node_modules/...` paths only resolve from the local repo
|
|
18
|
+
-// layout (and inside `bun build --compile`). When this file is shipped via npm,
|
|
19
|
+
-// deps are hoisted, so consumer `tsc` runs hit TS2307. The `ts-ignore` directive
|
|
20
|
+
-// below silences that for consumers; we avoid the stricter `expect-error` form
|
|
21
|
+
-// because in the local repo the path resolves fine and there would be no error
|
|
22
|
+
-// to expect. At runtime the dynamic import in semantic.ts is wrapped in
|
|
23
|
+
-// try/catch and falls back to transformers.js's default WASM loader (issue #85).
|
|
24
|
+
-// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore per comment above
|
|
25
|
+
-// @ts-ignore - dynamic-only import
|
|
26
|
+
-import wasmMjsPath from "../../node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.mjs" with {
|
|
27
|
+
- type: "file",
|
|
28
|
+
-};
|
|
29
|
+
-// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore per comment above
|
|
30
|
+
-// @ts-ignore - dynamic-only import
|
|
31
|
+
-import wasmBinPath from "../../node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.wasm" with {
|
|
32
|
+
- type: "file",
|
|
33
|
+
-};
|
|
34
|
+
-
|
|
35
|
+
-export { wasmBinPath, wasmMjsPath };
|
|
36
|
+
+// PATCHED (membot): upstream mcpx ships static `with { type: "file" }` imports
|
|
37
|
+
+// of onnxruntime-web WASM assets via `../../node_modules/...`, which only
|
|
38
|
+
+// resolves when mcpx is built standalone. When consumed as an npm dep those
|
|
39
|
+
+// paths are unreachable and `bun build --compile` fails at build time. membot
|
|
40
|
+
+// never invokes mcpx's semantic search (only `mcpx.exec()` for URL fetching),
|
|
41
|
+
+// so we stub the exports — semantic.ts wraps the dynamic import in try/catch
|
|
42
|
+
+// and falls back to transformers.js's default WASM loader.
|
|
43
|
+
+export const wasmMjsPath = "";
|
|
44
|
+
+export const wasmBinPath = "";
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
# Apply node_modules patches imperatively. We don't use package.json's
|
|
5
|
+
# `patchedDependencies` field because that field, when present in a published
|
|
6
|
+
# package, breaks `bun install` from a tarball.
|
|
7
|
+
#
|
|
8
|
+
# Each patch is gated by a marker file inside its target so reruns are no-ops.
|
|
9
|
+
|
|
10
|
+
apply_patch() {
|
|
11
|
+
local patch="$1" target="$2" marker_name="$3"
|
|
12
|
+
local marker="$target/$marker_name"
|
|
13
|
+
|
|
14
|
+
if [ ! -d "$target" ]; then
|
|
15
|
+
echo "error: $target not found — run \`bun install\` first" >&2
|
|
16
|
+
exit 1
|
|
17
|
+
fi
|
|
18
|
+
if [ ! -f "$patch" ]; then
|
|
19
|
+
echo "error: $patch not found" >&2
|
|
20
|
+
exit 1
|
|
21
|
+
fi
|
|
22
|
+
if [ -f "$marker" ]; then
|
|
23
|
+
echo "patch $patch already applied — skipping"
|
|
24
|
+
return 0
|
|
25
|
+
fi
|
|
26
|
+
|
|
27
|
+
echo "Applying $patch to $target..."
|
|
28
|
+
git apply --directory="$target" "$patch"
|
|
29
|
+
touch "$marker"
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
# @huggingface/transformers — replace static `import 'onnxruntime-node'` with a
|
|
33
|
+
# stub so `bun build --compile` produces a binary using the WASM backend
|
|
34
|
+
# (onnxruntime-web) instead of onnxruntime-node, whose native bindings can't be
|
|
35
|
+
# bundled into a single-binary distribution.
|
|
36
|
+
apply_patch \
|
|
37
|
+
"patches/@huggingface%2Ftransformers@4.2.0.patch" \
|
|
38
|
+
"node_modules/@huggingface/transformers" \
|
|
39
|
+
".membot-transformers-patch-applied"
|
|
40
|
+
|
|
41
|
+
# @evantahler/mcpx — stub `src/search/onnx-wasm-paths.ts` whose static
|
|
42
|
+
# `with { type: "file" }` imports use a relative path that only resolves in
|
|
43
|
+
# mcpx's own repo layout. When mcpx is consumed as an npm dep those paths are
|
|
44
|
+
# unreachable and `bun build --compile` fails at build time. membot never
|
|
45
|
+
# invokes mcpx's semantic search, so the stubbed exports are safe.
|
|
46
|
+
apply_patch \
|
|
47
|
+
"patches/@evantahler%2Fmcpx@0.21.4.patch" \
|
|
48
|
+
"node_modules/@evantahler/mcpx" \
|
|
49
|
+
".membot-mcpx-patch-applied"
|
package/src/ingest/ingest.ts
CHANGED
|
@@ -21,13 +21,14 @@ export interface IngestInput {
|
|
|
21
21
|
refresh_frequency?: string;
|
|
22
22
|
fetcher_hint?: string;
|
|
23
23
|
change_note?: string;
|
|
24
|
+
force?: boolean;
|
|
24
25
|
}
|
|
25
26
|
|
|
26
27
|
export interface IngestEntryResult {
|
|
27
28
|
source_path: string;
|
|
28
29
|
logical_path: string;
|
|
29
30
|
version_id: string | null;
|
|
30
|
-
status: "ok" | "failed";
|
|
31
|
+
status: "ok" | "unchanged" | "failed";
|
|
31
32
|
error?: string;
|
|
32
33
|
mime_type: string | null;
|
|
33
34
|
size_bytes: number;
|
|
@@ -39,6 +40,7 @@ export interface IngestResult {
|
|
|
39
40
|
ingested: IngestEntryResult[];
|
|
40
41
|
total: number;
|
|
41
42
|
ok: number;
|
|
43
|
+
unchanged: number;
|
|
42
44
|
failed: number;
|
|
43
45
|
}
|
|
44
46
|
|
|
@@ -57,14 +59,15 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
|
|
|
57
59
|
});
|
|
58
60
|
|
|
59
61
|
const refreshSec = parseDuration(input.refresh_frequency);
|
|
62
|
+
const force = input.force === true;
|
|
60
63
|
|
|
61
64
|
if (resolved.kind === "inline") {
|
|
62
65
|
return ingestInline(resolved.text, input, ctx, refreshSec);
|
|
63
66
|
}
|
|
64
67
|
if (resolved.kind === "url") {
|
|
65
|
-
return ingestUrl(resolved.url, input, ctx, refreshSec);
|
|
68
|
+
return ingestUrl(resolved.url, input, ctx, refreshSec, force);
|
|
66
69
|
}
|
|
67
|
-
return ingestLocalFiles(resolved, input, ctx, refreshSec);
|
|
70
|
+
return ingestLocalFiles(resolved, input, ctx, refreshSec, force);
|
|
68
71
|
}
|
|
69
72
|
|
|
70
73
|
/** Ingest a single inline blob (source_type='inline'). */
|
|
@@ -119,6 +122,7 @@ async function ingestUrl(
|
|
|
119
122
|
input: IngestInput,
|
|
120
123
|
ctx: AppContext,
|
|
121
124
|
refreshSec: number | null,
|
|
125
|
+
force: boolean,
|
|
122
126
|
): Promise<IngestResult> {
|
|
123
127
|
const mcpxAdapter = ctx.mcpx
|
|
124
128
|
? {
|
|
@@ -151,6 +155,15 @@ async function ingestUrl(
|
|
|
151
155
|
result.fetcher = fetched.fetcher;
|
|
152
156
|
result.source_sha256 = fetched.sha256;
|
|
153
157
|
|
|
158
|
+
if (!force) {
|
|
159
|
+
const cur = await getCurrent(ctx.db, logicalPath);
|
|
160
|
+
if (cur && cur.source_sha256 === fetched.sha256) {
|
|
161
|
+
result.status = "unchanged";
|
|
162
|
+
result.version_id = cur.version_id;
|
|
163
|
+
return summarize([result]);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
154
167
|
const versionId = await pipelineForBytes(ctx, {
|
|
155
168
|
logicalPath,
|
|
156
169
|
bytes: fetched.bytes,
|
|
@@ -181,6 +194,7 @@ async function ingestLocalFiles(
|
|
|
181
194
|
input: IngestInput,
|
|
182
195
|
ctx: AppContext,
|
|
183
196
|
refreshSec: number | null,
|
|
197
|
+
force: boolean,
|
|
184
198
|
): Promise<IngestResult> {
|
|
185
199
|
if (resolved.entries.length === 0) {
|
|
186
200
|
throw new HelpfulError({
|
|
@@ -195,7 +209,7 @@ async function ingestLocalFiles(
|
|
|
195
209
|
const isMulti = resolved.entries.length > 1;
|
|
196
210
|
|
|
197
211
|
for (const entry of resolved.entries) {
|
|
198
|
-
ctx.progress.tick(entry.
|
|
212
|
+
ctx.progress.tick(entry.relPathFromBase);
|
|
199
213
|
const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
|
|
200
214
|
const result: IngestEntryResult = {
|
|
201
215
|
source_path: entry.absPath,
|
|
@@ -213,6 +227,16 @@ async function ingestLocalFiles(
|
|
|
213
227
|
result.size_bytes = local.sizeBytes;
|
|
214
228
|
result.source_sha256 = local.sha256;
|
|
215
229
|
|
|
230
|
+
if (!force) {
|
|
231
|
+
const cur = await getCurrent(ctx.db, logicalPath);
|
|
232
|
+
if (cur && cur.source_sha256 === local.sha256) {
|
|
233
|
+
result.status = "unchanged";
|
|
234
|
+
result.version_id = cur.version_id;
|
|
235
|
+
results.push(result);
|
|
236
|
+
continue;
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
216
240
|
const versionId = await pipelineForBytes(ctx, {
|
|
217
241
|
logicalPath,
|
|
218
242
|
bytes: local.bytes,
|
|
@@ -236,7 +260,10 @@ async function ingestLocalFiles(
|
|
|
236
260
|
}
|
|
237
261
|
results.push(result);
|
|
238
262
|
}
|
|
239
|
-
|
|
263
|
+
const okCount = results.filter((r) => r.status === "ok").length;
|
|
264
|
+
const unchangedCount = results.filter((r) => r.status === "unchanged").length;
|
|
265
|
+
const suffix = unchangedCount > 0 ? ` (${unchangedCount} unchanged)` : "";
|
|
266
|
+
ctx.progress.done(`ingested ${okCount}/${results.length}${suffix}`);
|
|
240
267
|
|
|
241
268
|
return summarize(results);
|
|
242
269
|
}
|
|
@@ -377,26 +404,47 @@ async function persistVersion(ctx: AppContext, p: PersistParams): Promise<string
|
|
|
377
404
|
}
|
|
378
405
|
|
|
379
406
|
/**
|
|
380
|
-
* Pick the logical path for a single matched entry.
|
|
381
|
-
*
|
|
382
|
-
*
|
|
383
|
-
*
|
|
407
|
+
* Pick the logical path for a single matched entry.
|
|
408
|
+
*
|
|
409
|
+
* - Default (no explicit logical_path): use the entry's absolute filesystem
|
|
410
|
+
* path with `\` normalized to `/` and the leading `/` stripped. This
|
|
411
|
+
* keeps `~/projA/README.md` and `~/projB/README.md` from colliding under
|
|
412
|
+
* a shared `README.md`. Two adds of the same absolute path produce the
|
|
413
|
+
* same logical_path, so the second add versions that path (or is skipped as `unchanged` when the source bytes match and `force` is not set).
|
|
414
|
+
* - Single-source with explicit logical_path: use it verbatim.
|
|
415
|
+
* - Multi-entry (directory/glob) with explicit logical_path: treat as a
|
|
416
|
+
* prefix and append each entry's path relative to the walk base.
|
|
384
417
|
*/
|
|
385
|
-
function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
|
|
386
|
-
if (!explicit) return entry.
|
|
418
|
+
export function pickLogicalPath(explicit: string | undefined, entry: ResolvedLocalEntry, isMulti: boolean): string {
|
|
419
|
+
if (!explicit) return normalizeAbs(entry.absPath);
|
|
387
420
|
if (!isMulti) return explicit;
|
|
388
421
|
const prefix = explicit.endsWith("/") ? explicit.slice(0, -1) : explicit;
|
|
389
|
-
return `${prefix}/${entry.
|
|
422
|
+
return `${prefix}/${entry.relPathFromBase.replaceAll("\\", "/")}`;
|
|
390
423
|
}
|
|
391
424
|
|
|
392
|
-
/**
|
|
393
|
-
|
|
425
|
+
/**
|
|
426
|
+
* Normalize an absolute filesystem path into a logical_path:
|
|
427
|
+
* `\` → `/`, leading `/` stripped. Drive letters (Windows `C:`) are kept
|
|
428
|
+
* as the first path segment.
|
|
429
|
+
*/
|
|
430
|
+
export function normalizeAbs(absPath: string): string {
|
|
431
|
+
return absPath.replaceAll("\\", "/").replace(/^\/+/, "");
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
/**
|
|
435
|
+
* Default logical path for an ingested URL: `remotes/{host}/{pathname}`
|
|
436
|
+
* with slashes preserved so two projects on the same host (e.g.,
|
|
437
|
+
* github.com) don't collide. Query string and fragment are dropped from
|
|
438
|
+
* the logical_path for stable identity — the full URL is still preserved
|
|
439
|
+
* on the row in `source_path` and used for refresh.
|
|
440
|
+
*/
|
|
441
|
+
export function defaultLogicalForUrl(url: string): string {
|
|
394
442
|
try {
|
|
395
443
|
const u = new URL(url);
|
|
396
|
-
const tail = u.pathname.replace(/^\/+/, "").
|
|
397
|
-
return `
|
|
444
|
+
const tail = u.pathname.replace(/^\/+/, "").replace(/\/+$/, "") || "index";
|
|
445
|
+
return `remotes/${u.hostname}/${tail}`;
|
|
398
446
|
} catch {
|
|
399
|
-
return `
|
|
447
|
+
return `remotes/${url.replace(/[^a-z0-9.-]/gi, "_")}`;
|
|
400
448
|
}
|
|
401
449
|
}
|
|
402
450
|
|
|
@@ -428,12 +476,14 @@ export function parseDuration(input: string | null | undefined): number | null {
|
|
|
428
476
|
/** Roll a list of per-entry results into the top-level summary shape. */
|
|
429
477
|
function summarize(entries: IngestEntryResult[]): IngestResult {
|
|
430
478
|
let ok = 0;
|
|
479
|
+
let unchanged = 0;
|
|
431
480
|
let failed = 0;
|
|
432
481
|
for (const e of entries) {
|
|
433
482
|
if (e.status === "ok") ok += 1;
|
|
483
|
+
else if (e.status === "unchanged") unchanged += 1;
|
|
434
484
|
else failed += 1;
|
|
435
485
|
}
|
|
436
|
-
return { ingested: entries, total: entries.length, ok, failed };
|
|
486
|
+
return { ingested: entries, total: entries.length, ok, unchanged, failed };
|
|
437
487
|
}
|
|
438
488
|
|
|
439
489
|
function errorMessage(err: unknown): string {
|
|
@@ -9,9 +9,15 @@ export type ResolvedSource =
|
|
|
9
9
|
| { kind: "local-files"; entries: ResolvedLocalEntry[]; basePath: string };
|
|
10
10
|
|
|
11
11
|
export interface ResolvedLocalEntry {
|
|
12
|
+
/** Absolute filesystem path (post-realpath). */
|
|
12
13
|
absPath: string;
|
|
13
|
-
/**
|
|
14
|
-
|
|
14
|
+
/**
|
|
15
|
+
* Path relative to the walk base. Used when the caller passes an
|
|
16
|
+
* explicit `logical_path` *prefix* (directory/glob mode) — entries land
|
|
17
|
+
* at `{prefix}/{relPathFromBase}`. For default logical_paths we use
|
|
18
|
+
* `absPath` directly so same-named files in different directories don't collide.
|
|
19
|
+
*/
|
|
20
|
+
relPathFromBase: string;
|
|
15
21
|
}
|
|
16
22
|
|
|
17
23
|
export interface ResolveOptions {
|
|
@@ -43,10 +49,12 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
|
|
|
43
49
|
}
|
|
44
50
|
|
|
45
51
|
const followSymlinks = options.followSymlinks !== false;
|
|
46
|
-
const
|
|
47
|
-
.
|
|
48
|
-
|
|
49
|
-
|
|
52
|
+
const userIncludes = options.include
|
|
53
|
+
? options.include
|
|
54
|
+
.split(",")
|
|
55
|
+
.map((g) => g.trim())
|
|
56
|
+
.filter(Boolean)
|
|
57
|
+
: [];
|
|
50
58
|
const excludeMatchers = [
|
|
51
59
|
...DEFAULT_EXCLUDES,
|
|
52
60
|
...(options.exclude ?? "")
|
|
@@ -57,9 +65,14 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
|
|
|
57
65
|
|
|
58
66
|
if (isGlob(source)) {
|
|
59
67
|
const base = globBase(source);
|
|
68
|
+
const remainder = globRemainder(source);
|
|
60
69
|
try {
|
|
61
70
|
const realBase = await realpath(base);
|
|
62
|
-
|
|
71
|
+
// Source glob acts as a hard filter; user includes (if any) further
|
|
72
|
+
// narrow the result via AND. Pass them as a separate matcher so the
|
|
73
|
+
// two sets aren't picomatch-OR'd together.
|
|
74
|
+
const extraIncludes = userIncludes.length > 0 ? [userIncludes] : [];
|
|
75
|
+
return walk(realBase, [remainder], excludeMatchers, followSymlinks, extraIncludes);
|
|
63
76
|
} catch (err) {
|
|
64
77
|
throw asHelpful(
|
|
65
78
|
err,
|
|
@@ -84,16 +97,18 @@ export async function resolveSource(source: string, options: ResolveOptions = {}
|
|
|
84
97
|
}
|
|
85
98
|
|
|
86
99
|
if (st.isFile()) {
|
|
100
|
+
const real = await realpath(abs);
|
|
87
101
|
return {
|
|
88
102
|
kind: "local-files",
|
|
89
|
-
basePath:
|
|
90
|
-
entries: [{ absPath:
|
|
103
|
+
basePath: real,
|
|
104
|
+
entries: [{ absPath: real, relPathFromBase: real.split(sep).pop() ?? real }],
|
|
91
105
|
};
|
|
92
106
|
}
|
|
93
107
|
|
|
94
108
|
if (st.isDirectory()) {
|
|
95
109
|
const realBase = await realpath(abs);
|
|
96
|
-
|
|
110
|
+
const dirIncludes = userIncludes.length > 0 ? userIncludes : ["**/*"];
|
|
111
|
+
return walk(realBase, dirIncludes, excludeMatchers, followSymlinks);
|
|
97
112
|
}
|
|
98
113
|
|
|
99
114
|
throw new HelpfulError({
|
|
@@ -120,22 +135,40 @@ export function globBase(glob: string): string {
|
|
|
120
135
|
return base.length === 0 || !isAbsolute(base) ? resolve(base || ".") : base;
|
|
121
136
|
}
|
|
122
137
|
|
|
138
|
+
/**
|
|
139
|
+
* Take the wildcard portion of a glob — everything from the first segment
|
|
140
|
+
* containing a wildcard onward. We strip the static prefix so the matcher
|
|
141
|
+
* runs against entry paths relative to `globBase`. Without this, a glob like
|
|
142
|
+
* `docs/star-star/star.md` never matches anything under base=`docs/`, since
|
|
143
|
+
* walk() exposes `sub/file.md` to picomatch, not `docs/sub/file.md`.
|
|
144
|
+
*/
|
|
145
|
+
export function globRemainder(glob: string): string {
|
|
146
|
+
const parts = glob.split(sep);
|
|
147
|
+
const wildcardIdx = parts.findIndex((p) => /[*?[\]{}!]/.test(p));
|
|
148
|
+
if (wildcardIdx === -1) return glob;
|
|
149
|
+
return parts.slice(wildcardIdx).join(sep);
|
|
150
|
+
}
|
|
151
|
+
|
|
123
152
|
/**
|
|
124
153
|
* Recursively walk `base`, returning files matched by `includes` and not
|
|
125
154
|
* matched by `excludes`. Both globsets match against the entry's path
|
|
126
155
|
* relative to `base`. Symlinks are followed when `followSymlinks` is true,
|
|
127
|
-
* with cycles detected via a realpath cache.
|
|
156
|
+
* with cycles detected via a realpath cache. `extraIncludeSets` is a list
|
|
157
|
+
* of additional include groups, each ANDed onto the primary `includes` —
|
|
158
|
+
* use it when two filters must both match (e.g. source glob + --include).
|
|
128
159
|
*/
|
|
129
160
|
async function walk(
|
|
130
161
|
base: string,
|
|
131
162
|
includes: string[],
|
|
132
163
|
excludes: string[],
|
|
133
164
|
followSymlinks: boolean,
|
|
165
|
+
extraIncludeSets: string[][] = [],
|
|
134
166
|
): Promise<ResolvedSource> {
|
|
135
167
|
const seen = new Set<string>();
|
|
136
168
|
const entries: ResolvedLocalEntry[] = [];
|
|
137
169
|
|
|
138
170
|
const isInclude = picomatch(includes, { dot: false, nocase: false });
|
|
171
|
+
const extraMatchers = extraIncludeSets.map((set) => picomatch(set, { dot: false, nocase: false }));
|
|
139
172
|
const isExclude = excludes.length ? picomatch(excludes, { dot: false }) : null;
|
|
140
173
|
|
|
141
174
|
const queue: string[] = [base];
|
|
@@ -174,7 +207,8 @@ async function walk(
|
|
|
174
207
|
const relForMatch = rel.length === 0 ? (cur.split(sep).pop() ?? cur) : rel;
|
|
175
208
|
if (isExclude?.(relForMatch)) continue;
|
|
176
209
|
if (!isInclude(relForMatch)) continue;
|
|
177
|
-
|
|
210
|
+
if (extraMatchers.some((m) => !m(relForMatch))) continue;
|
|
211
|
+
entries.push({ absPath: real, relPathFromBase: relForMatch });
|
|
178
212
|
}
|
|
179
213
|
|
|
180
214
|
return { kind: "local-files", basePath: base, entries };
|
package/src/operations/add.ts
CHANGED
|
@@ -14,11 +14,23 @@ export const addOperation = defineOperation({
|
|
|
14
14
|
- a glob pattern (e.g. "docs/**/*.md")
|
|
15
15
|
- a URL (fetched via mcpx if configured, otherwise plain HTTP)
|
|
16
16
|
- "inline:<text>" literal
|
|
17
|
-
PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. Each ingested file becomes a
|
|
17
|
+
PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. By default, re-ingesting an unchanged source (same source_sha256 as the current version) is a no-op and reports \`status: "unchanged"\`; pass \`force=true\` to always create a new version. Each newly-ingested file becomes a new version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.
|
|
18
|
+
|
|
19
|
+
When \`logical_path\` is omitted, it is derived from the source so files with the same basename in different projects do not collide:
|
|
20
|
+
- Local sources use the entry's absolute filesystem path with the leading "/" stripped (e.g. "/Users/me/projA/README.md" → "Users/me/projA/README.md").
|
|
21
|
+
- URLs use "remotes/{host}/{path}" with slashes preserved (e.g. "https://github.com/u/p/blob/main/README.md" → "remotes/github.com/u/p/blob/main/README.md"). Query strings and fragments are dropped from the logical_path; the full URL is still stored on the row for refresh.
|
|
22
|
+
- "inline:<text>" defaults to "inline/{timestamp}.md".
|
|
23
|
+
|
|
24
|
+
Pass \`logical_path\` to override. For a directory or glob walk it is treated as a PREFIX — each entry is placed at "{prefix}/{path-relative-to-walk-base}". Re-running \`membot_add\` on the same source resolves to the same logical_path; if bytes are unchanged the call is a no-op (status \`unchanged\`), otherwise a new version is created.`,
|
|
18
25
|
inputSchema: z.object({
|
|
19
26
|
source: z.string().describe("Local path, directory, glob, URL, or `inline:<text>` literal"),
|
|
20
27
|
logical_path: z.string().optional().describe("Destination logical_path (single source) or prefix (directory/glob)"),
|
|
21
|
-
include: z
|
|
28
|
+
include: z
|
|
29
|
+
.string()
|
|
30
|
+
.optional()
|
|
31
|
+
.describe(
|
|
32
|
+
"Glob include filter (comma-separated for multiple). Defaults to `**/*` for directory sources, or the source pattern itself when source is a glob.",
|
|
33
|
+
),
|
|
22
34
|
exclude: z.string().optional().describe("Glob exclude filter (comma-separated for multiple)"),
|
|
23
35
|
follow_symlinks: z
|
|
24
36
|
.boolean()
|
|
@@ -30,6 +42,10 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
|
|
|
30
42
|
.optional()
|
|
31
43
|
.describe("Free-form hint passed to mcpx tool search (e.g. 'firecrawl', 'github', 'google docs', 'http')"),
|
|
32
44
|
change_note: z.string().optional().describe("Free-text note attached to the new version"),
|
|
45
|
+
force: z
|
|
46
|
+
.boolean()
|
|
47
|
+
.optional()
|
|
48
|
+
.describe("Re-ingest even when source bytes are unchanged. Default skips and reports `unchanged`."),
|
|
33
49
|
}),
|
|
34
50
|
outputSchema: z.object({
|
|
35
51
|
ingested: z.array(
|
|
@@ -37,7 +53,7 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
|
|
|
37
53
|
source_path: z.string(),
|
|
38
54
|
logical_path: z.string(),
|
|
39
55
|
version_id: z.string().nullable(),
|
|
40
|
-
status: z.enum(["ok", "failed"]),
|
|
56
|
+
status: z.enum(["ok", "unchanged", "failed"]),
|
|
41
57
|
error: z.string().optional(),
|
|
42
58
|
mime_type: z.string().nullable(),
|
|
43
59
|
size_bytes: z.number(),
|
|
@@ -47,23 +63,27 @@ PDF, DOCX, HTML, images, and other binaries are converted to markdown — native
|
|
|
47
63
|
),
|
|
48
64
|
total: z.number(),
|
|
49
65
|
ok: z.number(),
|
|
66
|
+
unchanged: z.number(),
|
|
50
67
|
failed: z.number(),
|
|
51
68
|
}),
|
|
52
69
|
cli: {
|
|
53
70
|
positional: ["source"],
|
|
54
|
-
aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m" },
|
|
71
|
+
aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m", force: "-f" },
|
|
55
72
|
},
|
|
56
73
|
console_formatter: (result) => {
|
|
57
74
|
const lines = result.ingested.map((e) => {
|
|
58
75
|
if (e.status === "ok") {
|
|
59
76
|
return `${colors.green("✓")} ${colors.cyan(e.logical_path)} ${colors.dim(`(${e.fetcher}, ${e.size_bytes}B)`)}`;
|
|
60
77
|
}
|
|
78
|
+
if (e.status === "unchanged") {
|
|
79
|
+
return `${colors.dim("≡")} ${colors.cyan(e.logical_path)} ${colors.dim("(unchanged)")}`;
|
|
80
|
+
}
|
|
61
81
|
return `${colors.red("✗")} ${e.source_path} ${colors.dim(e.error ?? "")}`;
|
|
62
82
|
});
|
|
63
|
-
const
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
return `${lines.join("\n")}\n${
|
|
83
|
+
const parts: string[] = [colors.green(`added ${result.ok}`)];
|
|
84
|
+
if (result.unchanged > 0) parts.push(colors.dim(`unchanged ${result.unchanged}`));
|
|
85
|
+
if (result.failed > 0) parts.push(colors.red(`failed ${result.failed}`));
|
|
86
|
+
return `${lines.join("\n")}\n${parts.join(", ")}`;
|
|
67
87
|
},
|
|
68
88
|
handler: async (input, ctx) => ingest(input, ctx),
|
|
69
89
|
});
|
package/src/operations/tree.ts
CHANGED
|
@@ -47,42 +47,47 @@ export const treeOperation = defineOperation({
|
|
|
47
47
|
|
|
48
48
|
/**
|
|
49
49
|
* Build a tree of TreeNode objects from a flat list of `/`-delimited paths.
|
|
50
|
-
* Splits each path into segments and groups by common prefix
|
|
51
|
-
* than `maxDepth` are folded into
|
|
50
|
+
* Splits each path into segments and groups by common prefix. Segments
|
|
51
|
+
* deeper than `maxDepth` are folded into the deepest visible ancestor —
|
|
52
|
+
* that ancestor is marked `is_file=true` so the renderer surfaces it as a
|
|
53
|
+
* leaf even though longer paths exist underneath.
|
|
52
54
|
*/
|
|
53
55
|
function buildTree(paths: string[], maxDepth: number): TreeNode[] {
|
|
54
|
-
|
|
56
|
+
interface MutableNode {
|
|
57
|
+
name: string;
|
|
58
|
+
full_path: string;
|
|
59
|
+
is_file: boolean;
|
|
60
|
+
children: Map<string, MutableNode>;
|
|
61
|
+
}
|
|
62
|
+
const root = new Map<string, MutableNode>();
|
|
55
63
|
for (const path of paths) {
|
|
56
64
|
const segs = path.split("/").filter(Boolean);
|
|
65
|
+
if (segs.length === 0) continue;
|
|
57
66
|
let level = root;
|
|
58
67
|
const trail: string[] = [];
|
|
59
|
-
|
|
68
|
+
const stop = Math.min(segs.length, maxDepth);
|
|
69
|
+
for (let i = 0; i < stop; i++) {
|
|
60
70
|
const seg = segs[i]!;
|
|
61
71
|
trail.push(seg);
|
|
62
|
-
const fullPath = trail.join("/");
|
|
63
72
|
let node = level.get(seg);
|
|
64
73
|
if (!node) {
|
|
65
|
-
node = { name: seg, full_path:
|
|
74
|
+
node = { name: seg, full_path: trail.join("/"), is_file: false, children: new Map() };
|
|
66
75
|
level.set(seg, node);
|
|
67
|
-
} else if (i === segs.length - 1) {
|
|
68
|
-
node.is_file = true;
|
|
69
|
-
}
|
|
70
|
-
if (i < segs.length - 1) {
|
|
71
|
-
if (!node.children) node.children = [];
|
|
72
|
-
const childMap = new Map(node.children.map((c) => [c.name, c] as const));
|
|
73
|
-
node.children = [...childMap.values()];
|
|
74
|
-
level = childMap;
|
|
75
|
-
if (childMap.size === 0) {
|
|
76
|
-
level = new Map();
|
|
77
|
-
node.children = [];
|
|
78
|
-
} else {
|
|
79
|
-
// rebuild level pointer
|
|
80
|
-
level = new Map(node.children.map((c) => [c.name, c] as const));
|
|
81
|
-
}
|
|
82
76
|
}
|
|
77
|
+
const isTerminal = i === segs.length - 1 || i === maxDepth - 1;
|
|
78
|
+
if (isTerminal) node.is_file = true;
|
|
79
|
+
level = node.children;
|
|
83
80
|
}
|
|
84
81
|
}
|
|
85
|
-
|
|
82
|
+
const finalize = (m: Map<string, MutableNode>): TreeNode[] => {
|
|
83
|
+
const arr = [...m.values()].sort((a, b) => a.name.localeCompare(b.name));
|
|
84
|
+
return arr.map((n) => {
|
|
85
|
+
const out: TreeNode = { name: n.name, full_path: n.full_path, is_file: n.is_file };
|
|
86
|
+
if (n.children.size > 0) out.children = finalize(n.children);
|
|
87
|
+
return out;
|
|
88
|
+
});
|
|
89
|
+
};
|
|
90
|
+
return finalize(root);
|
|
86
91
|
}
|
|
87
92
|
|
|
88
93
|
/**
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
set -euo pipefail
|
|
3
|
-
|
|
4
|
-
# Apply the @huggingface/transformers patch to node_modules so that
|
|
5
|
-
# `bun build --compile` produces a binary using the WASM backend
|
|
6
|
-
# (onnxruntime-web) instead of onnxruntime-node, whose native bindings
|
|
7
|
-
# can't be bundled into a single-binary distribution.
|
|
8
|
-
#
|
|
9
|
-
# We apply the patch imperatively (rather than via package.json
|
|
10
|
-
# `patchedDependencies`) because that field, when present in a
|
|
11
|
-
# published package, breaks `bun install` from a tarball.
|
|
12
|
-
|
|
13
|
-
PATCH="patches/@huggingface%2Ftransformers@4.2.0.patch"
|
|
14
|
-
TARGET="node_modules/@huggingface/transformers"
|
|
15
|
-
MARKER="$TARGET/.membot-transformers-patch-applied"
|
|
16
|
-
|
|
17
|
-
if [ ! -d "$TARGET" ]; then
|
|
18
|
-
echo "error: $TARGET not found — run \`bun install\` first" >&2
|
|
19
|
-
exit 1
|
|
20
|
-
fi
|
|
21
|
-
|
|
22
|
-
if [ ! -f "$PATCH" ]; then
|
|
23
|
-
echo "error: $PATCH not found" >&2
|
|
24
|
-
exit 1
|
|
25
|
-
fi
|
|
26
|
-
|
|
27
|
-
if [ -f "$MARKER" ]; then
|
|
28
|
-
echo "transformers patch already applied — skipping"
|
|
29
|
-
exit 0
|
|
30
|
-
fi
|
|
31
|
-
|
|
32
|
-
echo "Applying transformers patch ($PATCH) to $TARGET..."
|
|
33
|
-
git apply --directory="$TARGET" "$PATCH"
|
|
34
|
-
touch "$MARKER"
|
|
35
|
-
echo "Patch applied."
|