membot 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +1 -0
- package/.cursor/rules/membot.mdc +1 -0
- package/LICENSE +21 -0
- package/README.md +15 -2
- package/package.json +1 -1
- package/src/cli.ts +2 -0
- package/src/commands/config.ts +494 -0
- package/src/config/loader.ts +12 -10
- package/src/config/schemas.ts +1 -1
- package/src/ingest/embedder.ts +23 -1
- package/src/ingest/ingest.ts +103 -71
- package/src/operations/add.ts +1 -0
- package/src/operations/refresh.ts +3 -1
- package/src/output/progress.ts +20 -4
- package/src/refresh/runner.ts +67 -38
package/.claude/skills/membot.md
CHANGED
|
@@ -125,6 +125,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
125
125
|
| `membot prune --before <ts>` | Permanently drop non-current versions older than cutoff (irreversible) |
|
|
126
126
|
| `membot serve` | Start MCP server (stdio default, `--http <port>` for HTTP) |
|
|
127
127
|
| `membot reindex` | Rebuild the FTS keyword index over current chunks |
|
|
128
|
+
| `membot config <subcommand>` | Host-side config management (`get` / `set` / `unset` / `list` / `path`). **Don't run** — this is for the human operator, not for agents |
|
|
128
129
|
|
|
129
130
|
## Output formats
|
|
130
131
|
|
package/.cursor/rules/membot.mdc
CHANGED
|
@@ -125,6 +125,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
125
125
|
| `membot prune --before <ts>` | Permanently drop non-current versions older than cutoff (irreversible) |
|
|
126
126
|
| `membot serve` | Start MCP server (stdio default, `--http <port>` for HTTP) |
|
|
127
127
|
| `membot reindex` | Rebuild the FTS keyword index over current chunks |
|
|
128
|
+
| `membot config <subcommand>` | Host-side config management (`get` / `set` / `unset` / `list` / `path`). **Don't run** — this is for the human operator, not for agents |
|
|
128
129
|
|
|
129
130
|
## Output formats
|
|
130
131
|
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Evan Tahler
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
> Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.
|
|
4
4
|
|
|
5
|
+
[npm package](https://www.npmjs.com/package/membot)
|
|
5
6
|
[MIT License](./LICENSE)
|
|
6
7
|
|
|
7
8
|
`membot` is a single-binary CLI and MCP server that gives AI agents a persistent, versioned, searchable context store. Files (markdown, PDFs, DOCX, HTML, URLs, agent-authored notes) are ingested, converted to markdown, chunked, embedded **locally** with `@huggingface/transformers` (WASM, no cloud calls), and indexed in DuckDB with hybrid search (semantic vector + BM25). Every change creates a new version — nothing is overwritten in place.
|
|
@@ -63,6 +64,7 @@ The skill files describe the discover → ingest → search → read → write w
|
|
|
63
64
|
| `membot prune --before <ts>` | Permanently drop non-current versions older than cutoff (irreversible) |
|
|
64
65
|
| `membot serve` | Run the MCP server (stdio default; `--http <port>` for HTTP) |
|
|
65
66
|
| `membot reindex` | Rebuild the FTS keyword index over current chunks |
|
|
67
|
+
| `membot config <subcommand>` | Get / set values in `~/.membot/config.json` (`get`, `set`, `unset`, `list`, `path`) |
|
|
66
68
|
| `membot mcpx <subcommand>` | Forward to the bundled `mcpx` CLI for managing remote MCP servers |
|
|
67
69
|
| `membot skill install` | Install the Claude Code / Cursor agent skill |
|
|
68
70
|
|
|
@@ -100,9 +102,20 @@ Add `--watch` (and optional `--tick <sec>`) to also run the refresh daemon, whic
|
|
|
100
102
|
- `~/.membot/index.duckdb` — all content, blobs, chunks, embeddings, and metadata.
|
|
101
103
|
- `~/.membot/models/` — cached embedding model weights (`Xenova/bge-small-en-v1.5`, 384-dim).
|
|
102
104
|
- `~/.membot/logs/` — daemon logs when running `serve --watch`.
|
|
103
|
-
- **Config file:** `~/.membot/config.json` (optional; defaults are sane).
|
|
105
|
+
- **Config file:** `~/.membot/config.json` (optional; defaults are sane). Edit it directly or via `membot config`:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
membot config list # show every value (secrets masked)
|
|
109
|
+
membot config set llm.anthropic_api_key sk-ant-... # enable LLM-fallback paths
|
|
110
|
+
membot config set chunker.target_chars 800 # tweak any nested value
|
|
111
|
+
membot config get llm.anthropic_api_key --show-secrets # reveal the masked key
|
|
112
|
+
membot config unset chunker.target_chars # back to schema default
|
|
113
|
+
membot config path # print the absolute config path
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
The config file is written with mode `0600` (readable and writable only by its owner). `ANTHROPIC_API_KEY` set in the environment still wins on read, so existing env-var setups keep working.
|
|
104
117
|
- **Environment variables:**
|
|
105
|
-
- `ANTHROPIC_API_KEY` — optional. Enables LLM fallback for messy / scanned input (vision captions for images, last-resort markdown conversion). Without it, the pipeline degrades to deterministic native conversion.
|
|
118
|
+
- `ANTHROPIC_API_KEY` — optional. Enables LLM fallback for messy / scanned input (vision captions for images, last-resort markdown conversion). Without it, the pipeline degrades to deterministic native conversion. Equivalent to `membot config set llm.anthropic_api_key ...`; the env var takes precedence on read.
|
|
106
119
|
- `MEMBOT_HOME` — override the data directory.
|
|
107
120
|
- `NO_COLOR`, `CI`, `FORCE_COLOR` — standard output controls.
|
|
108
121
|
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { bold, cyan, dim, green, yellow } from "ansis";
|
|
|
4
4
|
import { program } from "commander";
|
|
5
5
|
import pkg from "../package.json" with { type: "json" };
|
|
6
6
|
import { registerCheckUpdateCommand } from "./commands/check-update.ts";
|
|
7
|
+
import { registerConfigCommand } from "./commands/config.ts";
|
|
7
8
|
import { registerMcpxCommand } from "./commands/mcpx.ts";
|
|
8
9
|
import { registerReindexCommand } from "./commands/reindex.ts";
|
|
9
10
|
import { registerServeCommand } from "./commands/serve.ts";
|
|
@@ -57,6 +58,7 @@ for (const op of OPERATIONS) {
|
|
|
57
58
|
|
|
58
59
|
registerServeCommand(program);
|
|
59
60
|
registerReindexCommand(program);
|
|
61
|
+
registerConfigCommand(program);
|
|
60
62
|
registerMcpxCommand(program);
|
|
61
63
|
registerSkillCommand(program);
|
|
62
64
|
registerCheckUpdateCommand(program);
|
|
@@ -0,0 +1,494 @@
|
|
|
1
|
+
import type { Command } from "commander";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { loadConfig, saveConfig } from "../config/loader.ts";
|
|
4
|
+
import { type MembotConfig, MembotConfigSchema } from "../config/schemas.ts";
|
|
5
|
+
import { ENV } from "../constants.ts";
|
|
6
|
+
import { HelpfulError, isHelpfulError, mapKindToExit } from "../errors.ts";
|
|
7
|
+
import { renderCliError } from "../mount/commander.ts";
|
|
8
|
+
import { colors, renderTable } from "../output/formatter.ts";
|
|
9
|
+
import { logger } from "../output/logger.ts";
|
|
10
|
+
import { detectMode, isJson, setMode } from "../output/tty.ts";
|
|
11
|
+
|
|
12
|
+
/**
 * Runtime shape of a single config leaf, as carried by `ConfigField.kind`.
 * The members track the zod leaf types used in `MembotConfigSchema`; add a
 * member here whenever the schema starts using another kind of leaf
 * (arrays, enums, ...).
 */
export type ConfigFieldKind = "string" | "number" | "boolean" | "null" | "unknown";
|
|
18
|
+
|
|
19
|
+
/**
 * Everything the CLI needs to know about one config key. Built from the zod
 * schema so the schema stays the only place a key is described.
 */
export interface ConfigField {
  /** Dot-notation address of the key, e.g. `llm.anthropic_api_key`. */
  path: string;
  /** Runtime value shape, derived from the zod schema. */
  kind: ConfigFieldKind;
  /** Whether `null` is a legal value for this key. */
  nullable: boolean;
  /**
   * Declared on the schema with `.meta({ secret: true })`; drives masking on
   * every read path.
   */
  is_secret: boolean;
}
|
|
33
|
+
|
|
34
|
+
/** Flags shared by the read-only subcommands (`get` and `list`). */
interface ConfigGetOptions {
  /** Set by `--show-secrets`: print secret values unmasked. */
  showSecrets?: boolean;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Attach the `membot config` command group to `program`, with its five
 * subcommands: `get`, `set`, `unset`, `list` and `path`. Each action is run
 * through `runSubcommand`, so global output flags and error rendering are
 * applied the same way for all of them. `get` without a key behaves like
 * `list`.
 *
 * @param program - the root commander program the group is registered on
 */
export function registerConfigCommand(program: Command): void {
  const group = program.command("config").description("Get and set membot config values in ~/.membot/config.json");

  // get [key] — one value, or everything when the key is omitted
  group
    .command("get")
    .argument("[key]", "dot-notation key (e.g. llm.anthropic_api_key); omit to print all values")
    .option("--show-secrets", "print secret values (e.g. API keys) unmasked")
    .description("Print a config value at the given dot-notation key, or all values if no key is given")
    .action((key: string | undefined, opts: ConfigGetOptions) =>
      runSubcommand(program, () => (key === undefined ? runList(opts) : runGet(key, opts))),
    );

  // set <key> <value>
  group
    .command("set")
    .argument("<key>", "dot-notation key (e.g. llm.anthropic_api_key)")
    .argument("<value>", 'JSON literal (42, true, null, "text") or raw string')
    .description("Set a config value at the given dot-notation key. Persists to ~/.membot/config.json")
    .action((key: string, value: string) => runSubcommand(program, () => runSet(key, value)));

  // unset <key>
  group
    .command("unset")
    .argument("<key>", "dot-notation key (e.g. chunker.target_chars)")
    .description("Reset a config value to its schema default")
    .action((key: string) => runSubcommand(program, () => runUnset(key)));

  // list
  group
    .command("list")
    .option("--show-secrets", "print secret values (e.g. API keys) unmasked")
    .description("Print every config value (table on a TTY, JSON otherwise). Secrets masked by default")
    .action((opts: ConfigGetOptions) => runSubcommand(program, () => runList(opts)));

  // path
  group
    .command("path")
    .description("Print the absolute path to the config file")
    .action(() => runSubcommand(program, () => runPath()));
}
|
|
103
|
+
|
|
104
|
+
/**
 * Shared wrapper for every `config` subcommand. It first derives the output
 * mode from the global flags (`--json`, `--verbose`, `--no-color`) and
 * installs it, then runs `fn`. Any error thrown by `fn` is rendered with
 * `renderCliError` and the process exits: with the code mapped from the
 * error kind for a `HelpfulError`, with 1 for anything else.
 *
 * @param program - root program whose global options are consulted
 * @param fn - the subcommand body
 */
async function runSubcommand(program: Command, fn: () => Promise<void>): Promise<void> {
  const { json, verbose, color } = program.optsWithGlobals<{
    json?: boolean;
    verbose?: boolean;
    color?: boolean;
  }>();
  // commander stores `--no-color` as `color: false`
  const mode = detectMode({ json, verbose, noColor: color === false });
  setMode(mode);

  try {
    await fn();
  } catch (err) {
    renderCliError(err);
    const exitCode = isHelpfulError(err) ? mapKindToExit(err.kind) : 1;
    process.exit(exitCode);
  }
}
|
|
129
|
+
|
|
130
|
+
/**
 * Print the config value stored at `key`.
 *
 * `key` may address a single leaf (`llm.anthropic_api_key`) or a whole
 * section (`llm`) — `resolveSchemaPath` accepts both. Unless
 * `opts.showSecrets` is set, the value is read from a copy of the config in
 * which every secret field is already masked. Masking only the requested
 * path is not enough: a section key is not itself a secret field, so its
 * nested secrets would be printed in clear text.
 *
 * Output is a JSON literal in JSON mode, otherwise a human-readable scalar.
 *
 * @param key - dot-notation config key
 * @param opts - `showSecrets` disables masking
 * @throws HelpfulError when `key` does not exist in the schema
 */
export async function runGet(key: string, opts: ConfigGetOptions): Promise<void> {
  resolveSchemaPath(MembotConfigSchema, key);
  const { config } = await loadConfig();
  // Mask the whole tree first so secrets nested under a section key stay hidden.
  const source = opts.showSecrets ? config : maskAllSecrets(config);
  const value = getValueAt(source, key);
  if (isJson()) {
    process.stdout.write(`${JSON.stringify(value)}\n`);
    return;
  }
  process.stdout.write(`${formatScalar(value)}\n`);
}
|
|
142
|
+
|
|
143
|
+
/**
 * Coerce, validate and persist `rawValue` at `key`.
 *
 * Coercion: `JSON.parse(rawValue)` is tried first (so `42` / `true` / `null`
 * work); anything that is not valid JSON is kept as the raw string. The
 * whole draft config is then re-validated against `MembotConfigSchema`
 * before it is written, so a type error is reported instead of saved.
 *
 * The confirmation that is printed is read from a fully masked copy of the
 * saved config. `key` may name a whole section (e.g. `llm`), and masking
 * only `key` itself would echo secrets nested inside the new value.
 *
 * @param key - dot-notation config key
 * @param rawValue - value exactly as typed on the command line
 * @throws HelpfulError when `key` is unknown or the resulting config is invalid
 */
export async function runSet(key: string, rawValue: string): Promise<void> {
  resolveSchemaPath(MembotConfigSchema, key);
  const coerced = coerceValue(rawValue);

  // NOTE(review): if `loadConfig` folds the ANTHROPIC_API_KEY env override
  // into the returned `config`, saving the clone below would also persist
  // that env value to disk — confirm against the loader.
  const { config, configPath } = await loadConfig();
  const draft = structuredClone(config);
  setValueAt(draft, key, coerced);

  const validated = validateOrThrow(draft, key);
  await saveConfig(configPath, validated);

  // Echo what was stored, with every secret (including nested ones) masked.
  const shown = getValueAt(maskAllSecrets(validated), key);
  if (isJson()) {
    process.stdout.write(`${JSON.stringify({ ok: true, key, value: shown })}\n`);
  } else {
    logger.info(`set ${key} = ${formatScalar(shown)}`);
  }

  // If a user just persisted the API key while ANTHROPIC_API_KEY is also set
  // in the environment, the env wins on read — surface that so they don't
  // wonder why their new value isn't taking effect.
  if (key === "llm.anthropic_api_key" && process.env[ENV.ANTHROPIC_API_KEY]?.trim()) {
    logger.warn(
      `note: ANTHROPIC_API_KEY is set in your environment and overrides the file at read time. Unset it (\`unset ANTHROPIC_API_KEY\`) to use the value you just saved.`,
    );
  }
}
|
|
178
|
+
|
|
179
|
+
/**
 * Put `key` back to its schema default. The default is whatever
 * `MembotConfigSchema.parse({})` yields at that path; it is written into a
 * copy of the loaded config, re-validated, saved, and then echoed (masked
 * when the key is a secret).
 *
 * @param key - dot-notation config key
 * @throws HelpfulError when `key` is unknown or the resulting config is invalid
 */
export async function runUnset(key: string): Promise<void> {
  resolveSchemaPath(MembotConfigSchema, key);
  const schemaDefaults = MembotConfigSchema.parse({});
  const defaultValue = getValueAt(schemaDefaults, key);

  const loaded = await loadConfig();
  const working = structuredClone(loaded.config);
  setValueAt(working, key, defaultValue);

  await saveConfig(loaded.configPath, validateOrThrow(working, key));

  const shown = maskIfSecret(key, defaultValue);
  if (!isJson()) {
    logger.info(`unset ${key} → ${formatScalar(shown)}`);
    return;
  }
  process.stdout.write(`${JSON.stringify({ ok: true, key, value: shown })}\n`);
}
|
|
198
|
+
|
|
199
|
+
/**
 * Print the whole config. In JSON mode the nested config object is emitted
 * (pretty-printed); otherwise a two-column key/value table with one row per
 * schema path. Secrets are masked unless `opts.showSecrets` is set.
 */
async function runList(opts: ConfigGetOptions): Promise<void> {
  const { config } = await loadConfig();
  const reveal = opts.showSecrets;

  if (!isJson()) {
    const rows = enumerateSchemaPaths(MembotConfigSchema).map((path) => {
      const stored = getValueAt(config, path);
      const shown = reveal ? stored : maskIfSecret(path, stored);
      return [colors.cyan(path), formatScalar(shown)];
    });
    process.stdout.write(`${renderTable(["key", "value"], rows)}\n`);
    return;
  }

  const payload = reveal ? config : maskAllSecrets(config);
  process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`);
}
|
|
215
|
+
|
|
216
|
+
/**
 * Print the config file path reported by `loadConfig`: as `{"path": ...}`
 * in JSON mode, as the bare path otherwise.
 */
async function runPath(): Promise<void> {
  const { configPath } = await loadConfig();
  const line = isJson() ? JSON.stringify({ path: configPath }) : configPath;
  process.stdout.write(`${line}\n`);
}
|
|
225
|
+
|
|
226
|
+
/**
 * Walk a dotted path through `schema` and return the zod type found there.
 * The result is a leaf type for a full path, or a `ZodObject` when the path
 * names a section.
 *
 * Empty segments are ignored (`a..b` resolves like `a.b`). At each step the
 * current type is unwrapped (`ZodDefault` / `ZodOptional` / `ZodNullable`)
 * and must be a `ZodObject` whose shape has the segment as an *own*
 * property. The own-property check matters: a plain `shape[segment]` lookup
 * also finds inherited members such as `constructor` or `__proto__`, which
 * would then be accepted as valid config keys.
 *
 * @param schema - root schema to walk
 * @param dottedPath - dot-notation key, e.g. `llm.anthropic_api_key`
 * @returns the unwrapped zod type at the path
 * @throws HelpfulError (`input_error`) when the path is empty or a segment
 *   does not exist; the hint may carry a "did you mean" suggestion
 */
export function resolveSchemaPath(schema: z.ZodTypeAny, dottedPath: string): z.ZodTypeAny {
  const segments = dottedPath.split(".").filter((s) => s.length > 0);
  if (segments.length === 0) {
    throw new HelpfulError({
      kind: "input_error",
      message: "config key is required",
      hint: "Pass a dot-notation key, e.g. `membot config get llm.anthropic_api_key`. Run `membot config list` for the full set.",
    });
  }

  let current = unwrapSchema(schema);
  const traversed: string[] = [];
  for (const segment of segments) {
    if (!(current instanceof z.ZodObject)) {
      // Tried to descend below a leaf.
      throw unknownKeyError(dottedPath, traversed.join("."));
    }
    const shape = current.shape as Record<string, z.ZodTypeAny>;
    // Own properties only, so prototype members never count as config keys.
    const next = Object.prototype.hasOwnProperty.call(shape, segment) ? shape[segment] : undefined;
    if (!next) {
      throw unknownKeyError(dottedPath, [...traversed, segment].join("."));
    }
    traversed.push(segment);
    current = unwrapSchema(next);
  }
  return current;
}
|
|
259
|
+
|
|
260
|
+
/**
 * Create the `HelpfulError` reported for a key that is not in the schema.
 * The hint always points at `membot config list`; when `nearestPath` finds
 * a close match among the valid paths, a "Did you mean" sentence is put in
 * front of it.
 *
 * @param badPath - the key exactly as the user typed it
 * @param _matchedPrefix - the part of the path that did resolve (currently unused)
 */
function unknownKeyError(badPath: string, _matchedPrefix: string): HelpfulError {
  const listHint = "Run `membot config list` to see all valid keys.";
  const closest = nearestPath(badPath, enumerateSchemaPaths(MembotConfigSchema));
  return new HelpfulError({
    kind: "input_error",
    message: `unknown config key: ${badPath}`,
    hint: closest ? `Did you mean \`${closest}\`? ${listHint}` : listHint,
  });
}
|
|
275
|
+
|
|
276
|
+
/**
 * Pick the candidate with the smallest Levenshtein distance to `target`,
 * considering only distances of at most 2. On a tie the earliest candidate
 * is kept. Returns `null` when nothing is close enough.
 */
function nearestPath(target: string, candidates: readonly string[]): string | null {
  let winner: string | null = null;
  // Start one past the accepted maximum, so only distances <= 2 can win.
  let winnerDistance = 3;
  for (const candidate of candidates) {
    const distance = levenshtein(target, candidate);
    if (distance < winnerDistance) {
      winner = candidate;
      winnerDistance = distance;
    }
  }
  return winner;
}
|
|
285
|
+
|
|
286
|
+
/**
 * Levenshtein edit distance between `a` and `b`, counted in UTF-16 code
 * units. It keeps only two rows of the dynamic-programming table and swaps
 * them after every row.
 */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const width = b.length + 1;
  // `above` is the finished previous row; `row` is the one being filled.
  let above: number[] = Array.from({ length: width }, (_, col) => col);
  let row: number[] = new Array<number>(width).fill(0);

  for (let r = 1; r <= a.length; r++) {
    row[0] = r;
    for (let c = 1; c < width; c++) {
      const insertion = (row[c - 1] ?? 0) + 1;
      const deletion = (above[c] ?? 0) + 1;
      const substitution = (above[c - 1] ?? 0) + (a[r - 1] === b[c - 1] ? 0 : 1);
      row[c] = Math.min(insertion, deletion, substitution);
    }
    [above, row] = [row, above];
  }
  return above[b.length] ?? 0;
}
|
|
303
|
+
|
|
304
|
+
/**
 * Peel `ZodDefault` / `ZodOptional` / `ZodNullable` wrappers off `t` until
 * a type that is none of those remains, and return it.
 *
 * Zod 4 types `.unwrap()` as the lower-level `$ZodType` rather than
 * `ZodType`, hence the cast through `unknown`; the runtime instance is a
 * real `ZodType`.
 */
function unwrapSchema(t: z.ZodTypeAny): z.ZodTypeAny {
  if (t instanceof z.ZodDefault || t instanceof z.ZodOptional || t instanceof z.ZodNullable) {
    return unwrapSchema(t.unwrap() as unknown as z.ZodTypeAny);
  }
  return t;
}
|
|
316
|
+
|
|
317
|
+
/**
 * Walk every wrapper layer of a zod leaf (default / optional / nullable)
 * and report three things:
 *
 * - `leaf` — the innermost, fully unwrapped type
 * - `nullable` — true when any layer is a `ZodNullable`
 * - `meta` — the `.meta()` of every layer merged into one object, with
 *   outer layers winning on conflict
 *
 * Zod 4 binds `.meta()` to the specific layer it was declared on —
 * `.meta({secret:true}).default("")` and `.default("").meta({secret:true})`
 * put it on different wrappers — so every layer has to be inspected.
 *
 * @param t - a schema node, possibly wrapped
 */
function walkLeaf(t: z.ZodTypeAny): {
  leaf: z.ZodTypeAny;
  nullable: boolean;
  meta: Record<string, unknown>;
} {
  // `layers` is collected outermost-first.
  const layers: z.ZodTypeAny[] = [t];
  let cur = t;
  let nullable = false;
  while (cur instanceof z.ZodDefault || cur instanceof z.ZodOptional || cur instanceof z.ZodNullable) {
    if (cur instanceof z.ZodNullable) nullable = true;
    cur = cur.unwrap() as unknown as z.ZodTypeAny;
    layers.push(cur);
  }

  // Merge innermost-first, so later spreads (the outer layers) overwrite
  // earlier ones. Merging in collection order would let inner layers win.
  let meta: Record<string, unknown> = {};
  for (const layer of [...layers].reverse()) {
    const layerMeta = (layer as { meta?: () => Record<string, unknown> | undefined }).meta?.();
    if (layerMeta) meta = { ...meta, ...layerMeta };
  }
  return { leaf: cur, nullable, meta };
}
|
|
347
|
+
|
|
348
|
+
/**
 * Classify an unwrapped zod leaf as a `ConfigFieldKind`. String, number,
 * boolean and null types map to their own kind; every other type is
 * `"unknown"`.
 */
function inferKind(leaf: z.ZodTypeAny): ConfigFieldKind {
  switch (true) {
    case leaf instanceof z.ZodString:
      return "string";
    case leaf instanceof z.ZodNumber:
      return "number";
    case leaf instanceof z.ZodBoolean:
      return "boolean";
    case leaf instanceof z.ZodNull:
      return "null";
    default:
      return "unknown";
  }
}
|
|
356
|
+
|
|
357
|
+
/**
 * List every leaf of a zod schema as a `ConfigField`, depth-first and in
 * shape order. Nested objects contribute their leaves under a dotted
 * prefix; they do not get an entry of their own.
 *
 * Called on something that is not an object, the result is a single field
 * named `prefix`, or an empty list when there is no prefix to name it with.
 *
 * @param schema - schema (or sub-schema) to enumerate
 * @param prefix - dotted path of `schema` within the root; empty for the root
 */
export function enumerateSchemaFields(schema: z.ZodTypeAny, prefix = ""): ConfigField[] {
  // Describe one non-object node; wrappers are inspected for nullability and meta.
  const describe = (node: z.ZodTypeAny, path: string): ConfigField => {
    const { leaf, nullable, meta } = walkLeaf(node);
    return { path, kind: inferKind(leaf), nullable, is_secret: meta.secret === true };
  };

  const root = unwrapSchema(schema);
  if (!(root instanceof z.ZodObject)) {
    return prefix ? [describe(schema, prefix)] : [];
  }

  const shape = root.shape as Record<string, z.ZodTypeAny>;
  return Object.entries(shape).flatMap(([key, child]) => {
    const path = prefix ? `${prefix}.${key}` : key;
    const inner = unwrapSchema(child);
    return inner instanceof z.ZodObject ? enumerateSchemaFields(inner, path) : [describe(child, path)];
  });
}
|
|
386
|
+
|
|
387
|
+
/**
 * Same walk as `enumerateSchemaFields`, reduced to the dotted path of each
 * leaf. Kept for callers that only need the paths.
 */
export function enumerateSchemaPaths(schema: z.ZodTypeAny, prefix = ""): string[] {
  const paths: string[] = [];
  for (const field of enumerateSchemaFields(schema, prefix)) {
    paths.push(field.path);
  }
  return paths;
}
|
|
391
|
+
|
|
392
|
+
/**
 * Path → `ConfigField` lookup, computed once when the module loads by
 * enumerating `MembotConfigSchema`.
 */
const FIELD_INDEX: ReadonlyMap<string, ConfigField> = (() => {
  const index = new Map<string, ConfigField>();
  for (const field of enumerateSchemaFields(MembotConfigSchema)) {
    index.set(field.path, field);
  }
  return index;
})();
|
|
399
|
+
|
|
400
|
+
/**
 * Fetch the `ConfigField` registered for `path` in the field index.
 *
 * @returns the field, or `undefined` when `path` is not a known leaf
 */
export function getField(path: string): ConfigField | undefined {
  const field = FIELD_INDEX.get(path);
  return field;
}
|
|
404
|
+
|
|
405
|
+
/**
 * Read the value at a dotted path from a plain object.
 *
 * Empty segments are skipped, matching how `resolveSchemaPath` parses keys,
 * so a key that validates (`a..b`, `a.b.`) reads the same value as `a.b`.
 * Only own properties are followed; inherited members such as `constructor`
 * or `toString` are treated as missing.
 *
 * @param obj - object to read from
 * @param dottedPath - dot-notation path
 * @returns the value, or `undefined` when the path is empty, a segment is
 *   missing, or the walk hits a non-object before the last segment
 */
function getValueAt(obj: unknown, dottedPath: string): unknown {
  const segments = dottedPath.split(".").filter((s) => s.length > 0);
  if (segments.length === 0) return undefined;

  let cur: unknown = obj;
  for (const segment of segments) {
    if (cur === null || typeof cur !== "object") return undefined;
    // Never resolve through the prototype chain.
    if (!Object.prototype.hasOwnProperty.call(cur, segment)) return undefined;
    cur = (cur as Record<string, unknown>)[segment];
  }
  return cur;
}
|
|
414
|
+
|
|
415
|
+
/**
 * Set the value at a dotted path on a plain object, creating intermediate
 * objects as needed. Mutates `obj` in place.
 *
 * Empty segments are skipped, matching how `resolveSchemaPath` parses keys,
 * so a key that validates (`a..b`) writes to the same place as `a.b`.
 * Without this, the write lands under an empty-string key that schema
 * validation may later discard (NOTE(review): assumes the schema strips
 * unknown keys — confirm).
 *
 * Only own properties are traversed, and every write goes through
 * `Object.defineProperty`. A segment such as `__proto__` therefore becomes
 * an ordinary own property: it neither walks into `Object.prototype` nor
 * replaces the object's prototype.
 *
 * @param obj - object to mutate
 * @param dottedPath - dot-notation path; must contain at least one segment
 * @param value - value to store at the final segment
 * @throws Error when `dottedPath` has no non-empty segment
 */
function setValueAt(obj: Record<string, unknown>, dottedPath: string, value: unknown): void {
  const segments = dottedPath.split(".").filter((s) => s.length > 0);
  const leafKey = segments.pop();
  if (leafKey === undefined) {
    throw new Error("cannot set a config value at an empty path");
  }

  // Plain data-property write that bypasses the `__proto__` setter.
  const assignOwn = (target: Record<string, unknown>, key: string, v: unknown): void => {
    Object.defineProperty(target, key, { value: v, writable: true, enumerable: true, configurable: true });
  };

  let cur: Record<string, unknown> = obj;
  for (const seg of segments) {
    const existing = Object.prototype.hasOwnProperty.call(cur, seg) ? cur[seg] : undefined;
    if (existing === null || existing === undefined || typeof existing !== "object") {
      // Missing or scalar: replace it with a fresh container.
      const created: Record<string, unknown> = {};
      assignOwn(cur, seg, created);
      cur = created;
    } else {
      cur = existing as Record<string, unknown>;
    }
  }
  assignOwn(cur, leafKey, value);
}
|
|
432
|
+
|
|
433
|
+
/**
 * Turn a command-line string into a config value. Input that parses as JSON
 * is returned as the parsed value (`42`, `true`, `null`, `"foo"`, objects,
 * arrays); anything else is returned untouched as a string.
 */
function coerceValue(raw: string): unknown {
  let result: unknown = raw;
  try {
    result = JSON.parse(raw);
  } catch {
    // Not JSON — keep the raw string.
  }
  return result;
}
|
|
444
|
+
|
|
445
|
+
/**
 * Validate a whole draft config against `MembotConfigSchema` and return the
 * parsed result. On failure a `HelpfulError` is thrown whose message names
 * one offending path and carries its zod message. The issue whose path
 * equals `key` is preferred; otherwise the first issue is used. The full
 * issue list travels along in `details`.
 *
 * @param draft - candidate config object
 * @param key - the key the caller was changing, used to pick the issue
 * @throws HelpfulError (`input_error`) when the draft does not validate
 */
function validateOrThrow(draft: unknown, key: string): MembotConfig {
  const parsed = MembotConfigSchema.safeParse(draft);
  if (parsed.success) {
    return parsed.data;
  }

  const failure = parsed.error;
  const relevant = failure.issues.find((candidate) => candidate.path.join(".") === key) ?? failure.issues[0];
  const where = relevant?.path.join(".") ?? key;
  const why = relevant?.message ?? failure.message;

  throw new HelpfulError({
    kind: "input_error",
    message: `invalid value for ${where}: ${why}`,
    hint: `Run \`membot config get ${where}\` to see the current value, or \`membot config unset ${where}\` to reset to default.`,
    details: failure.issues,
    cause: failure,
  });
}
|
|
464
|
+
|
|
465
|
+
/**
 * Hide a secret for display. The value is returned unchanged unless `path`
 * is a known field flagged `is_secret` and the value is a non-empty string.
 * A secret of up to 11 characters becomes `****`; a longer one keeps its
 * first 7 and last 4 characters around `...`.
 *
 * NOTE(review): a secret of exactly 12 characters therefore shows 11 of
 * them — confirm that is acceptable for the key formats in use.
 */
export function maskIfSecret(path: string, value: unknown): unknown {
  const secret = getField(path)?.is_secret === true;
  if (!secret || typeof value !== "string" || value === "") {
    return value;
  }
  return value.length > 11 ? `${value.slice(0, 7)}...${value.slice(-4)}` : "****";
}
|
|
475
|
+
|
|
476
|
+
/**
 * Return a deep copy of `config` in which every field flagged `is_secret`
 * has been passed through `maskIfSecret`. The input object is not modified.
 */
function maskAllSecrets(config: MembotConfig): MembotConfig {
  const copy = structuredClone(config) as Record<string, unknown>;
  const secretPaths = [...FIELD_INDEX.values()].filter((field) => field.is_secret).map((field) => field.path);
  for (const path of secretPaths) {
    setValueAt(copy, path, maskIfSecret(path, getValueAt(copy, path)));
  }
  return copy as MembotConfig;
}
|
|
486
|
+
|
|
487
|
+
/**
 * Produce the human-readable text for one config value: dimmed `null` and
 * `(unset)` placeholders, strings as-is, numbers and booleans via `String`,
 * and anything else as JSON.
 */
function formatScalar(value: unknown): string {
  if (value === null) return colors.dim("null");
  switch (typeof value) {
    case "undefined":
      return colors.dim("(unset)");
    case "string":
      return value;
    case "number":
    case "boolean":
      return String(value);
    default:
      return JSON.stringify(value);
  }
}
|
package/src/config/loader.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { mkdir } from "node:fs/promises";
|
|
1
|
+
import { chmod, mkdir } from "node:fs/promises";
|
|
2
2
|
import { resolve } from "node:path";
|
|
3
3
|
import { defaultMembotHome, ENV, FILES } from "../constants.ts";
|
|
4
4
|
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
@@ -74,17 +74,19 @@ function resolveDataDir(flag?: string): string {
|
|
|
74
74
|
}
|
|
75
75
|
|
|
76
76
|
/**
|
|
77
|
-
* Persist config to disk
|
|
78
|
-
*
|
|
79
|
-
*
|
|
80
|
-
*
|
|
77
|
+
* Persist config to disk and chmod 0600 so the file is readable and writable only by its owner —
|
|
78
|
+
* `llm.anthropic_api_key` may be present, and we don't want it world-readable.
|
|
79
|
+
* `loadConfig` still lets `ANTHROPIC_API_KEY` (env) override the file at read
|
|
80
|
+
* time, so an env-var-only setup keeps working unchanged.
|
|
81
81
|
*/
|
|
82
82
|
export async function saveConfig(configPath: string, config: MembotConfig): Promise<void> {
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
}
|
|
87
|
-
|
|
83
|
+
await Bun.write(configPath, `${JSON.stringify(config, null, 2)}\n`);
|
|
84
|
+
try {
|
|
85
|
+
await chmod(configPath, 0o600);
|
|
86
|
+
} catch {
|
|
87
|
+
// chmod is best-effort: filesystems without unix permissions (e.g. some
|
|
88
|
+
// Windows scenarios) silently fail, and that's acceptable.
|
|
89
|
+
}
|
|
88
90
|
}
|
|
89
91
|
|
|
90
92
|
/**
|
package/src/config/schemas.ts
CHANGED
|
@@ -8,7 +8,7 @@ export const ChunkerConfigSchema = z.object({
|
|
|
8
8
|
});
|
|
9
9
|
|
|
10
10
|
export const LlmConfigSchema = z.object({
|
|
11
|
-
anthropic_api_key: z.string().default(""),
|
|
11
|
+
anthropic_api_key: z.string().meta({ secret: true }).default(""),
|
|
12
12
|
converter_model: z.string().default(DEFAULTS.CONVERTER_MODEL),
|
|
13
13
|
chunker_model: z.string().default(DEFAULTS.CHUNKER_MODEL),
|
|
14
14
|
describer_model: z.string().default(DEFAULTS.DESCRIBER_MODEL),
|
package/src/ingest/embedder.ts
CHANGED
|
@@ -63,6 +63,16 @@ async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
|
|
|
63
63
|
return p;
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
+
/**
|
|
67
|
+
* Options for `embed()`. `onProgress` fires once after each batch finishes
|
|
68
|
+
* with `(done, total)` chunk counts so callers can drive a spinner / progress
|
|
69
|
+
* bar — ONNX WASM holds the JS thread for hundreds of ms per batch and would
|
|
70
|
+
* otherwise leave nanospinner's setInterval starved between updates.
|
|
71
|
+
*/
|
|
72
|
+
export interface EmbedOptions {
|
|
73
|
+
onProgress?: (done: number, total: number) => void;
|
|
74
|
+
}
|
|
75
|
+
|
|
66
76
|
/**
|
|
67
77
|
* Embed an array of texts to L2-normalized vectors with the configured
|
|
68
78
|
* model. Throws a HelpfulError when the model's dimension doesn't match
|
|
@@ -71,8 +81,16 @@ async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
|
|
|
71
81
|
* Inputs are sliced into windows of EMBEDDING_BATCH_SIZE so a single
|
|
72
82
|
* forward pass never has to allocate activations for arbitrarily many
|
|
73
83
|
* chunks — large files (hundreds of chunks) otherwise OOM the WASM heap.
|
|
84
|
+
*
|
|
85
|
+
* Between batches we yield a macrotask (`setTimeout(0)`) so the event loop
|
|
86
|
+
* can flush nanospinner renders and stderr writes — without that, the spinner
|
|
87
|
+
* visibly freezes for the entire embed phase on large files.
|
|
74
88
|
*/
|
|
75
|
-
export async function embed(
|
|
89
|
+
export async function embed(
|
|
90
|
+
texts: string[],
|
|
91
|
+
model: string = EMBEDDING_MODEL,
|
|
92
|
+
opts: EmbedOptions = {},
|
|
93
|
+
): Promise<number[][]> {
|
|
76
94
|
if (texts.length === 0) return [];
|
|
77
95
|
const extractor = await getPipeline(model);
|
|
78
96
|
const out: number[][] = [];
|
|
@@ -88,6 +106,10 @@ export async function embed(texts: string[], model: string = EMBEDDING_MODEL): P
|
|
|
88
106
|
});
|
|
89
107
|
}
|
|
90
108
|
for (const vec of data) out.push(vec);
|
|
109
|
+
opts.onProgress?.(out.length, texts.length);
|
|
110
|
+
// Yield a macrotask so nanospinner's setInterval and any queued
|
|
111
|
+
// stderr writes get a chance to run between batches.
|
|
112
|
+
await new Promise<void>((resolve) => setTimeout(resolve, 0));
|
|
91
113
|
}
|
|
92
114
|
return out;
|
|
93
115
|
}
|
package/src/ingest/ingest.ts
CHANGED
|
@@ -54,6 +54,13 @@ export interface IngestResult {
|
|
|
54
54
|
export interface IngestCallbacks {
|
|
55
55
|
onEntryStart?: (label: string) => void;
|
|
56
56
|
onEntryComplete?: (entry: IngestEntryResult) => void;
|
|
57
|
+
/**
|
|
58
|
+
* Fires for sub-step progress within a single entry (e.g. "embedding
|
|
59
|
+
* 32/168"). The callback runs many times per entry and is intended for
|
|
60
|
+
* driving an interactive spinner — non-interactive callers should ignore
|
|
61
|
+
* it to avoid log spam.
|
|
62
|
+
*/
|
|
63
|
+
onEntryProgress?: (label: string, sublabel: string) => void;
|
|
57
64
|
}
|
|
58
65
|
|
|
59
66
|
/**
|
|
@@ -140,23 +147,27 @@ async function ingestInline(
|
|
|
140
147
|
source_sha256: sha,
|
|
141
148
|
};
|
|
142
149
|
try {
|
|
143
|
-
const versionId = await persistVersion(
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
150
|
+
const versionId = await persistVersion(
|
|
151
|
+
ctx,
|
|
152
|
+
{
|
|
153
|
+
logicalPath,
|
|
154
|
+
sourceType: "inline",
|
|
155
|
+
sourcePath: null,
|
|
156
|
+
sourceMtimeMs: null,
|
|
157
|
+
sourceSha: sha,
|
|
158
|
+
blobSha: null,
|
|
159
|
+
mime: "text/markdown",
|
|
160
|
+
bytes: null,
|
|
161
|
+
markdown: text,
|
|
162
|
+
fetcher: "inline",
|
|
163
|
+
fetcherServer: null,
|
|
164
|
+
fetcherTool: null,
|
|
165
|
+
fetcherArgs: null,
|
|
166
|
+
refreshSec,
|
|
167
|
+
changeNote: input.change_note ?? null,
|
|
168
|
+
},
|
|
169
|
+
(done, total) => callbacks?.onEntryProgress?.(logicalPath, `embedding ${done}/${total}`),
|
|
170
|
+
);
|
|
160
171
|
result.version_id = versionId;
|
|
161
172
|
} catch (err) {
|
|
162
173
|
result.status = "failed";
|
|
@@ -217,22 +228,26 @@ async function ingestUrl(
|
|
|
217
228
|
}
|
|
218
229
|
}
|
|
219
230
|
|
|
220
|
-
const versionId = await pipelineForBytes(
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
231
|
+
const versionId = await pipelineForBytes(
|
|
232
|
+
ctx,
|
|
233
|
+
{
|
|
234
|
+
logicalPath,
|
|
235
|
+
bytes: fetched.bytes,
|
|
236
|
+
mime: fetched.mimeType,
|
|
237
|
+
source: url,
|
|
238
|
+
sourceType: "remote",
|
|
239
|
+
sourcePath: url,
|
|
240
|
+
sourceMtimeMs: null,
|
|
241
|
+
sourceSha: fetched.sha256,
|
|
242
|
+
fetcher: fetched.fetcher,
|
|
243
|
+
fetcherServer: fetched.fetcherServer,
|
|
244
|
+
fetcherTool: fetched.fetcherTool,
|
|
245
|
+
fetcherArgs: fetched.fetcherArgs,
|
|
246
|
+
refreshSec,
|
|
247
|
+
changeNote: input.change_note ?? null,
|
|
248
|
+
},
|
|
249
|
+
(done, total) => callbacks?.onEntryProgress?.(url, `embedding ${done}/${total}`),
|
|
250
|
+
);
|
|
236
251
|
result.version_id = versionId;
|
|
237
252
|
} catch (err) {
|
|
238
253
|
result.status = "failed";
|
|
@@ -299,22 +314,26 @@ async function ingestLocalFiles(
|
|
|
299
314
|
}
|
|
300
315
|
}
|
|
301
316
|
|
|
302
|
-
const versionId = await pipelineForBytes(
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
317
|
+
const versionId = await pipelineForBytes(
|
|
318
|
+
ctx,
|
|
319
|
+
{
|
|
320
|
+
logicalPath,
|
|
321
|
+
bytes: local.bytes,
|
|
322
|
+
mime: local.mimeType,
|
|
323
|
+
source: entry.absPath,
|
|
324
|
+
sourceType: "local",
|
|
325
|
+
sourcePath: entry.absPath,
|
|
326
|
+
sourceMtimeMs: local.mtimeMs,
|
|
327
|
+
sourceSha: local.sha256,
|
|
328
|
+
fetcher: "local",
|
|
329
|
+
fetcherServer: null,
|
|
330
|
+
fetcherTool: null,
|
|
331
|
+
fetcherArgs: null,
|
|
332
|
+
refreshSec,
|
|
333
|
+
changeNote: input.change_note ?? null,
|
|
334
|
+
},
|
|
335
|
+
(done, total) => callbacks?.onEntryProgress?.(entry.relPathFromBase, `embedding ${done}/${total}`),
|
|
336
|
+
);
|
|
318
337
|
result.version_id = versionId;
|
|
319
338
|
} catch (err) {
|
|
320
339
|
result.status = "failed";
|
|
@@ -353,9 +372,14 @@ interface PipelineParams {
|
|
|
353
372
|
* Run the bytes-in / version-out pipeline: store the blob, convert to
|
|
354
373
|
* markdown, describe, chunk, embed, and write a new files row + chunks
|
|
355
374
|
* rows under a fresh version_id. Returns the version_id so callers can
|
|
356
|
-
* report it back.
|
|
375
|
+
* report it back. The optional `onEmbedProgress` is forwarded to the
|
|
376
|
+
* embedder so callers can drive a spinner during the slow phase.
|
|
357
377
|
*/
|
|
358
|
-
async function pipelineForBytes(
|
|
378
|
+
async function pipelineForBytes(
|
|
379
|
+
ctx: AppContext,
|
|
380
|
+
p: PipelineParams,
|
|
381
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
382
|
+
): Promise<string> {
|
|
359
383
|
await upsertBlob(ctx.db, {
|
|
360
384
|
sha256: p.sourceSha,
|
|
361
385
|
mime_type: p.mime,
|
|
@@ -367,24 +391,28 @@ async function pipelineForBytes(ctx: AppContext, p: PipelineParams): Promise<str
|
|
|
367
391
|
const markdown = conversion.markdown;
|
|
368
392
|
const contentSha = sha256Hex(new TextEncoder().encode(markdown));
|
|
369
393
|
|
|
370
|
-
return persistVersion(
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
394
|
+
return persistVersion(
|
|
395
|
+
ctx,
|
|
396
|
+
{
|
|
397
|
+
logicalPath: p.logicalPath,
|
|
398
|
+
sourceType: p.sourceType,
|
|
399
|
+
sourcePath: p.sourcePath,
|
|
400
|
+
sourceMtimeMs: p.sourceMtimeMs,
|
|
401
|
+
sourceSha: p.sourceSha,
|
|
402
|
+
blobSha: p.sourceSha,
|
|
403
|
+
mime: p.mime,
|
|
404
|
+
bytes: p.bytes,
|
|
405
|
+
markdown,
|
|
406
|
+
contentSha,
|
|
407
|
+
fetcher: p.fetcher,
|
|
408
|
+
fetcherServer: p.fetcherServer,
|
|
409
|
+
fetcherTool: p.fetcherTool,
|
|
410
|
+
fetcherArgs: p.fetcherArgs,
|
|
411
|
+
refreshSec: p.refreshSec,
|
|
412
|
+
changeNote: p.changeNote,
|
|
413
|
+
},
|
|
414
|
+
onEmbedProgress,
|
|
415
|
+
);
|
|
388
416
|
}
|
|
389
417
|
|
|
390
418
|
interface PersistParams {
|
|
@@ -412,13 +440,17 @@ interface PersistParams {
|
|
|
412
440
|
* embedded text per chunk is `<path>\n<description>\n\n<body>`, stored
|
|
413
441
|
* verbatim as `chunks.search_text` and later FTS-indexed.
|
|
414
442
|
*/
|
|
415
|
-
async function persistVersion(
|
|
443
|
+
async function persistVersion(
|
|
444
|
+
ctx: AppContext,
|
|
445
|
+
p: PersistParams,
|
|
446
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
447
|
+
): Promise<string> {
|
|
416
448
|
const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
|
|
417
449
|
const chunks = chunkDeterministic(p.markdown, ctx.config.chunker);
|
|
418
450
|
const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
|
|
419
451
|
let embeddings: number[][];
|
|
420
452
|
try {
|
|
421
|
-
embeddings = await embed(searchTexts, ctx.config.embedding_model);
|
|
453
|
+
embeddings = await embed(searchTexts, ctx.config.embedding_model, { onProgress: onEmbedProgress });
|
|
422
454
|
} catch (err) {
|
|
423
455
|
throw asHelpful(
|
|
424
456
|
err,
|
package/src/operations/add.ts
CHANGED
|
@@ -138,6 +138,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
138
138
|
const callbacks: IngestCallbacks = {
|
|
139
139
|
onEntryStart: (label) => ctx.progress.tick(label),
|
|
140
140
|
onEntryComplete: (entry) => ctx.progress.entry(formatEntryLine(entry)),
|
|
141
|
+
onEntryProgress: (_label, sublabel) => ctx.progress.update(sublabel),
|
|
141
142
|
};
|
|
142
143
|
|
|
143
144
|
for (const outcome of outcomes) {
|
|
@@ -60,7 +60,9 @@ export const refreshOperation = defineOperation({
|
|
|
60
60
|
for (const path of targets) {
|
|
61
61
|
ctx.progress.tick(path);
|
|
62
62
|
try {
|
|
63
|
-
const r = await refreshOne(ctx, path, input.force)
|
|
63
|
+
const r = await refreshOne(ctx, path, input.force, (done, total) =>
|
|
64
|
+
ctx.progress.update(`embedding ${done}/${total}`),
|
|
65
|
+
);
|
|
64
66
|
out.push(r);
|
|
65
67
|
} catch (err) {
|
|
66
68
|
out.push({ logical_path: path, status: "failed", error: err instanceof Error ? err.message : String(err) });
|
package/src/output/progress.ts
CHANGED
|
@@ -15,6 +15,13 @@ import { isSilent, useSpinner } from "./tty.ts";
|
|
|
15
15
|
export interface Progress {
|
|
16
16
|
start(total: number, label?: string): void;
|
|
17
17
|
tick(label: string): void;
|
|
18
|
+
/**
|
|
19
|
+
* Re-render the active spinner with the most recent `tick` label plus an
|
|
20
|
+
* extra suffix (e.g. "embedding 32/168") without advancing the counter.
|
|
21
|
+
* No-op in non-interactive / silent / JSON modes — sub-step progress is
|
|
22
|
+
* deliberately TTY-only so CI logs don't get one line per inner batch.
|
|
23
|
+
*/
|
|
24
|
+
update(suffix: string): void;
|
|
18
25
|
entry(line: string): void;
|
|
19
26
|
done(summary?: string): void;
|
|
20
27
|
fail(summary?: string): void;
|
|
@@ -51,25 +58,28 @@ function truncateLabel(label: string, max = LABEL_MAX): string {
|
|
|
51
58
|
export function createProgress(): Progress {
|
|
52
59
|
let total = 0;
|
|
53
60
|
let count = 0;
|
|
61
|
+
let lastLabel = "";
|
|
54
62
|
let spinner: ReturnType<typeof logger.startSpinner> | null = null;
|
|
55
63
|
|
|
56
64
|
const interactive = useSpinner();
|
|
57
65
|
const silent = isSilent();
|
|
58
66
|
|
|
59
|
-
const renderSpinnerText = (label: string): string => {
|
|
67
|
+
const renderSpinnerText = (label: string, suffix?: string): string => {
|
|
60
68
|
const bar = renderBar(count, total);
|
|
61
69
|
const pct = total > 0 ? Math.floor((count / total) * 100) : 0;
|
|
62
|
-
const
|
|
63
|
-
|
|
70
|
+
const labelTail = label ? ` — ${truncateLabel(label)}` : "";
|
|
71
|
+
const suffixTail = suffix ? ` — ${suffix}` : "";
|
|
72
|
+
return `${bar} ${count}/${total} (${pct}%)${labelTail}${suffixTail}`;
|
|
64
73
|
};
|
|
65
74
|
|
|
66
75
|
return {
|
|
67
76
|
start(t: number, label?: string) {
|
|
68
77
|
total = t;
|
|
69
78
|
count = 0;
|
|
79
|
+
lastLabel = label ?? "";
|
|
70
80
|
if (silent) return;
|
|
71
81
|
if (interactive) {
|
|
72
|
-
const initial = renderSpinnerText(
|
|
82
|
+
const initial = renderSpinnerText(lastLabel);
|
|
73
83
|
spinner = logger.startSpinner(initial);
|
|
74
84
|
} else if (label) {
|
|
75
85
|
logger.info(`${label}: 0/${total}`);
|
|
@@ -77,6 +87,7 @@ export function createProgress(): Progress {
|
|
|
77
87
|
},
|
|
78
88
|
tick(label: string) {
|
|
79
89
|
count += 1;
|
|
90
|
+
lastLabel = label;
|
|
80
91
|
if (silent) return;
|
|
81
92
|
if (interactive && spinner) {
|
|
82
93
|
spinner.update(renderSpinnerText(label));
|
|
@@ -84,6 +95,11 @@ export function createProgress(): Progress {
|
|
|
84
95
|
logger.info(`[${count}/${total}] ${label}`);
|
|
85
96
|
}
|
|
86
97
|
},
|
|
98
|
+
update(suffix: string) {
|
|
99
|
+
if (silent) return;
|
|
100
|
+
if (!interactive || !spinner) return;
|
|
101
|
+
spinner.update(renderSpinnerText(lastLabel, suffix));
|
|
102
|
+
},
|
|
87
103
|
entry(line: string) {
|
|
88
104
|
if (silent) return;
|
|
89
105
|
logger.info(line);
|
package/src/refresh/runner.ts
CHANGED
|
@@ -24,9 +24,16 @@ export interface RefreshOutcome {
|
|
|
24
24
|
* via the persisted mcpx invocation), and creates a new version only if
|
|
25
25
|
* the source bytes changed. Always updates `refreshed_at` and
|
|
26
26
|
* `last_refresh_status` on the row. Returns a per-path outcome — never
|
|
27
|
-
* throws unless the path doesn't exist.
|
|
27
|
+
* throws unless the path doesn't exist. The optional `onEmbedProgress`
|
|
28
|
+
* callback is forwarded to the embedder so interactive callers (e.g. the
|
|
29
|
+
* `refresh` operation) can drive a spinner during the slow phase.
|
|
28
30
|
*/
|
|
29
|
-
export async function refreshOne(
|
|
31
|
+
export async function refreshOne(
|
|
32
|
+
ctx: AppContext,
|
|
33
|
+
logicalPath: string,
|
|
34
|
+
force = false,
|
|
35
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
36
|
+
): Promise<RefreshOutcome> {
|
|
30
37
|
const cur = await getCurrent(ctx.db, logicalPath);
|
|
31
38
|
if (!cur) {
|
|
32
39
|
throw new HelpfulError({
|
|
@@ -42,10 +49,10 @@ export async function refreshOne(ctx: AppContext, logicalPath: string, force = f
|
|
|
42
49
|
|
|
43
50
|
try {
|
|
44
51
|
if (cur.source_type === "local") {
|
|
45
|
-
return await refreshLocal(ctx, cur, force);
|
|
52
|
+
return await refreshLocal(ctx, cur, force, onEmbedProgress);
|
|
46
53
|
}
|
|
47
54
|
if (cur.source_type === "remote") {
|
|
48
|
-
return await refreshRemote(ctx, cur, force);
|
|
55
|
+
return await refreshRemote(ctx, cur, force, onEmbedProgress);
|
|
49
56
|
}
|
|
50
57
|
} catch (err) {
|
|
51
58
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -74,7 +81,12 @@ interface CurrentRow {
|
|
|
74
81
|
}
|
|
75
82
|
|
|
76
83
|
/** Local-file refresh: stat-then-sha gate before re-running the pipeline. */
|
|
77
|
-
async function refreshLocal(
|
|
84
|
+
async function refreshLocal(
|
|
85
|
+
ctx: AppContext,
|
|
86
|
+
cur: CurrentRow,
|
|
87
|
+
force: boolean,
|
|
88
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
89
|
+
): Promise<RefreshOutcome> {
|
|
78
90
|
if (!cur.source_path) {
|
|
79
91
|
throw new HelpfulError({
|
|
80
92
|
kind: "input_error",
|
|
@@ -92,26 +104,35 @@ async function refreshLocal(ctx: AppContext, cur: CurrentRow, force: boolean): P
|
|
|
92
104
|
return { logical_path: cur.logical_path, status: "unchanged" };
|
|
93
105
|
}
|
|
94
106
|
|
|
95
|
-
const versionId = await runPipelineForRefresh(
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
107
|
+
const versionId = await runPipelineForRefresh(
|
|
108
|
+
ctx,
|
|
109
|
+
{
|
|
110
|
+
logicalPath: cur.logical_path,
|
|
111
|
+
bytes: local.bytes,
|
|
112
|
+
mime: local.mimeType,
|
|
113
|
+
source: cur.source_path,
|
|
114
|
+
sourceType: "local",
|
|
115
|
+
sourcePath: cur.source_path,
|
|
116
|
+
sourceMtimeMs: local.mtimeMs,
|
|
117
|
+
sourceSha: local.sha256,
|
|
118
|
+
fetcher: "local",
|
|
119
|
+
fetcherServer: null,
|
|
120
|
+
fetcherTool: null,
|
|
121
|
+
fetcherArgs: null,
|
|
122
|
+
refreshSec: cur.refresh_frequency_sec,
|
|
123
|
+
},
|
|
124
|
+
onEmbedProgress,
|
|
125
|
+
);
|
|
110
126
|
return { logical_path: cur.logical_path, status: "ok", new_version_id: versionId };
|
|
111
127
|
}
|
|
112
128
|
|
|
113
129
|
/** Remote refresh: replay the persisted mcpx invocation, or plain HTTP. */
|
|
114
|
-
async function refreshRemote(
|
|
130
|
+
async function refreshRemote(
|
|
131
|
+
ctx: AppContext,
|
|
132
|
+
cur: CurrentRow,
|
|
133
|
+
force: boolean,
|
|
134
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
135
|
+
): Promise<RefreshOutcome> {
|
|
115
136
|
if (!cur.source_path) {
|
|
116
137
|
throw new HelpfulError({
|
|
117
138
|
kind: "input_error",
|
|
@@ -129,21 +150,25 @@ async function refreshRemote(ctx: AppContext, cur: CurrentRow, force: boolean):
|
|
|
129
150
|
return { logical_path: cur.logical_path, status: "unchanged" };
|
|
130
151
|
}
|
|
131
152
|
|
|
132
|
-
const versionId = await runPipelineForRefresh(
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
153
|
+
const versionId = await runPipelineForRefresh(
|
|
154
|
+
ctx,
|
|
155
|
+
{
|
|
156
|
+
logicalPath: cur.logical_path,
|
|
157
|
+
bytes: fetched.bytes,
|
|
158
|
+
mime: fetched.mimeType,
|
|
159
|
+
source: cur.source_path,
|
|
160
|
+
sourceType: "remote",
|
|
161
|
+
sourcePath: cur.source_path,
|
|
162
|
+
sourceMtimeMs: null,
|
|
163
|
+
sourceSha: fetched.sha256,
|
|
164
|
+
fetcher: cur.fetcher === "mcpx" ? "mcpx" : "http",
|
|
165
|
+
fetcherServer: fetched.fetcherServer,
|
|
166
|
+
fetcherTool: fetched.fetcherTool,
|
|
167
|
+
fetcherArgs: fetched.fetcherArgs,
|
|
168
|
+
refreshSec: cur.refresh_frequency_sec,
|
|
169
|
+
},
|
|
170
|
+
onEmbedProgress,
|
|
171
|
+
);
|
|
147
172
|
return { logical_path: cur.logical_path, status: "ok", new_version_id: versionId };
|
|
148
173
|
}
|
|
149
174
|
|
|
@@ -237,7 +262,11 @@ interface PipelineParams {
|
|
|
237
262
|
* fields (`change_note='refresh: source updated'`) aren't accidentally
|
|
238
263
|
* applied to first-time ingests.
|
|
239
264
|
*/
|
|
240
|
-
async function runPipelineForRefresh(
|
|
265
|
+
async function runPipelineForRefresh(
|
|
266
|
+
ctx: AppContext,
|
|
267
|
+
p: PipelineParams,
|
|
268
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
269
|
+
): Promise<string> {
|
|
241
270
|
await upsertBlob(ctx.db, {
|
|
242
271
|
sha256: p.sourceSha,
|
|
243
272
|
mime_type: p.mime,
|
|
@@ -250,7 +279,7 @@ async function runPipelineForRefresh(ctx: AppContext, p: PipelineParams): Promis
|
|
|
250
279
|
const description = await describe(p.logicalPath, p.mime, markdown, ctx.config.llm);
|
|
251
280
|
const chunks = chunkDeterministic(markdown, ctx.config.chunker);
|
|
252
281
|
const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
|
|
253
|
-
const embeddings = await embed(searchTexts, ctx.config.embedding_model);
|
|
282
|
+
const embeddings = await embed(searchTexts, ctx.config.embedding_model, { onProgress: onEmbedProgress });
|
|
254
283
|
|
|
255
284
|
const versionId = millisIso(Date.now());
|
|
256
285
|
const contentSha = sha256Hex(new TextEncoder().encode(markdown));
|