membot 0.5.2 → 0.7.0
- package/.claude/skills/membot.md +25 -10
- package/.cursor/rules/membot.mdc +25 -10
- package/README.md +36 -4
- package/package.json +8 -5
- package/scripts/apply-patches.sh +0 -11
- package/scripts/build-test-docx.ts +84 -0
- package/src/cli.ts +2 -2
- package/src/commands/login-page.mustache +50 -0
- package/src/commands/login.ts +83 -0
- package/src/config/schemas.ts +23 -5
- package/src/constants.ts +20 -1
- package/src/context.ts +1 -24
- package/src/db/files.ts +21 -25
- package/src/db/migrations/003-downloader-columns.ts +58 -0
- package/src/db/migrations.ts +2 -1
- package/src/ingest/converter/docx.ts +47 -5
- package/src/ingest/converter/html.ts +10 -3
- package/src/ingest/converter/image.ts +40 -3
- package/src/ingest/converter/images-inline.ts +132 -0
- package/src/ingest/converter/index.ts +13 -3
- package/src/ingest/converter/xlsx.ts +111 -0
- package/src/ingest/downloaders/browser.ts +180 -0
- package/src/ingest/downloaders/generic-web.ts +81 -0
- package/src/ingest/downloaders/github.ts +178 -0
- package/src/ingest/downloaders/google-docs.ts +56 -0
- package/src/ingest/downloaders/google-shared.ts +86 -0
- package/src/ingest/downloaders/google-sheets.ts +58 -0
- package/src/ingest/downloaders/google-slides.ts +53 -0
- package/src/ingest/downloaders/index.ts +182 -0
- package/src/ingest/downloaders/linear.ts +291 -0
- package/src/ingest/fetcher.ts +104 -129
- package/src/ingest/ingest.ts +44 -71
- package/src/mcp/instructions.ts +4 -2
- package/src/operations/add.ts +6 -4
- package/src/operations/info.ts +4 -6
- package/src/operations/move.ts +2 -3
- package/src/operations/refresh.ts +2 -4
- package/src/operations/remove.ts +23 -2
- package/src/operations/tree.ts +1 -1
- package/src/operations/types.ts +1 -1
- package/src/refresh/runner.ts +60 -115
- package/src/types/text-modules.d.ts +5 -0
- package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
- package/src/commands/mcpx.ts +0 -112
- package/src/ingest/agent-fetcher.ts +0 -639
package/src/ingest/fetcher.ts
CHANGED
@@ -1,158 +1,133 @@
-import …
-import { …
-import { …
+import { join } from "node:path";
+import type { MembotConfig } from "../config/schemas.ts";
+import { FILES } from "../constants.ts";
+import { HelpfulError } from "../errors.ts";
 import { logger } from "../output/logger.ts";
-import …
-import { …
-…
+import { BrowserPool } from "./downloaders/browser.ts";
+import {
+  type DownloadedRemote,
+  type Downloader,
+  type DownloaderCtx,
+  findDownloader,
+  findDownloaderByName,
+  listDownloaders,
+} from "./downloaders/index.ts";
 
-export …
-  bytes: Uint8Array;
-  sha256: string;
-  mimeType: string;
-  fetcher: "http" | "mcpx";
-  fetcherServer: string | null;
-  fetcherTool: string | null;
-  fetcherArgs: Record<string, unknown> | null;
-  sourceUrl: string;
-}
+export type FetchedRemote = DownloadedRemote;
 
 export interface FetchOptions {
   /**
-   * …
+   * Optional explicit downloader override. Free-form; matched
+   * case-insensitively against `Downloader.name`. When given, skips the
+   * URL-based matching and forces that downloader (useful for the
+   * "use the generic-web fallback even though google-docs claimed
+   * this URL" escape hatch).
    */
-  …
-  /** Live mcpx adapter the agent loop drives via search/list/info/exec. */
-  mcpx?: AgentMcpxAdapter | null;
+  downloaderName?: string;
   /**
-   * …
+   * Override the on-disk path of the persistent chromium profile.
+   * Defaults to `<ctx.dataDir>/auth/browser-profile`.
    */
-  …
+  userDataDir?: string;
+  /** Pre-built BrowserPool to share across many fetches (set by ingest's outer loop). */
+  pool?: BrowserPool;
   /**
-   * …
+   * Sublabel hook forwarded to the downloader's `DownloaderCtx`.
+   * Drives the per-entry spinner text during multi-step fetches.
    */
   onProgress?: (sublabel: string) => void;
 }
 
 /**
- * Fetch a remote URL.
- * …
- * The agent's selected mcp_exec invocation is recorded on the
- * returned row so refresh can replay it deterministically without
- * another agent round-trip.
- *
- * If the agent decides plain HTTP is the right call (`request_http_fallback`,
- * no tool calls, max turns) we transparently fall through to `httpFetch`.
- * If the agent reports an actionable failure, we surface that as a
- * `HelpfulError`. If mcpx is configured but the LLM key is missing AND
- * the HTTP fallback also fails, we surface an `auth_error` naming the env
- * var so users see the real cause instead of a misleading 401.
+ * Fetch a remote URL via the per-service downloader registry. Specific
+ * downloaders (Google, GitHub, Linear) match first; the generic-web
+ * downloader is the always-matching catch-all. Every fetch authenticates
+ * via the cookies the user persisted with `membot login`. The returned
+ * shape includes the chosen downloader name and its args so refresh can
+ * replay it deterministically without involving the LLM.
 */
-export async function fetchRemote(
-  …
-  const …
-  …
-    return await httpFetch(url);
-  } catch (err) {
-    if (err instanceof HelpfulError && err.kind === "network_error") {
-      throw new HelpfulError({
-        kind: "auth_error",
-        message: `${url} couldn't be fetched directly (${err.message}). Membot has mcpx configured, but routing through it requires Claude to translate the URL into the right tool arguments — and ANTHROPIC_API_KEY isn't set.`,
-        hint: `Set ANTHROPIC_API_KEY in your environment (or under llm.anthropic_api_key in ~/.membot/config.json), then retry. To force the HTTP path explicitly, run \`membot add ${url} --fetcher http\`.`,
-      });
-    }
-    throw err;
-  }
-}
+export async function fetchRemote(
+  url: string,
+  config: MembotConfig,
+  options: FetchOptions = {},
+  dataDir?: string,
+): Promise<FetchedRemote> {
+  const downloader = pickDownloader(url, options.downloaderName);
+  const userDataDir = options.userDataDir ?? defaultProfileDir(dataDir);
+  const ownsPool = !options.pool;
+  const headless = !downloader.requireHeaded;
+  const pool = options.pool ?? new BrowserPool({ userDataDir, headless });
+  const dctx: DownloaderCtx = { pool, logger, config, onProgress: options.onProgress };
 
-  let outcome: Awaited<ReturnType<typeof agentFetch>>;
   try {
-    …
+    // Fetches are strictly non-interactive: there's no auto-launch
+    // of a browser when auth fails. Batch ingest (`membot add` of
+    // many URLs) and the refresh daemon both run without a human
+    // available to drive a window, so any auth_error must
+    // propagate as-is. The HelpfulError's hint tells the user to
+    // `membot login` (cookie-based services) or `membot config set
+    // downloaders.<svc>.api_key` (API-key services); they fix it
+    // once and re-run.
+    return await downloader.download(new URL(url), dctx);
+  } finally {
+    if (ownsPool) await pool.dispose();
   }
+}
 
-…
+/**
+ * Replay a fetch by downloader name (used by refresh). Looks up the
+ * persisted downloader by name and calls it against the original URL —
+ * deterministic, no agent loop. When the persisted downloader is no
+ * longer registered (e.g. from a prior membot version), falls back to
+ * URL-based dispatch so refresh degrades gracefully instead of erroring.
+ */
+export async function fetchRemoteByDownloader(
+  downloaderName: string | null,
+  url: string,
+  pool: BrowserPool,
+  config: MembotConfig,
+): Promise<FetchedRemote> {
+  const named = downloaderName ? findDownloaderByName(downloaderName) : null;
+  const downloader = named ?? findDownloader(url);
+  if (!downloader) {
+    throw new HelpfulError({
+      kind: "input_error",
+      message: `no downloader matches ${url}`,
+      hint: "Re-add the URL with `membot add <url>` to pick a fresh downloader.",
+    });
   }
-  …
-  return …
+  const dctx: DownloaderCtx = { pool, logger, config };
+  return downloader.download(new URL(url), dctx);
 }
 
-…
-  );
+function pickDownloader(url: string, override?: string): Downloader {
+  if (override) {
+    const named = findDownloaderByName(override.toLowerCase());
+    if (!named) {
+      const available = listDownloaders()
+        .map((d) => d.name)
+        .join(", ");
+      throw new HelpfulError({
+        kind: "input_error",
+        message: `unknown downloader '${override}'`,
+        hint: `Pick one of: ${available}.`,
+      });
+    }
+    return named;
   }
-  …
+  const matched = findDownloader(url);
+  if (!matched) {
     throw new HelpfulError({
-      kind: "…
-      message: `…
-      hint: "…
+      kind: "input_error",
+      message: `not a fetchable URL: ${url}`,
+      hint: "Pass an http(s):// URL.",
     });
   }
-
-  const ct = resp.headers.get("content-type") ?? "";
-  const mime = ct.split(";")[0]?.trim() || "application/octet-stream";
-  return {
-    bytes,
-    sha256: sha256Hex(bytes),
-    mimeType: mime,
-    fetcher: "http",
-    fetcherServer: null,
-    fetcherTool: null,
-    fetcherArgs: null,
-    sourceUrl: url,
-  };
+  return matched;
 }
 
-…
- * as a successful payload. Used by the refresh runner; the agent loop
- * has its own preview-aware check.
- */
-export function isMcpToolError(result: unknown): boolean {
-  if (!result || typeof result !== "object") return false;
-  return (result as { isError?: unknown }).isError === true;
+function defaultProfileDir(dataDir?: string): string {
+  if (dataDir) return join(dataDir, FILES.BROWSER_PROFILE);
+  const home = process.env.MEMBOT_HOME ?? `${process.env.HOME ?? "."}/.membot`;
+  return join(home, FILES.BROWSER_PROFILE);
 }
package/src/ingest/ingest.ts
CHANGED
@@ -20,7 +20,7 @@ export interface IngestInput {
   exclude?: string;
   follow_symlinks?: boolean;
   refresh_frequency?: string;
-  …
+  downloader?: string;
   change_note?: string;
   force?: boolean;
 }
@@ -161,13 +161,12 @@ async function ingestInline(
         bytes: null,
         markdown: text,
         fetcher: "inline",
-        …
-        fetcherArgs: null,
+        downloader: null,
+        downloaderArgs: null,
         refreshSec,
         changeNote: input.change_note ?? null,
       },
-      (…
+      (sublabel) => callbacks?.onEntryProgress?.(logicalPath, sublabel),
     );
     result.version_id = versionId;
   } catch (err) {
@@ -187,38 +186,6 @@ async function ingestUrl(
   force: boolean,
   callbacks?: IngestCallbacks,
 ): Promise<IngestResult> {
-  const mcpxAdapter = ctx.mcpx
-    ? {
-        async search(query: string, options?: { keywordOnly?: boolean; semanticOnly?: boolean }) {
-          try {
-            const results = await ctx.mcpx!.search(query, options);
-            return results.map((r) => ({
-              server: r.server,
-              tool: r.tool,
-              description: r.description ?? undefined,
-              score: r.score,
-              matchType: r.matchType ?? undefined,
-            }));
-          } catch (err) {
-            logger.debug(`mcpx.search(${query}) failed: ${err instanceof Error ? err.message : String(err)}`);
-            return [];
-          }
-        },
-        async listTools(server?: string) {
-          const tools = await ctx.mcpx!.listTools(server);
-          return tools.map((t) => ({ server: t.server, tool: { name: t.tool.name, description: t.tool.description } }));
-        },
-        async info(server: string, tool: string) {
-          const t = await ctx.mcpx!.info(server, tool);
-          if (!t) return undefined;
-          return { name: t.name, description: t.description, inputSchema: t.inputSchema };
-        },
-        async exec(server: string, tool: string, args?: Record<string, unknown>) {
-          return ctx.mcpx!.exec(server, tool, args ?? {});
-        },
-      }
-    : null;
-
   const logicalPath = input.logical_path ?? defaultLogicalForUrl(url);
   callbacks?.onEntryStart?.(url);
   const result: IngestEntryResult = {
@@ -228,20 +195,24 @@ async function ingestUrl(
     status: "ok",
     mime_type: null,
     size_bytes: 0,
-    fetcher: "…
+    fetcher: "downloader",
     source_sha256: "",
   };
 
   try {
-    …
+    callbacks?.onEntryProgress?.(url, "fetching");
+    const fetched = await fetchRemote(
+      url,
+      ctx.config,
+      {
+        downloaderName: input.downloader,
+        onProgress: (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
+      },
+      ctx.dataDir,
+    );
     result.mime_type = fetched.mimeType;
     result.size_bytes = fetched.bytes.byteLength;
-    result.fetcher = …
+    result.fetcher = "downloader";
     result.source_sha256 = fetched.sha256;
 
     if (!force) {
@@ -265,14 +236,13 @@ async function ingestUrl(
         sourcePath: url,
         sourceMtimeMs: null,
         sourceSha: fetched.sha256,
-        fetcher: …
-        …
-        fetcherArgs: fetched.fetcherArgs,
+        fetcher: "downloader",
+        downloader: fetched.downloader,
+        downloaderArgs: fetched.downloaderArgs,
         refreshSec,
         changeNote: input.change_note ?? null,
       },
-      (…
+      (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
     );
     result.version_id = versionId;
   } catch (err) {
@@ -352,13 +322,12 @@ async function ingestLocalFiles(
         sourceMtimeMs: local.mtimeMs,
         sourceSha: local.sha256,
         fetcher: "local",
-        …
-        fetcherArgs: null,
+        downloader: null,
+        downloaderArgs: null,
         refreshSec,
         changeNote: input.change_note ?? null,
       },
-      (…
+      (sublabel) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel),
     );
     result.version_id = versionId;
   } catch (err) {
@@ -387,9 +356,8 @@ interface PipelineParams {
   sourceMtimeMs: number | null;
   sourceSha: string;
   fetcher: FetcherKind;
-  …
-  fetcherArgs: Record<string, unknown> | null;
+  downloader: string | null;
+  downloaderArgs: Record<string, unknown> | null;
   refreshSec: number | null;
   changeNote: string | null;
 }
@@ -404,8 +372,9 @@ interface PipelineParams {
 async function pipelineForBytes(
   ctx: AppContext,
   p: PipelineParams,
-  …
+  onPhase?: (sublabel: string) => void,
 ): Promise<string> {
+  onPhase?.("storing blob");
   await upsertBlob(ctx.db, {
     sha256: p.sourceSha,
     mime_type: p.mime,
@@ -413,7 +382,8 @@ async function pipelineForBytes(
     bytes: p.bytes,
   });
 
-  …
+  onPhase?.("converting");
+  const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm, ctx.config.converters);
   const markdown = conversion.markdown;
   const contentSha = sha256Hex(new TextEncoder().encode(markdown));
@@ -431,13 +401,12 @@ async function pipelineForBytes(
       markdown,
       contentSha,
       fetcher: p.fetcher,
-      …
-      fetcherArgs: p.fetcherArgs,
+      downloader: p.downloader,
+      downloaderArgs: p.downloaderArgs,
       refreshSec: p.refreshSec,
       changeNote: p.changeNote,
     },
-    …
+    onPhase,
   );
 }
@@ -453,9 +422,8 @@ interface PersistParams {
   markdown: string;
   contentSha?: string;
   fetcher: FetcherKind;
-  …
-  fetcherArgs: Record<string, unknown> | null;
+  downloader: string | null;
+  downloaderArgs: Record<string, unknown> | null;
   refreshSec: number | null;
   changeNote: string | null;
 }
@@ -469,14 +437,18 @@ interface PersistParams {
 async function persistVersion(
   ctx: AppContext,
   p: PersistParams,
-  …
+  onPhase?: (sublabel: string) => void,
 ): Promise<string> {
+  onPhase?.("describing");
   const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
+  onPhase?.("chunking");
   const chunks = chunkDeterministic(p.markdown, ctx.config.chunker);
   const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
   let embeddings: number[][];
   try {
-    embeddings = await embed(searchTexts, ctx.config.embedding_model, {
+    embeddings = await embed(searchTexts, ctx.config.embedding_model, {
+      onProgress: (done, total) => onPhase?.(`embedding ${done}/${total}`),
+    });
   } catch (err) {
     throw asHelpful(
       err,
@@ -485,6 +457,7 @@ async function persistVersion(
     );
   }
 
+  onPhase?.("persisting");
   const versionId = millisIso(Date.now());
   const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
   await insertVersion(ctx.db, {
@@ -501,9 +474,8 @@ async function persistVersion(
     mime_type: p.mime,
     size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
     fetcher: p.fetcher,
-    …
-    fetcher_args: p.fetcherArgs,
+    downloader: p.downloader,
+    downloader_args: p.downloaderArgs,
     refresh_frequency_sec: p.refreshSec,
     refreshed_at: new Date().toISOString(),
     last_refresh_status: "ok",
@@ -521,6 +493,7 @@ async function persistVersion(
       embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
     })),
   );
+  onPhase?.("indexing");
   await rebuildFts(ctx.db);
   return versionId;
 }
package/src/mcp/instructions.ts
CHANGED
@@ -11,8 +11,10 @@ indexed with BM25 — so prefer membot_search to membot_read+grep for discovery.
 Workflow:
 1. membot_tree or membot_search to find what already exists before adding new content.
 2. membot_add to ingest a local file, a URL, or a remote document. URLs are
-   fetched via …
-   …
+   fetched via per-service downloaders (Google Docs, Sheets, Slides, GitHub,
+   Linear, with a generic browser print-to-PDF fallback). Authentication
+   comes from the user's logged-in browser cookies (saved via \`membot login\`).
+   Each row stores which downloader was used so refresh is deterministic.
 3. membot_read or membot_search hits to consume content.
 4. membot_write to record agent-authored notes (source_type='inline').
 
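The cookie store these instructions rely on is the persistent browser profile that `membot login` fills (login.ts is new in this release but its body is not shown above). A hypothetical sketch of such a login flow, using Playwright's persistent-context API purely as a stand-in, since the diff never names the browser library:

```ts
// Hypothetical sketch of `membot login`; the real login.ts (+83 lines)
// is not visible in this diff, only the profile directory it writes.
import { chromium } from "playwright";

async function login(userDataDir: string): Promise<void> {
  // A headed persistent context: cookies the user acquires by signing in
  // are written to userDataDir and reused by later headless fetches.
  const context = await chromium.launchPersistentContext(userDataDir, { headless: false });
  const page = await context.newPage();
  await page.goto("https://accounts.google.com/");
  // The user signs in to Google, GitHub, Linear, etc., then closes the tab;
  // closing the context flushes the profile to disk.
  await context.close();
}

await login(`${process.env.HOME}/.membot/auth/browser-profile`);
```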
package/src/operations/add.ts
CHANGED
@@ -10,7 +10,7 @@ import { type ResolvedSource, resolveSource } from "../ingest/source-resolver.ts
 import { colors } from "../output/formatter.ts";
 import { defineOperation } from "./types.ts";
 
-const FetcherKindEnum = z.enum(["…
+const FetcherKindEnum = z.enum(["downloader", "local", "inline"]);
 
 export const addOperation = defineOperation({
   name: "membot_add",
@@ -19,7 +19,7 @@ export const addOperation = defineOperation({
 - a local file path
 - a local directory (recursive walk, symlinks followed)
 - a glob pattern (e.g. "docs/**/*.md")
-- a URL (fetched via …
+- a URL (fetched via the per-service downloader registry — Google Docs/Sheets/Slides via export endpoints, GitHub + Linear as rendered HTML, anything else through a generic browser print-to-PDF fallback. All fetches authenticate via the user's logged-in browser session — run \`membot login\` once to sign in.)
 - "inline:<text>" literal
 Pass any number of args; each is resolved independently and the matched entries are concatenated into one response. PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. By default, re-ingesting an unchanged source (same source_sha256 as the current version) is a no-op and reports \`status: "unchanged"\`; pass \`force=true\` to always create a new version. Each newly-ingested file becomes a new version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.
 
@@ -54,10 +54,12 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
     .default(true)
     .describe("Follow symlinks during directory walks (cycles broken via realpath)"),
   refresh_frequency: z.string().optional().describe("Auto-refresh cadence: 5m | 1h | 24h | 7d. Omit to disable."),
-  …
+  downloader: z
     .string()
     .optional()
-    .describe(…
+    .describe(
+      "Force a specific downloader by name (e.g. 'google-docs', 'github', 'generic-web'). Skips URL-based matching.",
+    ),
   change_note: z.string().optional().describe("Free-text note attached to the new version"),
   force: z
     .boolean()
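A small sketch of how the new `downloader` override travels, with the zod pieces in the same style as the schema above; the standalone `AddInput` object here is illustrative, not the package's actual schema:

```ts
// How the override flows, per the diffs above:
// membot_add input -> IngestInput.downloader -> FetchOptions.downloaderName.
import { z } from "zod";

const AddInput = z.object({
  refresh_frequency: z.string().optional(),
  downloader: z.string().optional(), // e.g. "google-docs", "github", "generic-web"
  force: z.boolean().default(false),
});

const input = AddInput.parse({ downloader: "generic-web" });
// ingestUrl forwards it verbatim; pickDownloader() rejects unknown names
// with an input_error that lists the registered downloaders.
console.log(input); // { downloader: "generic-web", force: false }
```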
package/src/operations/info.ts
CHANGED
@@ -25,9 +25,8 @@ export const infoOperation = defineOperation({
     size_bytes: z.number().nullable(),
     description: z.string().nullable(),
     fetcher: z.string().nullable(),
-    …
-    fetcher_args: z.record(z.string(), z.unknown()).nullable(),
+    downloader: z.string().nullable(),
+    downloader_args: z.record(z.string(), z.unknown()).nullable(),
     refresh_frequency_sec: z.number().nullable(),
     refreshed_at: z.string().nullable(),
     last_refresh_status: z.string().nullable(),
@@ -53,9 +52,8 @@ export const infoOperation = defineOperation({
     lines.push(fmt("blob_sha256", orDash(result.blob_sha256)));
     lines.push(fmt("source_sha256", orDash(result.source_sha256)));
     if (result.fetcher) lines.push(fmt("fetcher", result.fetcher));
-    if (result.…
-    if (result.…
-    if (result.fetcher_args) lines.push(fmt("fetcher_args", JSON.stringify(result.fetcher_args)));
+    if (result.downloader) lines.push(fmt("downloader", result.downloader));
+    if (result.downloader_args) lines.push(fmt("downloader_args", JSON.stringify(result.downloader_args)));
     lines.push(
       fmt(
         "refresh_frequency",
package/src/operations/move.ts
CHANGED
@@ -54,9 +54,8 @@ export const moveOperation = defineOperation({
       mime_type: cur.mime_type,
       size_bytes: cur.size_bytes,
       fetcher: cur.fetcher,
-      …
-      fetcher_args: cur.fetcher_args,
+      downloader: cur.downloader,
+      downloader_args: cur.downloader_args,
       refresh_frequency_sec: cur.refresh_frequency_sec,
       refreshed_at: cur.refreshed_at,
       last_refresh_status: cur.last_refresh_status,
package/src/operations/refresh.ts
CHANGED
@@ -7,7 +7,7 @@ import { defineOperation } from "./types.ts";
 export const refreshOperation = defineOperation({
   name: "membot_refresh",
   cliName: "refresh",
-  description: `Re-read a file's source and create a new version only if the source bytes changed. Pass \`logical_path\` to refresh one file, or omit it to refresh every file whose refresh_frequency_sec has elapsed. Local files are detected via mtime+sha; remote files are re-fetched via the same …
+  description: `Re-read a file's source and create a new version only if the source bytes changed. Pass \`logical_path\` to refresh one file, or omit it to refresh every file whose refresh_frequency_sec has elapsed. Local files are detected via mtime+sha; remote files are re-fetched via the same downloader (Google Docs, GitHub, etc.) that was originally chosen. On auth or network failure the prior version stays current — check \`last_refresh_status\`. If the failure mentions a login redirect, re-run \`membot login\` and try again.`,
   inputSchema: z.object({
     logical_path: z.string().optional().describe("Single path to refresh; omit for all-due"),
     force: z.boolean().default(false).describe("Re-embed even if source sha is unchanged"),
@@ -60,9 +60,7 @@ export const refreshOperation = defineOperation({
     for (const path of targets) {
       ctx.progress.tick(path);
       try {
-        const r = await refreshOne(ctx, path, input.force, (…
-          ctx.progress.update(`embedding ${done}/${total}`),
-        );
+        const r = await refreshOne(ctx, path, input.force, (sublabel) => ctx.progress.update(sublabel));
         out.push(r);
       } catch (err) {
         out.push({ logical_path: path, status: "failed", error: err instanceof Error ? err.message : String(err) });