membot 0.5.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/.claude/skills/membot.md +25 -10
  2. package/.cursor/rules/membot.mdc +25 -10
  3. package/README.md +36 -4
  4. package/package.json +8 -5
  5. package/scripts/apply-patches.sh +0 -11
  6. package/scripts/build-test-docx.ts +84 -0
  7. package/src/cli.ts +2 -2
  8. package/src/commands/login-page.mustache +50 -0
  9. package/src/commands/login.ts +83 -0
  10. package/src/config/schemas.ts +23 -5
  11. package/src/constants.ts +20 -1
  12. package/src/context.ts +1 -24
  13. package/src/db/files.ts +21 -25
  14. package/src/db/migrations/003-downloader-columns.ts +58 -0
  15. package/src/db/migrations.ts +2 -1
  16. package/src/ingest/converter/docx.ts +47 -5
  17. package/src/ingest/converter/html.ts +10 -3
  18. package/src/ingest/converter/image.ts +40 -3
  19. package/src/ingest/converter/images-inline.ts +132 -0
  20. package/src/ingest/converter/index.ts +13 -3
  21. package/src/ingest/converter/xlsx.ts +111 -0
  22. package/src/ingest/downloaders/browser.ts +180 -0
  23. package/src/ingest/downloaders/generic-web.ts +81 -0
  24. package/src/ingest/downloaders/github.ts +178 -0
  25. package/src/ingest/downloaders/google-docs.ts +56 -0
  26. package/src/ingest/downloaders/google-shared.ts +86 -0
  27. package/src/ingest/downloaders/google-sheets.ts +58 -0
  28. package/src/ingest/downloaders/google-slides.ts +53 -0
  29. package/src/ingest/downloaders/index.ts +182 -0
  30. package/src/ingest/downloaders/linear.ts +291 -0
  31. package/src/ingest/fetcher.ts +104 -129
  32. package/src/ingest/ingest.ts +44 -71
  33. package/src/mcp/instructions.ts +4 -2
  34. package/src/operations/add.ts +6 -4
  35. package/src/operations/info.ts +4 -6
  36. package/src/operations/move.ts +2 -3
  37. package/src/operations/refresh.ts +2 -4
  38. package/src/operations/remove.ts +23 -2
  39. package/src/operations/tree.ts +1 -1
  40. package/src/operations/types.ts +1 -1
  41. package/src/refresh/runner.ts +60 -115
  42. package/src/types/text-modules.d.ts +5 -0
  43. package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
  44. package/src/commands/mcpx.ts +0 -112
  45. package/src/ingest/agent-fetcher.ts +0 -639
@@ -1,158 +1,133 @@
1
- import type { LlmConfig } from "../config/schemas.ts";
2
- import { DEFAULTS } from "../constants.ts";
3
- import { asHelpful, HelpfulError } from "../errors.ts";
1
+ import { join } from "node:path";
2
+ import type { MembotConfig } from "../config/schemas.ts";
3
+ import { FILES } from "../constants.ts";
4
+ import { HelpfulError } from "../errors.ts";
4
5
  import { logger } from "../output/logger.ts";
5
- import type { AgentMcpxAdapter } from "./agent-fetcher.ts";
6
- import { agentFetch } from "./agent-fetcher.ts";
7
- import { sha256Hex } from "./local-reader.ts";
6
+ import { BrowserPool } from "./downloaders/browser.ts";
7
+ import {
8
+ type DownloadedRemote,
9
+ type Downloader,
10
+ type DownloaderCtx,
11
+ findDownloader,
12
+ findDownloaderByName,
13
+ listDownloaders,
14
+ } from "./downloaders/index.ts";
8
15
 
9
- export interface FetchedRemote {
10
- bytes: Uint8Array;
11
- sha256: string;
12
- mimeType: string;
13
- fetcher: "http" | "mcpx";
14
- fetcherServer: string | null;
15
- fetcherTool: string | null;
16
- fetcherArgs: Record<string, unknown> | null;
17
- sourceUrl: string;
18
- }
16
+ export type FetchedRemote = DownloadedRemote;
19
17
 
20
18
  export interface FetchOptions {
21
19
  /**
22
- * User-provided hint. Free-form keyword (e.g. "firecrawl", "github",
23
- * "google-docs", "http"). Special-cased: "http" forces plain fetch.
24
- * Otherwise the hint is passed verbatim to the agent loop as extra
25
- * guidance about which provider to prefer.
20
+ * Optional explicit downloader override. Free-form; matched
21
+ * case-insensitively against `Downloader.name`. When given, skips the
22
+ * URL-based matching and forces that downloader (useful for the
23
+ * "use the generic-web fallback even though google-docs claimed
24
+ * this URL" escape hatch).
26
25
  */
27
- hint?: string;
28
- /** Live mcpx adapter the agent loop drives via search/list/info/exec. */
29
- mcpx?: AgentMcpxAdapter | null;
26
+ downloaderName?: string;
30
27
  /**
31
- * LLM config. The agent loop needs an Anthropic key; without one the
32
- * mcpx path is skipped and we fall back to plain HTTP.
28
+ * Override the on-disk path of the persistent chromium profile.
29
+ * Defaults to `<ctx.dataDir>/auth/browser-profile`.
33
30
  */
34
- llm?: LlmConfig;
31
+ userDataDir?: string;
32
+ /** Pre-built BrowserPool to share across many fetches (set by ingest's outer loop). */
33
+ pool?: BrowserPool;
35
34
  /**
36
- * Forwarded to the agent loop so callers (e.g. the ingest progress
37
- * reporter) can drive a spinner sublabel from per-turn agent activity.
35
+ * Sublabel hook forwarded to the downloader's `DownloaderCtx`.
36
+ * Drives the per-entry spinner text during multi-step fetches.
38
37
  */
39
38
  onProgress?: (sublabel: string) => void;
40
39
  }
41
40
 
42
41
  /**
43
- * Fetch a remote URL.
44
- *
45
- * - `--fetcher http` (or no mcpx, or no LLM key) → plain HTTP.
46
- * - Otherwise multi-turn agent loop: Claude is given mcpx tools
47
- * (search/list/info/exec) and decides how to retrieve the URL,
48
- * including multi-step flows (start a job poll → download).
49
- * The agent's selected mcp_exec invocation is recorded on the
50
- * returned row so refresh can replay it deterministically without
51
- * another agent round-trip.
52
- *
53
- * If the agent decides plain HTTP is the right call (`request_http_fallback`,
54
- * no tool calls, max turns) we transparently fall through to `httpFetch`.
55
- * If the agent reports an actionable failure, we surface that as a
56
- * `HelpfulError`. If mcpx is configured but the LLM key is missing AND
57
- * the HTTP fallback also fails, we surface an `auth_error` naming the env
58
- * var so users see the real cause instead of a misleading 401.
42
+ * Fetch a remote URL via the per-service downloader registry. Specific
43
+ * downloaders (Google, GitHub, Linear) match first; the generic-web
44
+ * downloader is the always-matching catch-all. Every fetch authenticates
45
+ * via the cookies the user persisted with `membot login`. The returned
46
+ * shape includes the chosen downloader name and its args so refresh can
47
+ * replay it deterministically without involving the LLM.
59
48
  */
60
- export async function fetchRemote(url: string, options: FetchOptions = {}): Promise<FetchedRemote> {
61
- const mcpx = options.mcpx;
62
- const hint = options.hint?.trim();
63
-
64
- if (hint === "http") return httpFetch(url);
65
- if (!mcpx) return httpFetch(url);
66
-
67
- const apiKey = options.llm?.anthropic_api_key?.trim();
68
- if (!apiKey) {
69
- // No way to drive the agent. Try HTTP; if that fails, the user
70
- // almost certainly wanted mcpx surface a clear key-missing error.
71
- try {
72
- return await httpFetch(url);
73
- } catch (err) {
74
- if (err instanceof HelpfulError && err.kind === "network_error") {
75
- throw new HelpfulError({
76
- kind: "auth_error",
77
- message: `${url} couldn't be fetched directly (${err.message}). Membot has mcpx configured, but routing through it requires Claude to translate the URL into the right tool arguments — and ANTHROPIC_API_KEY isn't set.`,
78
- hint: `Set ANTHROPIC_API_KEY in your environment (or under llm.anthropic_api_key in ~/.membot/config.json), then retry. To force the HTTP path explicitly, run \`membot add ${url} --fetcher http\`.`,
79
- });
80
- }
81
- throw err;
82
- }
83
- }
49
+ export async function fetchRemote(
50
+ url: string,
51
+ config: MembotConfig,
52
+ options: FetchOptions = {},
53
+ dataDir?: string,
54
+ ): Promise<FetchedRemote> {
55
+ const downloader = pickDownloader(url, options.downloaderName);
56
+ const userDataDir = options.userDataDir ?? defaultProfileDir(dataDir);
57
+ const ownsPool = !options.pool;
58
+ const headless = !downloader.requireHeaded;
59
+ const pool = options.pool ?? new BrowserPool({ userDataDir, headless });
60
+ const dctx: DownloaderCtx = { pool, logger, config, onProgress: options.onProgress };
84
61
 
85
- let outcome: Awaited<ReturnType<typeof agentFetch>>;
86
62
  try {
87
- outcome = await agentFetch({ url, mcpx, llm: options.llm!, hint, onProgress: options.onProgress });
88
- } catch (err) {
89
- if (err instanceof HelpfulError) throw err;
90
- logger.warn(`agent-fetch failed (${err instanceof Error ? err.message : String(err)}) — falling back to HTTP`);
91
- return httpFetch(url);
63
+ // Fetches are strictly non-interactive: there's no auto-launch
64
+ // of a browser when auth fails. Batch ingest (`membot add` of
65
+ // many URLs) and the refresh daemon both run without a human
66
+ // available to drive a window, so any auth_error must
67
+ // propagate as-is. The HelpfulError's hint tells the user to
68
+ // `membot login` (cookie-based services) or `membot config set
69
+ // downloaders.<svc>.api_key` (API-key services); they fix it
70
+ // once and re-run.
71
+ return await downloader.download(new URL(url), dctx);
72
+ } finally {
73
+ if (ownsPool) await pool.dispose();
92
74
  }
75
+ }
93
76
 
94
- if (outcome.kind === "accepted") {
95
- return {
96
- bytes: outcome.result.bytes,
97
- sha256: outcome.result.sha256,
98
- mimeType: outcome.result.mimeType,
99
- fetcher: "mcpx",
100
- fetcherServer: outcome.result.fetcherServer,
101
- fetcherTool: outcome.result.fetcherTool,
102
- fetcherArgs: outcome.result.fetcherArgs,
103
- sourceUrl: url,
104
- };
77
+ /**
78
+ * Replay a fetch by downloader name (used by refresh). Looks up the
79
+ * persisted downloader by name and calls it against the original URL —
80
+ * deterministic, no agent loop. When the persisted downloader is no
81
+ * longer registered (e.g. from a prior membot version), falls back to
82
+ * URL-based dispatch so refresh degrades gracefully instead of erroring.
83
+ */
84
+ export async function fetchRemoteByDownloader(
85
+ downloaderName: string | null,
86
+ url: string,
87
+ pool: BrowserPool,
88
+ config: MembotConfig,
89
+ ): Promise<FetchedRemote> {
90
+ const named = downloaderName ? findDownloaderByName(downloaderName) : null;
91
+ const downloader = named ?? findDownloader(url);
92
+ if (!downloader) {
93
+ throw new HelpfulError({
94
+ kind: "input_error",
95
+ message: `no downloader matches ${url}`,
96
+ hint: "Re-add the URL with `membot add <url>` to pick a fresh downloader.",
97
+ });
105
98
  }
106
- logger.info(`[fetcher] falling back to HTTP: ${outcome.reason}`);
107
- return httpFetch(url);
99
+ const dctx: DownloaderCtx = { pool, logger, config };
100
+ return downloader.download(new URL(url), dctx);
108
101
  }
109
102
 
110
- /** Plain `fetch` fallback. Used when mcpx isn't configured or the hint says so. */
111
- async function httpFetch(url: string): Promise<FetchedRemote> {
112
- let resp: Response;
113
- try {
114
- resp = await fetch(url, {
115
- headers: { "User-Agent": "membot/0.1" },
116
- signal: AbortSignal.timeout(DEFAULTS.HTTP_TIMEOUT_MS),
117
- });
118
- } catch (err) {
119
- throw asHelpful(
120
- err,
121
- `while fetching ${url}`,
122
- `Check your network and that ${url} is reachable. For mcpx-managed sources (gdocs/github/firecrawl), set ANTHROPIC_API_KEY so membot can drive an mcpx tool.`,
123
- "network_error",
124
- );
103
+ function pickDownloader(url: string, override?: string): Downloader {
104
+ if (override) {
105
+ const named = findDownloaderByName(override.toLowerCase());
106
+ if (!named) {
107
+ const available = listDownloaders()
108
+ .map((d) => d.name)
109
+ .join(", ");
110
+ throw new HelpfulError({
111
+ kind: "input_error",
112
+ message: `unknown downloader '${override}'`,
113
+ hint: `Pick one of: ${available}.`,
114
+ });
115
+ }
116
+ return named;
125
117
  }
126
- if (!resp.ok) {
118
+ const matched = findDownloader(url);
119
+ if (!matched) {
127
120
  throw new HelpfulError({
128
- kind: "network_error",
129
- message: `HTTP ${resp.status} ${resp.statusText}: ${url}`,
130
- hint: "Verify the URL is reachable and not gated behind auth. For private docs use mcpx (set ANTHROPIC_API_KEY).",
121
+ kind: "input_error",
122
+ message: `not a fetchable URL: ${url}`,
123
+ hint: "Pass an http(s):// URL.",
131
124
  });
132
125
  }
133
- const bytes = new Uint8Array(await resp.arrayBuffer());
134
- const ct = resp.headers.get("content-type") ?? "";
135
- const mime = ct.split(";")[0]?.trim() || "application/octet-stream";
136
- return {
137
- bytes,
138
- sha256: sha256Hex(bytes),
139
- mimeType: mime,
140
- fetcher: "http",
141
- fetcherServer: null,
142
- fetcherTool: null,
143
- fetcherArgs: null,
144
- sourceUrl: url,
145
- };
126
+ return matched;
146
127
  }
147
128
 
148
- /**
149
- * Detect MCP `CallToolResult` envelopes that signal tool failure. MCP
150
- * tool errors don't throw — they return `{ isError: true, content: [...] }`
151
- * — so callers must check this explicitly before treating the content
152
- * as a successful payload. Used by the refresh runner; the agent loop
153
- * has its own preview-aware check.
154
- */
155
- export function isMcpToolError(result: unknown): boolean {
156
- if (!result || typeof result !== "object") return false;
157
- return (result as { isError?: unknown }).isError === true;
129
+ function defaultProfileDir(dataDir?: string): string {
130
+ if (dataDir) return join(dataDir, FILES.BROWSER_PROFILE);
131
+ const home = process.env.MEMBOT_HOME ?? `${process.env.HOME ?? "."}/.membot`;
132
+ return join(home, FILES.BROWSER_PROFILE);
158
133
  }
@@ -20,7 +20,7 @@ export interface IngestInput {
20
20
  exclude?: string;
21
21
  follow_symlinks?: boolean;
22
22
  refresh_frequency?: string;
23
- fetcher_hint?: string;
23
+ downloader?: string;
24
24
  change_note?: string;
25
25
  force?: boolean;
26
26
  }
@@ -161,13 +161,12 @@ async function ingestInline(
161
161
  bytes: null,
162
162
  markdown: text,
163
163
  fetcher: "inline",
164
- fetcherServer: null,
165
- fetcherTool: null,
166
- fetcherArgs: null,
164
+ downloader: null,
165
+ downloaderArgs: null,
167
166
  refreshSec,
168
167
  changeNote: input.change_note ?? null,
169
168
  },
170
- (done, total) => callbacks?.onEntryProgress?.(logicalPath, `embedding ${done}/${total}`),
169
+ (sublabel) => callbacks?.onEntryProgress?.(logicalPath, sublabel),
171
170
  );
172
171
  result.version_id = versionId;
173
172
  } catch (err) {
@@ -187,38 +186,6 @@ async function ingestUrl(
187
186
  force: boolean,
188
187
  callbacks?: IngestCallbacks,
189
188
  ): Promise<IngestResult> {
190
- const mcpxAdapter = ctx.mcpx
191
- ? {
192
- async search(query: string, options?: { keywordOnly?: boolean; semanticOnly?: boolean }) {
193
- try {
194
- const results = await ctx.mcpx!.search(query, options);
195
- return results.map((r) => ({
196
- server: r.server,
197
- tool: r.tool,
198
- description: r.description ?? undefined,
199
- score: r.score,
200
- matchType: r.matchType ?? undefined,
201
- }));
202
- } catch (err) {
203
- logger.debug(`mcpx.search(${query}) failed: ${err instanceof Error ? err.message : String(err)}`);
204
- return [];
205
- }
206
- },
207
- async listTools(server?: string) {
208
- const tools = await ctx.mcpx!.listTools(server);
209
- return tools.map((t) => ({ server: t.server, tool: { name: t.tool.name, description: t.tool.description } }));
210
- },
211
- async info(server: string, tool: string) {
212
- const t = await ctx.mcpx!.info(server, tool);
213
- if (!t) return undefined;
214
- return { name: t.name, description: t.description, inputSchema: t.inputSchema };
215
- },
216
- async exec(server: string, tool: string, args?: Record<string, unknown>) {
217
- return ctx.mcpx!.exec(server, tool, args ?? {});
218
- },
219
- }
220
- : null;
221
-
222
189
  const logicalPath = input.logical_path ?? defaultLogicalForUrl(url);
223
190
  callbacks?.onEntryStart?.(url);
224
191
  const result: IngestEntryResult = {
@@ -228,20 +195,24 @@ async function ingestUrl(
228
195
  status: "ok",
229
196
  mime_type: null,
230
197
  size_bytes: 0,
231
- fetcher: "http",
198
+ fetcher: "downloader",
232
199
  source_sha256: "",
233
200
  };
234
201
 
235
202
  try {
236
- const fetched = await fetchRemote(url, {
237
- hint: input.fetcher_hint,
238
- mcpx: mcpxAdapter,
239
- llm: ctx.config.llm,
240
- onProgress: (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
241
- });
203
+ callbacks?.onEntryProgress?.(url, "fetching");
204
+ const fetched = await fetchRemote(
205
+ url,
206
+ ctx.config,
207
+ {
208
+ downloaderName: input.downloader,
209
+ onProgress: (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
210
+ },
211
+ ctx.dataDir,
212
+ );
242
213
  result.mime_type = fetched.mimeType;
243
214
  result.size_bytes = fetched.bytes.byteLength;
244
- result.fetcher = fetched.fetcher;
215
+ result.fetcher = "downloader";
245
216
  result.source_sha256 = fetched.sha256;
246
217
 
247
218
  if (!force) {
@@ -265,14 +236,13 @@ async function ingestUrl(
265
236
  sourcePath: url,
266
237
  sourceMtimeMs: null,
267
238
  sourceSha: fetched.sha256,
268
- fetcher: fetched.fetcher,
269
- fetcherServer: fetched.fetcherServer,
270
- fetcherTool: fetched.fetcherTool,
271
- fetcherArgs: fetched.fetcherArgs,
239
+ fetcher: "downloader",
240
+ downloader: fetched.downloader,
241
+ downloaderArgs: fetched.downloaderArgs,
272
242
  refreshSec,
273
243
  changeNote: input.change_note ?? null,
274
244
  },
275
- (done, total) => callbacks?.onEntryProgress?.(url, `embedding ${done}/${total}`),
245
+ (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
276
246
  );
277
247
  result.version_id = versionId;
278
248
  } catch (err) {
@@ -352,13 +322,12 @@ async function ingestLocalFiles(
352
322
  sourceMtimeMs: local.mtimeMs,
353
323
  sourceSha: local.sha256,
354
324
  fetcher: "local",
355
- fetcherServer: null,
356
- fetcherTool: null,
357
- fetcherArgs: null,
325
+ downloader: null,
326
+ downloaderArgs: null,
358
327
  refreshSec,
359
328
  changeNote: input.change_note ?? null,
360
329
  },
361
- (done, total) => callbacks?.onEntryProgress?.(entry.relPathFromBase, `embedding ${done}/${total}`),
330
+ (sublabel) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel),
362
331
  );
363
332
  result.version_id = versionId;
364
333
  } catch (err) {
@@ -387,9 +356,8 @@ interface PipelineParams {
387
356
  sourceMtimeMs: number | null;
388
357
  sourceSha: string;
389
358
  fetcher: FetcherKind;
390
- fetcherServer: string | null;
391
- fetcherTool: string | null;
392
- fetcherArgs: Record<string, unknown> | null;
359
+ downloader: string | null;
360
+ downloaderArgs: Record<string, unknown> | null;
393
361
  refreshSec: number | null;
394
362
  changeNote: string | null;
395
363
  }
@@ -404,8 +372,9 @@ interface PipelineParams {
404
372
  async function pipelineForBytes(
405
373
  ctx: AppContext,
406
374
  p: PipelineParams,
407
- onEmbedProgress?: (done: number, total: number) => void,
375
+ onPhase?: (sublabel: string) => void,
408
376
  ): Promise<string> {
377
+ onPhase?.("storing blob");
409
378
  await upsertBlob(ctx.db, {
410
379
  sha256: p.sourceSha,
411
380
  mime_type: p.mime,
@@ -413,7 +382,8 @@ async function pipelineForBytes(
413
382
  bytes: p.bytes,
414
383
  });
415
384
 
416
- const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm);
385
+ onPhase?.("converting");
386
+ const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm, ctx.config.converters);
417
387
  const markdown = conversion.markdown;
418
388
  const contentSha = sha256Hex(new TextEncoder().encode(markdown));
419
389
 
@@ -431,13 +401,12 @@ async function pipelineForBytes(
431
401
  markdown,
432
402
  contentSha,
433
403
  fetcher: p.fetcher,
434
- fetcherServer: p.fetcherServer,
435
- fetcherTool: p.fetcherTool,
436
- fetcherArgs: p.fetcherArgs,
404
+ downloader: p.downloader,
405
+ downloaderArgs: p.downloaderArgs,
437
406
  refreshSec: p.refreshSec,
438
407
  changeNote: p.changeNote,
439
408
  },
440
- onEmbedProgress,
409
+ onPhase,
441
410
  );
442
411
  }
443
412
 
@@ -453,9 +422,8 @@ interface PersistParams {
453
422
  markdown: string;
454
423
  contentSha?: string;
455
424
  fetcher: FetcherKind;
456
- fetcherServer: string | null;
457
- fetcherTool: string | null;
458
- fetcherArgs: Record<string, unknown> | null;
425
+ downloader: string | null;
426
+ downloaderArgs: Record<string, unknown> | null;
459
427
  refreshSec: number | null;
460
428
  changeNote: string | null;
461
429
  }
@@ -469,14 +437,18 @@ interface PersistParams {
469
437
  async function persistVersion(
470
438
  ctx: AppContext,
471
439
  p: PersistParams,
472
- onEmbedProgress?: (done: number, total: number) => void,
440
+ onPhase?: (sublabel: string) => void,
473
441
  ): Promise<string> {
442
+ onPhase?.("describing");
474
443
  const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
444
+ onPhase?.("chunking");
475
445
  const chunks = chunkDeterministic(p.markdown, ctx.config.chunker);
476
446
  const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
477
447
  let embeddings: number[][];
478
448
  try {
479
- embeddings = await embed(searchTexts, ctx.config.embedding_model, { onProgress: onEmbedProgress });
449
+ embeddings = await embed(searchTexts, ctx.config.embedding_model, {
450
+ onProgress: (done, total) => onPhase?.(`embedding ${done}/${total}`),
451
+ });
480
452
  } catch (err) {
481
453
  throw asHelpful(
482
454
  err,
@@ -485,6 +457,7 @@ async function persistVersion(
485
457
  );
486
458
  }
487
459
 
460
+ onPhase?.("persisting");
488
461
  const versionId = millisIso(Date.now());
489
462
  const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
490
463
  await insertVersion(ctx.db, {
@@ -501,9 +474,8 @@ async function persistVersion(
501
474
  mime_type: p.mime,
502
475
  size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
503
476
  fetcher: p.fetcher,
504
- fetcher_server: p.fetcherServer,
505
- fetcher_tool: p.fetcherTool,
506
- fetcher_args: p.fetcherArgs,
477
+ downloader: p.downloader,
478
+ downloader_args: p.downloaderArgs,
507
479
  refresh_frequency_sec: p.refreshSec,
508
480
  refreshed_at: new Date().toISOString(),
509
481
  last_refresh_status: "ok",
@@ -521,6 +493,7 @@ async function persistVersion(
521
493
  embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
522
494
  })),
523
495
  );
496
+ onPhase?.("indexing");
524
497
  await rebuildFts(ctx.db);
525
498
  return versionId;
526
499
  }
@@ -11,8 +11,10 @@ indexed with BM25 — so prefer membot_search to membot_read+grep for discovery.
11
11
  Workflow:
12
12
  1. membot_tree or membot_search to find what already exists before adding new content.
13
13
  2. membot_add to ingest a local file, a URL, or a remote document. URLs are
14
- fetched via mcpx (the chosen invocation is stored so refresh is fast and
15
- deterministic).
14
+ fetched via per-service downloaders (Google Docs, Sheets, Slides, GitHub,
15
+ Linear, with a generic browser print-to-PDF fallback). Authentication
16
+ comes from the user's logged-in browser cookies (saved via \`membot login\`).
17
+ Each row stores which downloader was used so refresh is deterministic.
16
18
  3. membot_read or membot_search hits to consume content.
17
19
  4. membot_write to record agent-authored notes (source_type='inline').
18
20
 
@@ -10,7 +10,7 @@ import { type ResolvedSource, resolveSource } from "../ingest/source-resolver.ts
10
10
  import { colors } from "../output/formatter.ts";
11
11
  import { defineOperation } from "./types.ts";
12
12
 
13
- const FetcherKindEnum = z.enum(["http", "mcpx", "local", "inline"]);
13
+ const FetcherKindEnum = z.enum(["downloader", "local", "inline"]);
14
14
 
15
15
  export const addOperation = defineOperation({
16
16
  name: "membot_add",
@@ -19,7 +19,7 @@ export const addOperation = defineOperation({
19
19
  - a local file path
20
20
  - a local directory (recursive walk, symlinks followed)
21
21
  - a glob pattern (e.g. "docs/**/*.md")
22
- - a URL (fetched via mcpx if configured, otherwise plain HTTP)
22
+ - a URL (fetched via the per-service downloader registry — Google Docs/Sheets/Slides via export endpoints, GitHub + Linear as rendered HTML, anything else through a generic browser print-to-PDF fallback. All fetches authenticate via the user's logged-in browser session — run \`membot login\` once to sign in.)
23
23
  - "inline:<text>" literal
24
24
  Pass any number of args; each is resolved independently and the matched entries are concatenated into one response. PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. By default, re-ingesting an unchanged source (same source_sha256 as the current version) is a no-op and reports \`status: "unchanged"\`; pass \`force=true\` to always create a new version. Each newly-ingested file becomes a new version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.
25
25
 
@@ -54,10 +54,12 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
54
54
  .default(true)
55
55
  .describe("Follow symlinks during directory walks (cycles broken via realpath)"),
56
56
  refresh_frequency: z.string().optional().describe("Auto-refresh cadence: 5m | 1h | 24h | 7d. Omit to disable."),
57
- fetcher_hint: z
57
+ downloader: z
58
58
  .string()
59
59
  .optional()
60
- .describe("Free-form hint passed to mcpx tool search (e.g. 'firecrawl', 'github', 'google docs', 'http')"),
60
+ .describe(
61
+ "Force a specific downloader by name (e.g. 'google-docs', 'github', 'generic-web'). Skips URL-based matching.",
62
+ ),
61
63
  change_note: z.string().optional().describe("Free-text note attached to the new version"),
62
64
  force: z
63
65
  .boolean()
@@ -25,9 +25,8 @@ export const infoOperation = defineOperation({
25
25
  size_bytes: z.number().nullable(),
26
26
  description: z.string().nullable(),
27
27
  fetcher: z.string().nullable(),
28
- fetcher_server: z.string().nullable(),
29
- fetcher_tool: z.string().nullable(),
30
- fetcher_args: z.record(z.string(), z.unknown()).nullable(),
28
+ downloader: z.string().nullable(),
29
+ downloader_args: z.record(z.string(), z.unknown()).nullable(),
31
30
  refresh_frequency_sec: z.number().nullable(),
32
31
  refreshed_at: z.string().nullable(),
33
32
  last_refresh_status: z.string().nullable(),
@@ -53,9 +52,8 @@ export const infoOperation = defineOperation({
53
52
  lines.push(fmt("blob_sha256", orDash(result.blob_sha256)));
54
53
  lines.push(fmt("source_sha256", orDash(result.source_sha256)));
55
54
  if (result.fetcher) lines.push(fmt("fetcher", result.fetcher));
56
- if (result.fetcher_server) lines.push(fmt("fetcher_server", result.fetcher_server));
57
- if (result.fetcher_tool) lines.push(fmt("fetcher_tool", result.fetcher_tool));
58
- if (result.fetcher_args) lines.push(fmt("fetcher_args", JSON.stringify(result.fetcher_args)));
55
+ if (result.downloader) lines.push(fmt("downloader", result.downloader));
56
+ if (result.downloader_args) lines.push(fmt("downloader_args", JSON.stringify(result.downloader_args)));
59
57
  lines.push(
60
58
  fmt(
61
59
  "refresh_frequency",
@@ -54,9 +54,8 @@ export const moveOperation = defineOperation({
54
54
  mime_type: cur.mime_type,
55
55
  size_bytes: cur.size_bytes,
56
56
  fetcher: cur.fetcher,
57
- fetcher_server: cur.fetcher_server,
58
- fetcher_tool: cur.fetcher_tool,
59
- fetcher_args: cur.fetcher_args,
57
+ downloader: cur.downloader,
58
+ downloader_args: cur.downloader_args,
60
59
  refresh_frequency_sec: cur.refresh_frequency_sec,
61
60
  refreshed_at: cur.refreshed_at,
62
61
  last_refresh_status: cur.last_refresh_status,
@@ -7,7 +7,7 @@ import { defineOperation } from "./types.ts";
7
7
  export const refreshOperation = defineOperation({
8
8
  name: "membot_refresh",
9
9
  cliName: "refresh",
10
- description: `Re-read a file's source and create a new version only if the source bytes changed. Pass \`logical_path\` to refresh one file, or omit it to refresh every file whose refresh_frequency_sec has elapsed. Local files are detected via mtime+sha; remote files are re-fetched via the same mcpx invocation that was originally used. On auth or network failure the prior version stays current — check \`last_refresh_status\`.`,
10
+ description: `Re-read a file's source and create a new version only if the source bytes changed. Pass \`logical_path\` to refresh one file, or omit it to refresh every file whose refresh_frequency_sec has elapsed. Local files are detected via mtime+sha; remote files are re-fetched via the same downloader (Google Docs, GitHub, etc.) that was originally chosen. On auth or network failure the prior version stays current — check \`last_refresh_status\`. If the failure mentions a login redirect, re-run \`membot login\` and try again.`,
11
11
  inputSchema: z.object({
12
12
  logical_path: z.string().optional().describe("Single path to refresh; omit for all-due"),
13
13
  force: z.boolean().default(false).describe("Re-embed even if source sha is unchanged"),
@@ -60,9 +60,7 @@ export const refreshOperation = defineOperation({
60
60
  for (const path of targets) {
61
61
  ctx.progress.tick(path);
62
62
  try {
63
- const r = await refreshOne(ctx, path, input.force, (done, total) =>
64
- ctx.progress.update(`embedding ${done}/${total}`),
65
- );
63
+ const r = await refreshOne(ctx, path, input.force, (sublabel) => ctx.progress.update(sublabel));
66
64
  out.push(r);
67
65
  } catch (err) {
68
66
  out.push({ logical_path: path, status: "failed", error: err instanceof Error ? err.message : String(err) });