membot 0.5.1 → 0.6.0
- package/.claude/skills/membot.md +25 -10
- package/.cursor/rules/membot.mdc +25 -10
- package/README.md +35 -4
- package/package.json +8 -5
- package/scripts/apply-patches.sh +0 -11
- package/src/cli.ts +2 -2
- package/src/commands/login-page.mustache +50 -0
- package/src/commands/login.ts +83 -0
- package/src/config/schemas.ts +17 -5
- package/src/constants.ts +13 -1
- package/src/context.ts +1 -24
- package/src/db/files.ts +21 -25
- package/src/db/migrations/003-downloader-columns.ts +58 -0
- package/src/db/migrations.ts +2 -1
- package/src/ingest/converter/index.ts +9 -0
- package/src/ingest/converter/xlsx.ts +111 -0
- package/src/ingest/downloaders/browser.ts +180 -0
- package/src/ingest/downloaders/generic-web.ts +81 -0
- package/src/ingest/downloaders/github.ts +178 -0
- package/src/ingest/downloaders/google-docs.ts +56 -0
- package/src/ingest/downloaders/google-shared.ts +86 -0
- package/src/ingest/downloaders/google-sheets.ts +58 -0
- package/src/ingest/downloaders/google-slides.ts +53 -0
- package/src/ingest/downloaders/index.ts +182 -0
- package/src/ingest/downloaders/linear.ts +291 -0
- package/src/ingest/fetcher.ts +107 -127
- package/src/ingest/ingest.ts +43 -69
- package/src/mcp/instructions.ts +4 -2
- package/src/operations/add.ts +6 -4
- package/src/operations/info.ts +4 -6
- package/src/operations/move.ts +2 -3
- package/src/operations/refresh.ts +2 -4
- package/src/operations/remove.ts +23 -2
- package/src/operations/tree.ts +1 -1
- package/src/operations/types.ts +1 -1
- package/src/refresh/runner.ts +59 -114
- package/src/types/text-modules.d.ts +5 -0
- package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
- package/src/commands/mcpx.ts +0 -112
- package/src/ingest/agent-fetcher.ts +0 -564
package/src/ingest/ingest.ts
CHANGED
@@ -20,7 +20,7 @@ export interface IngestInput {
   exclude?: string;
   follow_symlinks?: boolean;
   refresh_frequency?: string;
-
+  downloader?: string;
   change_note?: string;
   force?: boolean;
 }
@@ -161,13 +161,12 @@ async function ingestInline(
       bytes: null,
      markdown: text,
      fetcher: "inline",
-
-
-      fetcherArgs: null,
+      downloader: null,
+      downloaderArgs: null,
      refreshSec,
      changeNote: input.change_note ?? null,
    },
-    (
+    (sublabel) => callbacks?.onEntryProgress?.(logicalPath, sublabel),
  );
  result.version_id = versionId;
 } catch (err) {
@@ -187,38 +186,6 @@ async function ingestUrl(
   force: boolean,
   callbacks?: IngestCallbacks,
 ): Promise<IngestResult> {
-  const mcpxAdapter = ctx.mcpx
-    ? {
-        async search(query: string, options?: { keywordOnly?: boolean; semanticOnly?: boolean }) {
-          try {
-            const results = await ctx.mcpx!.search(query, options);
-            return results.map((r) => ({
-              server: r.server,
-              tool: r.tool,
-              description: r.description ?? undefined,
-              score: r.score,
-              matchType: r.matchType ?? undefined,
-            }));
-          } catch (err) {
-            logger.debug(`mcpx.search(${query}) failed: ${err instanceof Error ? err.message : String(err)}`);
-            return [];
-          }
-        },
-        async listTools(server?: string) {
-          const tools = await ctx.mcpx!.listTools(server);
-          return tools.map((t) => ({ server: t.server, tool: { name: t.tool.name, description: t.tool.description } }));
-        },
-        async info(server: string, tool: string) {
-          const t = await ctx.mcpx!.info(server, tool);
-          if (!t) return undefined;
-          return { name: t.name, description: t.description, inputSchema: t.inputSchema };
-        },
-        async exec(server: string, tool: string, args?: Record<string, unknown>) {
-          return ctx.mcpx!.exec(server, tool, args ?? {});
-        },
-      }
-    : null;
-
   const logicalPath = input.logical_path ?? defaultLogicalForUrl(url);
   callbacks?.onEntryStart?.(url);
   const result: IngestEntryResult = {
@@ -228,19 +195,24 @@ async function ingestUrl(
     status: "ok",
     mime_type: null,
     size_bytes: 0,
-    fetcher: "
+    fetcher: "downloader",
     source_sha256: "",
   };

   try {
-
-
-
-
-
+    callbacks?.onEntryProgress?.(url, "fetching");
+    const fetched = await fetchRemote(
+      url,
+      ctx.config,
+      {
+        downloaderName: input.downloader,
+        onProgress: (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
+      },
+      ctx.dataDir,
+    );
     result.mime_type = fetched.mimeType;
     result.size_bytes = fetched.bytes.byteLength;
-    result.fetcher =
+    result.fetcher = "downloader";
     result.source_sha256 = fetched.sha256;

     if (!force) {
@@ -264,14 +236,13 @@ async function ingestUrl(
         sourcePath: url,
        sourceMtimeMs: null,
        sourceSha: fetched.sha256,
-        fetcher:
-
-
-        fetcherArgs: fetched.fetcherArgs,
+        fetcher: "downloader",
+        downloader: fetched.downloader,
+        downloaderArgs: fetched.downloaderArgs,
        refreshSec,
        changeNote: input.change_note ?? null,
      },
-      (
+      (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
    );
    result.version_id = versionId;
  } catch (err) {
@@ -351,13 +322,12 @@ async function ingestLocalFiles(
        sourceMtimeMs: local.mtimeMs,
        sourceSha: local.sha256,
        fetcher: "local",
-
-
-        fetcherArgs: null,
+        downloader: null,
+        downloaderArgs: null,
        refreshSec,
        changeNote: input.change_note ?? null,
      },
-      (
+      (sublabel) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel),
    );
    result.version_id = versionId;
  } catch (err) {
@@ -386,9 +356,8 @@ interface PipelineParams {
   sourceMtimeMs: number | null;
   sourceSha: string;
   fetcher: FetcherKind;
-
-
-  fetcherArgs: Record<string, unknown> | null;
+  downloader: string | null;
+  downloaderArgs: Record<string, unknown> | null;
   refreshSec: number | null;
   changeNote: string | null;
 }
@@ -403,8 +372,9 @@ interface PipelineParams {
 async function pipelineForBytes(
   ctx: AppContext,
   p: PipelineParams,
-
+  onPhase?: (sublabel: string) => void,
 ): Promise<string> {
+  onPhase?.("storing blob");
   await upsertBlob(ctx.db, {
     sha256: p.sourceSha,
     mime_type: p.mime,
@@ -412,6 +382,7 @@
     bytes: p.bytes,
   });

+  onPhase?.("converting");
   const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm);
   const markdown = conversion.markdown;
   const contentSha = sha256Hex(new TextEncoder().encode(markdown));
@@ -430,13 +401,12 @@
       markdown,
      contentSha,
      fetcher: p.fetcher,
-
-
-      fetcherArgs: p.fetcherArgs,
+      downloader: p.downloader,
+      downloaderArgs: p.downloaderArgs,
      refreshSec: p.refreshSec,
      changeNote: p.changeNote,
    },
-
+    onPhase,
  );
 }

@@ -452,9 +422,8 @@ interface PersistParams {
   markdown: string;
   contentSha?: string;
   fetcher: FetcherKind;
-
-
-  fetcherArgs: Record<string, unknown> | null;
+  downloader: string | null;
+  downloaderArgs: Record<string, unknown> | null;
   refreshSec: number | null;
   changeNote: string | null;
 }
@@ -468,14 +437,18 @@ interface PersistParams {
 async function persistVersion(
   ctx: AppContext,
   p: PersistParams,
-
+  onPhase?: (sublabel: string) => void,
 ): Promise<string> {
+  onPhase?.("describing");
   const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
+  onPhase?.("chunking");
   const chunks = chunkDeterministic(p.markdown, ctx.config.chunker);
   const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
   let embeddings: number[][];
   try {
-    embeddings = await embed(searchTexts, ctx.config.embedding_model, {
+    embeddings = await embed(searchTexts, ctx.config.embedding_model, {
+      onProgress: (done, total) => onPhase?.(`embedding ${done}/${total}`),
+    });
   } catch (err) {
     throw asHelpful(
       err,
@@ -484,6 +457,7 @@ async function persistVersion(
     );
   }

+  onPhase?.("persisting");
  const versionId = millisIso(Date.now());
  const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
  await insertVersion(ctx.db, {
@@ -500,9 +474,8 @@ async function persistVersion(
     mime_type: p.mime,
    size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
    fetcher: p.fetcher,
-
-
-    fetcher_args: p.fetcherArgs,
+    downloader: p.downloader,
+    downloader_args: p.downloaderArgs,
    refresh_frequency_sec: p.refreshSec,
    refreshed_at: new Date().toISOString(),
    last_refresh_status: "ok",
@@ -520,6 +493,7 @@
       embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
    })),
  );
+  onPhase?.("indexing");
  await rebuildFts(ctx.db);
  return versionId;
 }
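Taken together, these ingest.ts changes thread a single `onPhase` sublabel callback through both pipeline stages, so every caller reports the same phase names. A minimal sketch of that progression; the harness below is an illustrative stand-in, not membot code (only the phase strings come from the diff):

```ts
// Hypothetical harness showing the sublabel sequence wired up above.
type OnPhase = (sublabel: string) => void;

async function pipelineSketch(markdown: string, onPhase?: OnPhase): Promise<void> {
  onPhase?.("storing blob"); // pipelineForBytes: upsertBlob
  onPhase?.("converting");   // pipelineForBytes: convert() to markdown
  onPhase?.("describing");   // persistVersion: describe() via the LLM
  onPhase?.("chunking");     // persistVersion: chunkDeterministic()
  const chunks = markdown.split("\n\n"); // stand-in for real chunking
  chunks.forEach((_, i) => onPhase?.(`embedding ${i + 1}/${chunks.length}`));
  onPhase?.("persisting");   // insertVersion + chunk rows
  onPhase?.("indexing");     // rebuildFts
}

// An ingestUrl-style caller prefixes each sublabel with the entry being ingested.
await pipelineSketch("one\n\ntwo", (s) => console.log(`https://example.com: ${s}`));
```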
package/src/mcp/instructions.ts
CHANGED
@@ -11,8 +11,10 @@ indexed with BM25 — so prefer membot_search to membot_read+grep for discovery.
 Workflow:
 1. membot_tree or membot_search to find what already exists before adding new content.
 2. membot_add to ingest a local file, a URL, or a remote document. URLs are
-   fetched via
-
+   fetched via per-service downloaders (Google Docs, Sheets, Slides, GitHub,
+   Linear, with a generic browser print-to-PDF fallback). Authentication
+   comes from the user's logged-in browser cookies (saved via \`membot login\`).
+   Each row stores which downloader was used so refresh is deterministic.
 3. membot_read or membot_search hits to consume content.
 4. membot_write to record agent-authored notes (source_type='inline').
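The selection logic these instructions describe is first-match-wins over the URL, with generic-web as the catch-all. The real registry lives in package/src/ingest/downloaders/index.ts; this sketch only assumes its shape (the `matches()` signature and the hostname rules here are guesses):

```ts
// Assumed registry shape: first downloader whose matches() accepts the URL
// wins; generic-web (browser print-to-PDF) accepts everything as fallback.
interface Downloader {
  name: string;
  matches(url: URL): boolean;
}

const registry: Downloader[] = [
  { name: "google-docs", matches: (u) => u.hostname === "docs.google.com" && u.pathname.startsWith("/document/") },
  { name: "google-sheets", matches: (u) => u.hostname === "docs.google.com" && u.pathname.startsWith("/spreadsheets/") },
  { name: "github", matches: (u) => u.hostname === "github.com" },
  { name: "linear", matches: (u) => u.hostname === "linear.app" },
  { name: "generic-web", matches: () => true },
];

export function pickDownloader(raw: string, forced?: string): Downloader {
  if (forced) {
    // membot_add's `downloader` input skips URL-based matching entirely.
    const d = registry.find((x) => x.name === forced);
    if (!d) throw new Error(`unknown downloader: ${forced}`);
    return d;
  }
  const url = new URL(raw);
  return registry.find((d) => d.matches(url))!; // generic-web guarantees a match
}
```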
package/src/operations/add.ts
CHANGED
@@ -10,7 +10,7 @@ import { type ResolvedSource, resolveSource } from "../ingest/source-resolver.ts
 import { colors } from "../output/formatter.ts";
 import { defineOperation } from "./types.ts";

-const FetcherKindEnum = z.enum(["
+const FetcherKindEnum = z.enum(["downloader", "local", "inline"]);

 export const addOperation = defineOperation({
   name: "membot_add",
@@ -19,7 +19,7 @@ export const addOperation = defineOperation({
 - a local file path
 - a local directory (recursive walk, symlinks followed)
 - a glob pattern (e.g. "docs/**/*.md")
-- a URL (fetched via
+- a URL (fetched via the per-service downloader registry — Google Docs/Sheets/Slides via export endpoints, GitHub + Linear as rendered HTML, anything else through a generic browser print-to-PDF fallback. All fetches authenticate via the user's logged-in browser session — run \`membot login\` once to sign in.)
 - "inline:<text>" literal
 Pass any number of args; each is resolved independently and the matched entries are concatenated into one response. PDF, DOCX, HTML, images, and other binaries are converted to markdown — native libraries first, vision/OCR for images, LLM fallback for messy or scanned input. Original bytes are kept in the blobs table; \`membot_read bytes=true\` returns them. Setting \`refresh_frequency\` enables automatic refresh from the daemon. By default, re-ingesting an unchanged source (same source_sha256 as the current version) is a no-op and reports \`status: "unchanged"\`; pass \`force=true\` to always create a new version. Each newly-ingested file becomes a new version under its own logical_path; existing versions stay queryable via membot_versions. Directory/glob ingests stream one file at a time — partial failures do not abort the rest; the response lists per-entry status.

@@ -54,10 +54,12 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
     .default(true)
     .describe("Follow symlinks during directory walks (cycles broken via realpath)"),
   refresh_frequency: z.string().optional().describe("Auto-refresh cadence: 5m | 1h | 24h | 7d. Omit to disable."),
-
+  downloader: z
     .string()
     .optional()
-    .describe(
+    .describe(
+      "Force a specific downloader by name (e.g. 'google-docs', 'github', 'generic-web'). Skips URL-based matching.",
+    ),
   change_note: z.string().optional().describe("Free-text note attached to the new version"),
   force: z
     .boolean()
package/src/operations/info.ts
CHANGED
@@ -25,9 +25,8 @@ export const infoOperation = defineOperation({
     size_bytes: z.number().nullable(),
     description: z.string().nullable(),
     fetcher: z.string().nullable(),
-
-
-    fetcher_args: z.record(z.string(), z.unknown()).nullable(),
+    downloader: z.string().nullable(),
+    downloader_args: z.record(z.string(), z.unknown()).nullable(),
     refresh_frequency_sec: z.number().nullable(),
     refreshed_at: z.string().nullable(),
     last_refresh_status: z.string().nullable(),
@@ -53,9 +52,8 @@ export const infoOperation = defineOperation({
     lines.push(fmt("blob_sha256", orDash(result.blob_sha256)));
     lines.push(fmt("source_sha256", orDash(result.source_sha256)));
     if (result.fetcher) lines.push(fmt("fetcher", result.fetcher));
-    if (result.
-    if (result.
-    if (result.fetcher_args) lines.push(fmt("fetcher_args", JSON.stringify(result.fetcher_args)));
+    if (result.downloader) lines.push(fmt("downloader", result.downloader));
+    if (result.downloader_args) lines.push(fmt("downloader_args", JSON.stringify(result.downloader_args)));
     lines.push(
       fmt(
         "refresh_frequency",
package/src/operations/move.ts
CHANGED
@@ -54,9 +54,8 @@ export const moveOperation = defineOperation({
     mime_type: cur.mime_type,
     size_bytes: cur.size_bytes,
     fetcher: cur.fetcher,
-
-
-    fetcher_args: cur.fetcher_args,
+    downloader: cur.downloader,
+    downloader_args: cur.downloader_args,
     refresh_frequency_sec: cur.refresh_frequency_sec,
     refreshed_at: cur.refreshed_at,
     last_refresh_status: cur.last_refresh_status,
package/src/operations/refresh.ts
CHANGED

@@ -7,7 +7,7 @@ import { defineOperation } from "./types.ts";
 export const refreshOperation = defineOperation({
   name: "membot_refresh",
   cliName: "refresh",
-  description: `Re-read a file's source and create a new version only if the source bytes changed. Pass \`logical_path\` to refresh one file, or omit it to refresh every file whose refresh_frequency_sec has elapsed. Local files are detected via mtime+sha; remote files are re-fetched via the same
+  description: `Re-read a file's source and create a new version only if the source bytes changed. Pass \`logical_path\` to refresh one file, or omit it to refresh every file whose refresh_frequency_sec has elapsed. Local files are detected via mtime+sha; remote files are re-fetched via the same downloader (Google Docs, GitHub, etc.) that was originally chosen. On auth or network failure the prior version stays current — check \`last_refresh_status\`. If the failure mentions a login redirect, re-run \`membot login\` and try again.`,
   inputSchema: z.object({
     logical_path: z.string().optional().describe("Single path to refresh; omit for all-due"),
     force: z.boolean().default(false).describe("Re-embed even if source sha is unchanged"),
@@ -60,9 +60,7 @@ export const refreshOperation = defineOperation({
     for (const path of targets) {
       ctx.progress.tick(path);
       try {
-        const r = await refreshOne(ctx, path, input.force, (
-          ctx.progress.update(`embedding ${done}/${total}`),
-        );
+        const r = await refreshOne(ctx, path, input.force, (sublabel) => ctx.progress.update(sublabel));
         out.push(r);
       } catch (err) {
         out.push({ logical_path: path, status: "failed", error: err instanceof Error ? err.message : String(err) });
package/src/operations/remove.ts
CHANGED
@@ -10,7 +10,7 @@ export const removeOperation = defineOperation({
   name: "membot_delete",
   cliName: "rm",
   bashEquivalent: "rm",
-  description: `Tombstone one or more logical_paths so they no longer appear in membot_list / membot_tree / membot_search. Each \`paths\` arg is independently treated as either a literal logical_path or a glob pattern (e.g. "docs/**/*.md"); globs are matched against current logical_paths in the DB, not the filesystem. The union of matches is deduplicated, then tombstoned one at a time — partial failures are reported per-entry without aborting the rest. An input arg that matches zero current files is an error (the response includes which arg). Old versions remain queryable via membot_versions and membot_read with an explicit version. Use membot_prune to permanently drop history.`,
+  description: `Tombstone one or more logical_paths so they no longer appear in membot_list / membot_tree / membot_search. Each \`paths\` arg is independently treated as either a literal logical_path or a glob pattern (e.g. "docs/**/*.md"); globs are matched against current logical_paths in the DB, not the filesystem. A literal arg that matches no exact file but is a prefix of existing paths (a "directory") is rejected unless \`recursive\` is true, in which case every path beneath it is tombstoned. The union of matches is deduplicated, then tombstoned one at a time — partial failures are reported per-entry without aborting the rest. An input arg that matches zero current files is an error (the response includes which arg). Old versions remain queryable via membot_versions and membot_read with an explicit version. Use membot_prune to permanently drop history.`,
   inputSchema: z.object({
     paths: z
       .array(z.string())
@@ -18,6 +18,12 @@ export const removeOperation = defineOperation({
       .describe(
         'One or more logical_paths or glob patterns (e.g. "docs/**/*.md"). Each arg is matched independently against current logical_paths in the DB.',
       ),
+    recursive: z
+      .boolean()
+      .default(false)
+      .describe(
+        "If a literal path arg matches no file but is a prefix of existing paths, treat it as a directory and remove everything beneath it. Mirrors `rm -r`. Ignored for glob args.",
+      ),
     change_note: z.string().optional().describe("Why this is being deleted"),
   }),
   outputSchema: z.object({
@@ -33,7 +39,7 @@ export const removeOperation = defineOperation({
     ok: z.number(),
     failed: z.number(),
   }),
-  cli: { positional: ["paths"], aliases: { change_note: "-m" } },
+  cli: { positional: ["paths"], aliases: { change_note: "-m", recursive: "-r" } },
   console_formatter: (result) => {
     const lines = result.removed.map((e) =>
       e.status === "ok"
@@ -59,6 +65,21 @@ export const removeOperation = defineOperation({
       }
     } else if (currentSet.has(arg)) {
       matches.push(arg);
+    } else {
+      const normalized = arg.endsWith("/") ? arg.slice(0, -1) : arg;
+      const dirPrefix = `${normalized}/`;
+      const dirMatches = currentPaths.filter((p) => p.startsWith(dirPrefix));
+      if (dirMatches.length > 0) {
+        if (input.recursive) {
+          matches.push(...dirMatches);
+        } else {
+          throw new HelpfulError({
+            kind: "not_found",
+            message: `\`${arg}\` is a directory (${dirMatches.length} files); pass --recursive to remove its contents`,
+            hint: `Re-run with \`-r\` / \`--recursive\` to tombstone every path under \`${normalized}/\`.`,
+          });
+        }
+      }
     }
     if (matches.length === 0) {
       throw new HelpfulError({
package/src/operations/tree.ts
CHANGED
@@ -18,7 +18,7 @@ export const treeOperation = defineOperation({
   description: `Render the logical-path tree of the current store. Tree is synthesised from "/" segments in logical_path — there are no real directories. Tombstoned and historical versions are hidden. Use this before membot_add to pick a sensible logical path.`,
   inputSchema: z.object({
     prefix: z.string().optional().describe("Only show paths starting with this prefix"),
-    max_depth: z.number().default(
+    max_depth: z.number().default(6).describe("How many path segments deep to render"),
     max_items: z
       .number()
       .default(20)
package/src/operations/types.ts
CHANGED
@@ -39,7 +39,7 @@ export interface Operation<I extends z.ZodObject = z.ZodObject, O extends z.ZodT
    * falls back to pretty-printed JSON.
    */
   console_formatter?: (result: z.infer<O>) => string;
-  /** The work itself. AppContext gives access to db, embedder,
+  /** The work itself. AppContext gives access to db, embedder, logger, config. */
   handler: (input: z.infer<I>, ctx: AppContext) => Promise<z.infer<O>>;
 }
