membot 0.7.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +3 -0
- package/.cursor/rules/membot.mdc +3 -0
- package/README.md +7 -0
- package/package.json +1 -1
- package/src/cli.ts +11 -0
- package/src/config/schemas.ts +33 -0
- package/src/constants.ts +23 -0
- package/src/context.ts +24 -0
- package/src/ingest/concurrency.ts +60 -0
- package/src/ingest/describer.ts +49 -3
- package/src/ingest/embed-worker.ts +74 -0
- package/src/ingest/embedder-pool.ts +391 -0
- package/src/ingest/embedder.ts +40 -2
- package/src/ingest/ingest.ts +277 -67
- package/src/operations/add.ts +139 -99
- package/src/operations/index.ts +2 -0
- package/src/operations/refresh.ts +61 -34
- package/src/operations/stats.ts +342 -0
- package/src/operations/write.ts +48 -40
- package/src/output/formatter.ts +21 -0
- package/src/output/logger.ts +36 -0
- package/src/output/progress.ts +408 -46
- package/src/refresh/scheduler.ts +22 -13
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { DbConnection, SqlParam } from "../db/connection.ts";
|
|
3
|
+
import { listDueRefreshes } from "../db/files.ts";
|
|
4
|
+
import { colors } from "../output/formatter.ts";
|
|
5
|
+
import { defineOperation } from "./types.ts";
|
|
6
|
+
|
|
7
|
+
// Read-only "stats" operation: aggregates counts and sizes across the index
// tables (files, chunks, blobs) plus refresh-schedule health, optionally
// scoped to a logical-path prefix. All heavy lifting lives in the collect*
// helpers below; this object only declares schemas, CLI wiring, and rendering.
export const statsOperation = defineOperation({
  name: "membot_stats",
  cliName: "stats",
  description: `Summarize the local membot index: file/version/chunk/blob counts, total content and on-disk size, refresh health, and breakdowns by source_type, downloader, and mime_type. Optional prefix narrows aggregates to a subtree (same semantics as 'membot tree <prefix>'). Read-only. Use this before membot_prune to gauge how much there is to drop, or as a first call to confirm the index has anything in it.`,
  // Single optional input: a logical-path prefix that scopes every aggregate.
  inputSchema: z.object({
    prefix: z
      .string()
      .optional()
      .describe(
        "Restrict aggregates to logical paths starting with this prefix (e.g. 'docs/api/'). Omit to summarize the whole index.",
      ),
  }),
  // Mirrors the object assembled in `handler` below, one sub-object per section.
  outputSchema: z.object({
    prefix: z.string().nullable(),
    db_path: z.string(),
    db_size_bytes: z.number(),
    files: z.object({
      current: z.number(),
      tombstoned_paths: z.number(),
      total_versions: z.number(),
      distinct_paths: z.number(),
      by_source_type: z.record(z.string(), z.number()),
      by_downloader: z.record(z.string(), z.number()),
      by_mime_type: z.record(z.string(), z.number()),
    }),
    content: z.object({
      total_bytes: z.number(),
      total_versions_bytes: z.number(),
    }),
    chunks: z.object({
      current: z.number(),
      total: z.number(),
    }),
    blobs: z.object({
      count: z.number(),
      total_bytes: z.number(),
    }),
    refresh: z.object({
      scheduled: z.number(),
      due_now: z.number(),
      last_status: z.record(z.string(), z.number()),
    }),
  }),
  // CLI maps the first positional argument to `prefix`.
  cli: { positional: ["prefix"] },
  // Human-oriented rendering: section headings with dimmed, column-aligned keys.
  console_formatter: (result) => {
    const lines: string[] = [];
    const heading = (s: string) => colors.bold(s);
    // Always leave at least 2 spaces between key and value, even when the
    // key is wider than the target column (long mime types, long keys).
    const kv = (k: string, v: string, indent = 0) => {
      const target = Math.max(22 - indent, k.length + 2);
      return `${" ".repeat(indent)}${colors.dim(k.padEnd(target))}${v}`;
    };
    // Render a breakdown record as one line per key, or a dim "(none)" marker.
    const orNone = (record: Record<string, number>): string[] => {
      const keys = Object.keys(record);
      if (keys.length === 0) return [` ${colors.dim("(none)")}`];
      return keys.map((k) => kv(k, String(record[k]), 4));
    };
    const header = result.prefix
      ? `${heading("membot index summary")} ${colors.dim(`[prefix=${result.prefix}]`)}`
      : heading("membot index summary");
    lines.push(header);
    lines.push(kv("db_path", result.db_path));
    lines.push(kv("db_size_bytes", formatBytes(result.db_size_bytes)));

    lines.push("");
    lines.push(heading("files"));
    lines.push(kv("current", String(result.files.current), 2));
    lines.push(kv("tombstoned_paths", String(result.files.tombstoned_paths), 2));
    lines.push(kv("total_versions", String(result.files.total_versions), 2));
    lines.push(kv("distinct_paths", String(result.files.distinct_paths), 2));
    lines.push(kv("by_source_type", "", 2));
    lines.push(...orNone(result.files.by_source_type));
    lines.push(kv("by_downloader", "", 2));
    lines.push(...orNone(result.files.by_downloader));
    lines.push(kv("by_mime_type", "", 2));
    lines.push(...orNone(result.files.by_mime_type));

    lines.push("");
    lines.push(heading("content"));
    lines.push(kv("total_bytes", formatBytes(result.content.total_bytes), 2));
    lines.push(kv("total_versions_bytes", formatBytes(result.content.total_versions_bytes), 2));

    lines.push("");
    lines.push(heading("chunks"));
    lines.push(kv("current", String(result.chunks.current), 2));
    lines.push(kv("total", String(result.chunks.total), 2));

    lines.push("");
    lines.push(heading("blobs"));
    lines.push(kv("count", String(result.blobs.count), 2));
    lines.push(kv("total_bytes", formatBytes(result.blobs.total_bytes), 2));

    lines.push("");
    lines.push(heading("refresh"));
    lines.push(kv("scheduled", String(result.refresh.scheduled), 2));
    lines.push(kv("due_now", String(result.refresh.due_now), 2));
    lines.push(kv("last_status", "", 2));
    lines.push(...orNone(result.refresh.last_status));

    return lines.join("\n");
  },
  // Fans out one collect* call per output section and assembles the result.
  // Queries run sequentially against the shared connection.
  handler: async (input, ctx) => {
    const prefix = input.prefix ?? null;
    const dbSize = await dbFileSize(ctx.db.path);

    const files = await collectFileStats(ctx.db, prefix);
    const content = await collectContentStats(ctx.db, prefix);
    const chunks = await collectChunkStats(ctx.db, prefix);
    const blobs = await collectBlobStats(ctx.db, prefix);
    const refresh = await collectRefreshStats(ctx.db, prefix);

    return {
      prefix,
      db_path: ctx.db.path,
      db_size_bytes: dbSize,
      files,
      content,
      chunks,
      blobs,
      refresh,
    };
  },
});
|
|
131
|
+
|
|
132
|
+
/** Stat the DuckDB file. Returns 0 if the file isn't on disk yet (in-memory or freshly opened). */
|
|
133
|
+
async function dbFileSize(path: string): Promise<number> {
|
|
134
|
+
try {
|
|
135
|
+
const f = Bun.file(path);
|
|
136
|
+
const exists = await f.exists();
|
|
137
|
+
return exists ? f.size : 0;
|
|
138
|
+
} catch {
|
|
139
|
+
return 0;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/** Build a `logical_path LIKE ?1` clause + params, or empty when prefix is null. */
|
|
144
|
+
function prefixFilter(prefix: string | null): { clause: string; params: SqlParam[] } {
|
|
145
|
+
if (!prefix) return { clause: "", params: [] };
|
|
146
|
+
return { clause: "logical_path LIKE ?1", params: [`${prefix}%`] };
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
/** Combine an existing WHERE fragment with an optional prefix filter. */
|
|
150
|
+
function and(base: string, extra: string): string {
|
|
151
|
+
if (!base) return extra;
|
|
152
|
+
if (!extra) return base;
|
|
153
|
+
return `${base} AND ${extra}`;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/** Shape of the `files` section of the stats output (see collectFileStats). */
interface FileStats {
  current: number; // rows in current_files (live, non-tombstoned)
  tombstoned_paths: number; // paths whose newest version row is a tombstone
  total_versions: number; // all rows in files, across every retained version
  distinct_paths: number; // unique logical_path values across all versions
  by_source_type: Record<string, number>;
  by_downloader: Record<string, number>; // NULL downloaders are skipped
  by_mime_type: Record<string, number>; // top-10 buckets plus "(other)" rollup
}
|
|
165
|
+
|
|
166
|
+
async function collectFileStats(db: DbConnection, prefix: string | null): Promise<FileStats> {
|
|
167
|
+
const pf = prefixFilter(prefix);
|
|
168
|
+
const where = pf.clause ? `WHERE ${pf.clause}` : "";
|
|
169
|
+
|
|
170
|
+
const current = await scalar(db, `SELECT COUNT(*) AS n FROM current_files ${where}`, ...pf.params);
|
|
171
|
+
const totalVersions = await scalar(db, `SELECT COUNT(*) AS n FROM files ${where}`, ...pf.params);
|
|
172
|
+
const distinctPaths = await scalar(db, `SELECT COUNT(DISTINCT logical_path) AS n FROM files ${where}`, ...pf.params);
|
|
173
|
+
// Tombstoned path = a logical_path whose latest (max version_id) row is a tombstone.
|
|
174
|
+
// current_files already excludes those, so we join "latest per path" against files
|
|
175
|
+
// and count rows where tombstone = TRUE.
|
|
176
|
+
const tombstonedPaths = await scalar(
|
|
177
|
+
db,
|
|
178
|
+
`SELECT COUNT(*) AS n
|
|
179
|
+
FROM files f
|
|
180
|
+
JOIN (
|
|
181
|
+
SELECT logical_path, MAX(version_id) AS v FROM files ${where} GROUP BY logical_path
|
|
182
|
+
) m ON f.logical_path = m.logical_path AND f.version_id = m.v
|
|
183
|
+
WHERE f.tombstone = TRUE`,
|
|
184
|
+
...pf.params,
|
|
185
|
+
);
|
|
186
|
+
|
|
187
|
+
const by_source_type = await groupCount(db, "source_type", "current_files", pf);
|
|
188
|
+
const by_downloader = await groupCount(db, "downloader", "current_files", pf, { skipNull: true });
|
|
189
|
+
const by_mime_type = await groupCount(db, "mime_type", "current_files", pf, { topN: 10, skipNull: true });
|
|
190
|
+
|
|
191
|
+
return {
|
|
192
|
+
current,
|
|
193
|
+
tombstoned_paths: tombstonedPaths,
|
|
194
|
+
total_versions: totalVersions,
|
|
195
|
+
distinct_paths: distinctPaths,
|
|
196
|
+
by_source_type,
|
|
197
|
+
by_downloader,
|
|
198
|
+
by_mime_type,
|
|
199
|
+
};
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
async function collectContentStats(
|
|
203
|
+
db: DbConnection,
|
|
204
|
+
prefix: string | null,
|
|
205
|
+
): Promise<{ total_bytes: number; total_versions_bytes: number }> {
|
|
206
|
+
const pf = prefixFilter(prefix);
|
|
207
|
+
const where = pf.clause ? `WHERE ${pf.clause}` : "";
|
|
208
|
+
const total_bytes = await scalar(
|
|
209
|
+
db,
|
|
210
|
+
`SELECT COALESCE(SUM(size_bytes), 0) AS n FROM current_files ${where}`,
|
|
211
|
+
...pf.params,
|
|
212
|
+
);
|
|
213
|
+
const total_versions_bytes = await scalar(
|
|
214
|
+
db,
|
|
215
|
+
`SELECT COALESCE(SUM(size_bytes), 0) AS n FROM files ${where}`,
|
|
216
|
+
...pf.params,
|
|
217
|
+
);
|
|
218
|
+
return { total_bytes, total_versions_bytes };
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
async function collectChunkStats(db: DbConnection, prefix: string | null): Promise<{ current: number; total: number }> {
|
|
222
|
+
if (!prefix) {
|
|
223
|
+
const current = await scalar(db, `SELECT COUNT(*) AS n FROM current_chunks`);
|
|
224
|
+
const total = await scalar(db, `SELECT COUNT(*) AS n FROM chunks`);
|
|
225
|
+
return { current, total };
|
|
226
|
+
}
|
|
227
|
+
const pf = prefixFilter(prefix);
|
|
228
|
+
const current = await scalar(db, `SELECT COUNT(*) AS n FROM current_chunks WHERE ${pf.clause}`, ...pf.params);
|
|
229
|
+
const total = await scalar(db, `SELECT COUNT(*) AS n FROM chunks WHERE ${pf.clause}`, ...pf.params);
|
|
230
|
+
return { current, total };
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
async function collectBlobStats(
|
|
234
|
+
db: DbConnection,
|
|
235
|
+
prefix: string | null,
|
|
236
|
+
): Promise<{ count: number; total_bytes: number }> {
|
|
237
|
+
if (!prefix) {
|
|
238
|
+
const row = await db.queryGet<{ count: number | bigint; total: number | bigint | null }>(
|
|
239
|
+
`SELECT COUNT(*) AS count, COALESCE(SUM(size_bytes), 0) AS total FROM blobs`,
|
|
240
|
+
);
|
|
241
|
+
return { count: Number(row?.count ?? 0), total_bytes: Number(row?.total ?? 0) };
|
|
242
|
+
}
|
|
243
|
+
const pf = prefixFilter(prefix);
|
|
244
|
+
const row = await db.queryGet<{ count: number | bigint; total: number | bigint | null }>(
|
|
245
|
+
`SELECT COUNT(*) AS count, COALESCE(SUM(size_bytes), 0) AS total
|
|
246
|
+
FROM blobs
|
|
247
|
+
WHERE sha256 IN (
|
|
248
|
+
SELECT blob_sha256 FROM current_files
|
|
249
|
+
WHERE ${pf.clause} AND blob_sha256 IS NOT NULL
|
|
250
|
+
)`,
|
|
251
|
+
...pf.params,
|
|
252
|
+
);
|
|
253
|
+
return { count: Number(row?.count ?? 0), total_bytes: Number(row?.total ?? 0) };
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
async function collectRefreshStats(
|
|
257
|
+
db: DbConnection,
|
|
258
|
+
prefix: string | null,
|
|
259
|
+
): Promise<{ scheduled: number; due_now: number; last_status: Record<string, number> }> {
|
|
260
|
+
const pf = prefixFilter(prefix);
|
|
261
|
+
const scheduledWhere = and(pf.clause, "refresh_frequency_sec IS NOT NULL");
|
|
262
|
+
const scheduled = await scalar(db, `SELECT COUNT(*) AS n FROM current_files WHERE ${scheduledWhere}`, ...pf.params);
|
|
263
|
+
|
|
264
|
+
const due = await listDueRefreshes(db);
|
|
265
|
+
const due_now = prefix ? due.filter((r) => r.logical_path.startsWith(prefix)).length : due.length;
|
|
266
|
+
|
|
267
|
+
const statusRows = await db.queryAll<{ k: string | null; n: number | bigint }>(
|
|
268
|
+
`SELECT last_refresh_status AS k, COUNT(*) AS n
|
|
269
|
+
FROM current_files
|
|
270
|
+
WHERE last_refresh_status IS NOT NULL${pf.clause ? ` AND ${pf.clause}` : ""}
|
|
271
|
+
GROUP BY last_refresh_status
|
|
272
|
+
ORDER BY n DESC`,
|
|
273
|
+
...pf.params,
|
|
274
|
+
);
|
|
275
|
+
const last_status: Record<string, number> = {};
|
|
276
|
+
for (const r of statusRows) {
|
|
277
|
+
if (r.k !== null) last_status[r.k] = Number(r.n);
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
return { scheduled, due_now, last_status };
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
/** Run a query whose first row has a single numeric column `n`, returning that number (0 when null). */
|
|
284
|
+
async function scalar(db: DbConnection, sql: string, ...params: SqlParam[]): Promise<number> {
|
|
285
|
+
const row = await db.queryGet<{ n: number | bigint | null }>(sql, ...params);
|
|
286
|
+
return Number(row?.n ?? 0);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
/** Options for groupCount. */
interface GroupOptions {
  skipNull?: boolean; // add `<column> IS NOT NULL` so NULL keys never appear
  topN?: number; // keep the N largest buckets, roll the rest into "(other)"
}
|
|
293
|
+
|
|
294
|
+
/**
|
|
295
|
+
* GROUP BY a column on a current_files-shaped table, optionally dropping NULLs
|
|
296
|
+
* and rolling overflow into an "(other)" bucket when topN is set.
|
|
297
|
+
*/
|
|
298
|
+
async function groupCount(
|
|
299
|
+
db: DbConnection,
|
|
300
|
+
column: string,
|
|
301
|
+
table: string,
|
|
302
|
+
pf: { clause: string; params: SqlParam[] },
|
|
303
|
+
opts: GroupOptions = {},
|
|
304
|
+
): Promise<Record<string, number>> {
|
|
305
|
+
const filters: string[] = [];
|
|
306
|
+
if (pf.clause) filters.push(pf.clause);
|
|
307
|
+
if (opts.skipNull) filters.push(`${column} IS NOT NULL`);
|
|
308
|
+
const where = filters.length ? `WHERE ${filters.join(" AND ")}` : "";
|
|
309
|
+
const rows = await db.queryAll<{ k: string | null; n: number | bigint }>(
|
|
310
|
+
`SELECT ${column} AS k, COUNT(*) AS n FROM ${table} ${where} GROUP BY ${column} ORDER BY n DESC`,
|
|
311
|
+
...pf.params,
|
|
312
|
+
);
|
|
313
|
+
const out: Record<string, number> = {};
|
|
314
|
+
if (opts.topN && rows.length > opts.topN) {
|
|
315
|
+
let other = 0;
|
|
316
|
+
for (let i = 0; i < rows.length; i++) {
|
|
317
|
+
const r = rows[i]!;
|
|
318
|
+
const key = r.k ?? "(null)";
|
|
319
|
+
if (i < opts.topN) out[key] = Number(r.n);
|
|
320
|
+
else other += Number(r.n);
|
|
321
|
+
}
|
|
322
|
+
if (other > 0) out["(other)"] = other;
|
|
323
|
+
return out;
|
|
324
|
+
}
|
|
325
|
+
for (const r of rows) {
|
|
326
|
+
out[r.k ?? "(null)"] = Number(r.n);
|
|
327
|
+
}
|
|
328
|
+
return out;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
/** Format a byte count in human units. 1024 boundary, 1-decimal precision past KB. */
|
|
332
|
+
function formatBytes(bytes: number): string {
|
|
333
|
+
if (bytes < 1024) return `${bytes} B`;
|
|
334
|
+
const units = ["KB", "MB", "GB", "TB"];
|
|
335
|
+
let i = -1;
|
|
336
|
+
let n = bytes;
|
|
337
|
+
while (n >= 1024 && i < units.length - 1) {
|
|
338
|
+
n /= 1024;
|
|
339
|
+
i++;
|
|
340
|
+
}
|
|
341
|
+
return `${n.toFixed(n >= 100 ? 0 : 1)} ${units[i]}`;
|
|
342
|
+
}
|
package/src/operations/write.ts
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
+
import { resolveEmbeddingWorkers } from "../context.ts";
|
|
2
3
|
import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
|
|
3
4
|
import { insertVersion, millisIso } from "../db/files.ts";
|
|
4
5
|
import { chunkDeterministic } from "../ingest/chunker.ts";
|
|
5
6
|
import { describe } from "../ingest/describer.ts";
|
|
6
7
|
import { embed } from "../ingest/embedder.ts";
|
|
8
|
+
import { withEmbedderPool } from "../ingest/embedder-pool.ts";
|
|
7
9
|
import { parseDuration } from "../ingest/ingest.ts";
|
|
8
10
|
import { sha256Hex } from "../ingest/local-reader.ts";
|
|
9
11
|
import { buildSearchText } from "../ingest/search-text.ts";
|
|
@@ -30,48 +32,54 @@ export const writeOperation = defineOperation({
|
|
|
30
32
|
console_formatter: (result) =>
|
|
31
33
|
`${colors.green("✓")} ${colors.cyan(result.logical_path)} ${colors.dim(`@ ${result.version_id}`)} ${colors.dim(`(${result.size_bytes}B)`)}`,
|
|
32
34
|
handler: async (input, ctx) => {
  // Per-command embedder pool: spawn workers, embed this version's
  // chunks in parallel, kill workers before returning. Short-circuits
  // to single-process when `embedding.workers` is 1.
  const workers = resolveEmbeddingWorkers(ctx.config.embedding.workers);
  return withEmbedderPool(workers, ctx.config.embedding_model, async () => {
    const refreshSec = parseDuration(input.refresh_frequency);
    const bytes = new TextEncoder().encode(input.content);
    // Inline writes are always treated as markdown (mime is hard-coded below too).
    const description = await describe(input.logical_path, "text/markdown", input.content, ctx.config.llm);
    const chunks = chunkDeterministic(input.content, ctx.config.chunker);
    const searchTexts = chunks.map((c) => buildSearchText(input.logical_path, description, c.content));
    const embeddings = await embed(searchTexts, ctx.config.embedding_model);

    // New version row: content stored inline (no blob); with no external
    // source, the content hash doubles as the source hash.
    const versionId = millisIso(Date.now());
    const contentSha = sha256Hex(bytes);
    await insertVersion(ctx.db, {
      logical_path: input.logical_path,
      version_id: versionId,
      source_type: "inline",
      source_path: null,
      source_mtime_ms: null,
      source_sha256: contentSha,
      blob_sha256: null,
      content_sha256: contentSha,
      content: input.content,
      description,
      mime_type: "text/markdown",
      size_bytes: bytes.byteLength,
      fetcher: "inline",
      refresh_frequency_sec: refreshSec,
      refreshed_at: new Date().toISOString(),
      last_refresh_status: "ok",
      change_note: input.change_note ?? null,
    });

    await insertChunksForVersion(
      ctx.db,
      input.logical_path,
      versionId,
      chunks.map((c, i) => ({
        chunk_index: c.index,
        chunk_content: c.content,
        // Fallbacks guard against a length mismatch between chunks and the
        // precomputed arrays; the zero-vector keeps the row insertable.
        search_text: searchTexts[i] ?? buildSearchText(input.logical_path, description, c.content),
        embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
      })),
    );
    await rebuildFts(ctx.db);

    return { logical_path: input.logical_path, version_id: versionId, size_bytes: bytes.byteLength };
  });
},
|
|
77
85
|
});
|
package/src/output/formatter.ts
CHANGED
|
@@ -18,6 +18,27 @@ export function renderResult<T>(result: T, opts: { console_formatter?: (result:
|
|
|
18
18
|
return JSON.stringify(result, null, 2);
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
+
/**
|
|
22
|
+
* Format a byte count as a short human-readable string: 5654 → `5.5 KB`,
|
|
23
|
+
* 14_859 → `14.5 KB`, 2_345_678 → `2.2 MB`. Uses 1024-based units (binary
|
|
24
|
+
* prefixes) since file sizes on disk are typically reported that way.
|
|
25
|
+
* Negative or non-finite inputs render as `0 B`.
|
|
26
|
+
*/
|
|
27
|
+
export function formatBytes(n: number): string {
|
|
28
|
+
if (!Number.isFinite(n) || n < 0) return "0 B";
|
|
29
|
+
if (n < 1024) return `${n} B`;
|
|
30
|
+
const units = ["KB", "MB", "GB", "TB"] as const;
|
|
31
|
+
let value = n / 1024;
|
|
32
|
+
let unit: string = units[0];
|
|
33
|
+
for (let i = 1; i < units.length && value >= 1024; i++) {
|
|
34
|
+
value /= 1024;
|
|
35
|
+
unit = units[i] as string;
|
|
36
|
+
}
|
|
37
|
+
// One decimal until 100, then round to integer (so the column stays narrow).
|
|
38
|
+
const formatted = value < 100 ? value.toFixed(1) : `${Math.round(value)}`;
|
|
39
|
+
return `${formatted} ${unit}`;
|
|
40
|
+
}
|
|
41
|
+
|
|
21
42
|
/**
|
|
22
43
|
* Pretty-print a 2D array of cells as an aligned table. Column widths are
|
|
23
44
|
* computed from the visible (escape-stripped) length of each cell so coloured
|
package/src/output/logger.ts
CHANGED
|
@@ -9,6 +9,17 @@ export interface Spinner {
|
|
|
9
9
|
stop(): void;
|
|
10
10
|
}
|
|
11
11
|
|
|
12
|
+
/**
 * Anything occupying a fixed area of stderr that needs to be torn down before
 * the logger writes a stray line, then redrawn afterward. nanospinner's
 * single-line spinner and progress.ts's multi-line worker view both implement
 * this so log/info/warn lines don't shred the live display.
 */
export interface LiveArea {
  /** Erase the live region so a plain line can be written underneath. */
  clear(): void;
  /** Redraw the live region after the interrupting line has been flushed. */
  render(): void;
}
|
|
22
|
+
|
|
12
23
|
const NOOP_SPINNER: Spinner = { update() {}, success() {}, error() {}, stop() {} };
|
|
13
24
|
|
|
14
25
|
/**
|
|
@@ -20,6 +31,7 @@ const NOOP_SPINNER: Spinner = { update() {}, success() {}, error() {}, stop() {}
|
|
|
20
31
|
class Logger {
|
|
21
32
|
private static instance: Logger;
|
|
22
33
|
private activeSpinner: ReturnType<typeof createSpinner> | null = null;
|
|
34
|
+
private activeLiveArea: LiveArea | null = null;
|
|
23
35
|
|
|
24
36
|
/** Singleton accessor. Use the exported `logger` const instead in normal code. */
|
|
25
37
|
static getInstance(): Logger {
|
|
@@ -31,7 +43,24 @@ class Logger {
|
|
|
31
43
|
return useColor() ? fn(msg) : msg;
|
|
32
44
|
}
|
|
33
45
|
|
|
46
|
+
/**
 * Register a multi-line live display. Logger will `clear()` it before any
 * stderr write and `render()` it after, so log lines don't punch through
 * the live area. Pass null to deregister. Mutually exclusive with the
 * nanospinner path (only one live thing on stderr at a time).
 * Registering a new area simply overwrites any previously registered one.
 */
setActiveLiveArea(area: LiveArea | null): void {
  this.activeLiveArea = area;
}
|
|
55
|
+
|
|
34
56
|
private writeStderr(msg: string): void {
|
|
57
|
+
const area = this.activeLiveArea;
|
|
58
|
+
if (area) {
|
|
59
|
+
area.clear();
|
|
60
|
+
process.stderr.write(`${msg}\n`);
|
|
61
|
+
area.render();
|
|
62
|
+
return;
|
|
63
|
+
}
|
|
35
64
|
if (this.activeSpinner) {
|
|
36
65
|
this.activeSpinner.clear();
|
|
37
66
|
process.stderr.write(`${msg}\n`);
|
|
@@ -66,6 +95,13 @@ class Logger {
|
|
|
66
95
|
|
|
67
96
|
/** Raw stderr write, no formatting added. Spinner-aware. */
|
|
68
97
|
writeRaw(msg: string): void {
|
|
98
|
+
const area = this.activeLiveArea;
|
|
99
|
+
if (area) {
|
|
100
|
+
area.clear();
|
|
101
|
+
process.stderr.write(msg);
|
|
102
|
+
area.render();
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
69
105
|
if (this.activeSpinner) {
|
|
70
106
|
this.activeSpinner.clear();
|
|
71
107
|
process.stderr.write(msg);
|