struth 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/COMPLIANCE.md +41 -0
- package/LICENSE +21 -0
- package/README.md +135 -0
- package/package.json +54 -0
- package/src/cli/index.ts +244 -0
- package/src/core/constants.ts +32 -0
- package/src/core/pipeline/clean.ts +246 -0
- package/src/core/pipeline/condense.ts +249 -0
- package/src/core/pipeline/discover.ts +448 -0
- package/src/core/pipeline/integrity.ts +214 -0
- package/src/core/pipeline/organize.ts +184 -0
- package/src/core/schemas.ts +204 -0
- package/src/core/spawn.ts +22 -0
- package/src/core/storage/index.ts +108 -0
- package/src/core/storage/paths.ts +40 -0
- package/src/core/types.ts +36 -0
- package/src/daemon/process.ts +95 -0
- package/src/daemon/refresh.ts +254 -0
- package/src/mcp/fts5-index.ts +114 -0
- package/src/mcp/fts5-search.ts +150 -0
- package/src/mcp/lockfile.ts +135 -0
- package/src/mcp/retrieval.ts +141 -0
- package/src/mcp/schemas.ts +12 -0
- package/src/mcp/server.ts +293 -0
- package/src/telemetry/client.ts +36 -0
- package/src/telemetry/schemas.ts +5 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import type { z } from "zod";
|
|
3
|
+
import { DEFAULT_CHECK_INTERVAL_HOURS, SCHEMA_VERSION } from "../constants.js";
|
|
4
|
+
import {
|
|
5
|
+
type CondensedPage,
|
|
6
|
+
type Coverage,
|
|
7
|
+
DocSetManifest,
|
|
8
|
+
type MirrorOptions,
|
|
9
|
+
type PageRef,
|
|
10
|
+
type Section,
|
|
11
|
+
} from "../schemas.js";
|
|
12
|
+
|
|
13
|
+
// Options for the organize stage: doc-set name/output from MirrorOptions,
// plus the total URL count from discovery (defaults to pages.length when
// omitted, i.e. "everything discovered was processed").
type OrganizeOpts = Pick<z.infer<typeof MirrorOptions>, "name" | "output"> & {
  totalDiscovered?: number;
};
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Extract section slug from URL path.
|
|
19
|
+
* Uses first path segment after domain, or "root" if none.
|
|
20
|
+
*/
|
|
21
|
+
function extractSectionSlug(url: string): string {
|
|
22
|
+
try {
|
|
23
|
+
const parsed = new URL(url);
|
|
24
|
+
const segments = parsed.pathname.split("/").filter(Boolean);
|
|
25
|
+
if (segments.length === 0) return "root";
|
|
26
|
+
return segments[0].toLowerCase().replace(/[^a-z0-9-]/g, "-");
|
|
27
|
+
} catch {
|
|
28
|
+
return "root";
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Make a section name human-readable from its slug.
|
|
34
|
+
*/
|
|
35
|
+
function sectionName(slug: string): string {
|
|
36
|
+
if (slug === "root") return "Root";
|
|
37
|
+
return slug
|
|
38
|
+
.split("-")
|
|
39
|
+
.map((word) => word.charAt(0).toUpperCase() + word.slice(1))
|
|
40
|
+
.join(" ");
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Extract topic tags from H1/H2 headings in content.
|
|
45
|
+
*/
|
|
46
|
+
function extractTopicTags(content: string): string[] {
|
|
47
|
+
const headingRegex = /^#{1,2}\s+(.+)$/gm;
|
|
48
|
+
const tags = new Set<string>();
|
|
49
|
+
|
|
50
|
+
for (const match of content.matchAll(headingRegex)) {
|
|
51
|
+
const heading = match[1].trim().toLowerCase();
|
|
52
|
+
// Extract individual words as tags, filter short/common ones
|
|
53
|
+
const words = heading.split(/[\s/\\-]+/).filter((w) => w.length > 2);
|
|
54
|
+
for (const word of words) {
|
|
55
|
+
tags.add(word.replace(/[^a-z0-9]/g, ""));
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Filter empty strings
|
|
60
|
+
return [...tags].filter(Boolean);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Organize condensed pages into sections and build the doc set manifest.
|
|
65
|
+
* Groups pages by URL path prefix, generates topic tags, calculates coverage.
|
|
66
|
+
*/
|
|
67
|
+
export async function organize(
|
|
68
|
+
pages: z.infer<typeof CondensedPage>[],
|
|
69
|
+
sourceUrl: string,
|
|
70
|
+
opts: OrganizeOpts,
|
|
71
|
+
): Promise<z.infer<typeof DocSetManifest>> {
|
|
72
|
+
const now = new Date().toISOString();
|
|
73
|
+
const totalDiscovered = opts.totalDiscovered ?? pages.length;
|
|
74
|
+
|
|
75
|
+
// Group pages by section
|
|
76
|
+
const sectionMap = new Map<
|
|
77
|
+
string,
|
|
78
|
+
{
|
|
79
|
+
pages: z.infer<typeof PageRef>[];
|
|
80
|
+
allContent: string;
|
|
81
|
+
wordCountClean: number;
|
|
82
|
+
wordCountCondensed: number;
|
|
83
|
+
}
|
|
84
|
+
>();
|
|
85
|
+
|
|
86
|
+
for (const page of pages) {
|
|
87
|
+
const slug = extractSectionSlug(page.url);
|
|
88
|
+
|
|
89
|
+
if (!sectionMap.has(slug)) {
|
|
90
|
+
sectionMap.set(slug, {
|
|
91
|
+
pages: [],
|
|
92
|
+
allContent: "",
|
|
93
|
+
wordCountClean: 0,
|
|
94
|
+
wordCountCondensed: 0,
|
|
95
|
+
});
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
const section = sectionMap.get(slug);
|
|
99
|
+
if (!section) continue;
|
|
100
|
+
|
|
101
|
+
const pageRef: z.infer<typeof PageRef> = {
|
|
102
|
+
url: page.url,
|
|
103
|
+
slug: page.slug,
|
|
104
|
+
section: slug,
|
|
105
|
+
word_count_clean: page.word_count_clean,
|
|
106
|
+
word_count_condensed: page.word_count_condensed,
|
|
107
|
+
condensed: page.condensed,
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
section.pages.push(pageRef);
|
|
111
|
+
section.allContent += `\n${page.content_condensed}`;
|
|
112
|
+
section.wordCountClean += page.word_count_clean;
|
|
113
|
+
section.wordCountCondensed += page.word_count_condensed;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
// Build sections
|
|
117
|
+
const sections: z.infer<typeof Section>[] = [];
|
|
118
|
+
for (const [slug, data] of sectionMap) {
|
|
119
|
+
sections.push({
|
|
120
|
+
name: sectionName(slug),
|
|
121
|
+
slug,
|
|
122
|
+
pages: data.pages,
|
|
123
|
+
word_count_clean: data.wordCountClean,
|
|
124
|
+
word_count_condensed: data.wordCountCondensed,
|
|
125
|
+
page_count: data.pages.length,
|
|
126
|
+
topic_tags: extractTopicTags(data.allContent),
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// Build flat page list
|
|
131
|
+
const allPages = sections.flatMap((s) => s.pages);
|
|
132
|
+
|
|
133
|
+
// Calculate coverage
|
|
134
|
+
const coverage: z.infer<typeof Coverage> = {
|
|
135
|
+
total_discovered: totalDiscovered,
|
|
136
|
+
successfully_processed: pages.length,
|
|
137
|
+
skipped: totalDiscovered - pages.length,
|
|
138
|
+
skip_reasons: {},
|
|
139
|
+
coverage_ratio: totalDiscovered > 0 ? pages.length / totalDiscovered : 0,
|
|
140
|
+
};
|
|
141
|
+
|
|
142
|
+
// Calculate content hash
|
|
143
|
+
const allContent = pages.map((p) => p.content_condensed).join("\n");
|
|
144
|
+
const contentHash = createHash("sha256").update(allContent).digest("hex");
|
|
145
|
+
|
|
146
|
+
// Derive name from URL if not provided
|
|
147
|
+
let name = opts.name;
|
|
148
|
+
if (!name) {
|
|
149
|
+
try {
|
|
150
|
+
const parsed = new URL(sourceUrl);
|
|
151
|
+
name = parsed.hostname.replace(/^(www|docs)\./, "").replace(/\./g, "-");
|
|
152
|
+
} catch {
|
|
153
|
+
name = "unknown";
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
const manifest: z.infer<typeof DocSetManifest> = {
|
|
158
|
+
schema_version: SCHEMA_VERSION,
|
|
159
|
+
name,
|
|
160
|
+
version: null,
|
|
161
|
+
source_url: sourceUrl,
|
|
162
|
+
generated_at: now,
|
|
163
|
+
sections,
|
|
164
|
+
pages: allPages,
|
|
165
|
+
coverage,
|
|
166
|
+
provenance: {
|
|
167
|
+
source_url: sourceUrl,
|
|
168
|
+
fetched_at: now,
|
|
169
|
+
content_hash: contentHash,
|
|
170
|
+
license: null,
|
|
171
|
+
robots_txt_status: "allowed",
|
|
172
|
+
},
|
|
173
|
+
trust: {
|
|
174
|
+
freshness: "fresh",
|
|
175
|
+
last_checked: now,
|
|
176
|
+
last_changed: now,
|
|
177
|
+
check_interval_hours: DEFAULT_CHECK_INTERVAL_HOURS,
|
|
178
|
+
coverage,
|
|
179
|
+
},
|
|
180
|
+
platform_detected: null,
|
|
181
|
+
};
|
|
182
|
+
|
|
183
|
+
return DocSetManifest.parse(manifest);
|
|
184
|
+
}
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
|
|
3
|
+
// ── Schema version ──────────────────────────────
// Bumped whenever the persisted manifest layout changes; readers compare
// against this and warn on mismatch.
// NOTE(review): organize.ts imports SCHEMA_VERSION from "../constants.js" —
// confirm that constant and this one stay in sync.
export const SCHEMA_VERSION = "1.0.0";

// ── Primitives ──────────────────────────────────

// Absolute URL string (zod .url() validation).
export const UrlString = z.string().url();
// ISO-8601 timestamp string (zod .datetime() validation).
export const IsoTimestamp = z.string().datetime();
// Lowercase kebab-case identifier; used for section/page file names.
export const Slug = z.string().regex(/^[a-z0-9-]+$/);

// ── Content Integrity ───────────────────────────

// Result of the integrity pass over fetched content (see pipeline/integrity.ts).
export const ContentIntegrity = z.object({
  unicode_normalized: z.boolean(),
  // Score in [0, 1] against the structural baseline.
  structural_baseline: z.number().min(0).max(1),
  flagged_anomalies: z.array(z.string()),
  // True when the OWASP LLM01 check was performed on this content.
  owasp_llm01_checked: z.boolean(),
  pipeline_version: z.string(),
});

// Structural statistics computed from cleaned content.
export const StructuralMetrics = z.object({
  char_entropy: z.number(),
  // Fraction of content inside code blocks, in [0, 1].
  code_block_ratio: z.number().min(0).max(1),
  avg_section_words: z.number(),
  imperative_sentence_ratio: z.number().min(0).max(1),
  total_tokens: z.number().int(),
});

// ── Pipeline Stage Outputs ──────────────────────

// Stage 1: Discover — a URL plus the discovery method that produced it.
export const DiscoveredUrl = z.object({
  url: UrlString,
  source: z.enum(["llms_txt", "llms_full_txt", "md_suffix", "sitemap", "firecrawl", "link_walk"]),
});

// Aggregate result of the discovery stage.
export const DiscoverResult = z.object({
  urls: z.array(DiscoveredUrl),
  source_method: z.string(),
  total_found: z.number().int(),
  after_dedup: z.number().int(),
  platform_detected: z.string().nullable(),
  robots_txt_status: z.enum(["allowed", "blocked", "no_robots_txt"]),
});

// Stage 2: Clean — a fetched page reduced to clean markdown plus metrics.
export const CleanedPage = z.object({
  url: UrlString,
  slug: Slug,
  section: z.string(),
  content_clean: z.string(),
  word_count: z.number().int(),
  fetch_strategy: z.enum(["readability", "jina", "md_suffix", "llms_full_txt"]),
  content_integrity: ContentIntegrity,
  structural_metrics: StructuralMetrics,
});

// Stage 3: Condense — clean content plus its (possibly skipped) condensed form.
export const CondensedPage = z.object({
  url: UrlString,
  slug: Slug,
  section: z.string(),
  content_condensed: z.string(),
  content_clean: z.string(),
  word_count_clean: z.number().int(),
  word_count_condensed: z.number().int(),
  // False when condensation was skipped (content_condensed mirrors clean).
  condensed: z.boolean(),
  condense_method: z.enum(["claude_cli", "api", "skipped"]),
});

// Stage 4: Organize — lightweight page reference stored in the manifest
// (content itself lives in the per-section files on disk).
export const PageRef = z.object({
  url: UrlString,
  slug: Slug,
  section: z.string(),
  word_count_clean: z.number().int(),
  word_count_condensed: z.number().int(),
  condensed: z.boolean(),
});

// A group of pages sharing a URL path prefix, with aggregate word counts.
export const Section = z.object({
  name: z.string(),
  slug: Slug,
  pages: z.array(PageRef),
  word_count_clean: z.number().int(),
  word_count_condensed: z.number().int(),
  page_count: z.number().int(),
  topic_tags: z.array(z.string()),
});

// ── Coverage & Provenance ───────────────────────

// How much of the discovered site made it through the pipeline.
export const Coverage = z.object({
  total_discovered: z.number().int(),
  successfully_processed: z.number().int(),
  skipped: z.number().int(),
  // Map of skip reason → count.
  skip_reasons: z.record(z.string(), z.number().int()),
  coverage_ratio: z.number().min(0).max(1),
});

// Where the content came from and how it was obtained.
export const Provenance = z.object({
  source_url: UrlString,
  fetched_at: IsoTimestamp,
  // SHA-256 hex digest of the concatenated condensed content.
  content_hash: z.string(),
  license: z.string().nullable(),
  robots_txt_status: z.enum(["allowed", "blocked", "no_robots_txt"]),
});

// Freshness/refresh metadata surfaced to MCP clients.
export const TrustMetadata = z.object({
  freshness: z.enum(["fresh", "stale", "unknown"]),
  last_checked: IsoTimestamp.nullable(),
  last_changed: IsoTimestamp.nullable(),
  check_interval_hours: z.number(),
  coverage: Coverage,
});

// ── Storage (persisted to disk) ─────────────────

// Top-level manifest.json for one indexed doc set.
export const DocSetManifest = z.object({
  schema_version: z.string(),
  name: z.string(),
  version: z.string().nullable(),
  source_url: UrlString,
  generated_at: IsoTimestamp,
  sections: z.array(Section),
  pages: z.array(PageRef),
  coverage: Coverage,
  provenance: Provenance,
  trust: TrustMetadata,
  platform_detected: z.string().nullable(),
});

// ── MCP Schemas ─────────────────────────────────

// Request for the get-docs tool.
export const GetDocsRequest = z.object({
  query: z.string().min(1),
  library: z.string().optional(),
  version: z.string().optional(),
  project_path: z.string().optional(),
  max_sections: z.number().int().min(1).max(20).default(5),
});

// A single section returned to the client, with its trust context.
export const ServedSection = z.object({
  title: z.string(),
  content: z.string(),
  provenance: Provenance,
  content_integrity: ContentIntegrity,
});

export const GetDocsResponse = z.object({
  sections: z.array(ServedSection),
  trust: TrustMetadata,
  library: z.string(),
  version: z.string().nullable(),
  query: z.string(),
});

export const ListLibrariesRequest = z.object({
  filter: z.string().optional(),
});

// Summary row for one indexed library in list-libraries output.
export const LibrarySummary = z.object({
  name: z.string(),
  version: z.string().nullable(),
  sections: z.number().int(),
  pages: z.number().int(),
  freshness: z.enum(["fresh", "stale", "unknown"]),
  last_updated: IsoTimestamp,
  word_count: z.number().int(),
});

export const ListLibrariesResponse = z.object({
  libraries: z.array(LibrarySummary),
});

// ── Telemetry ───────────────────────────────────

// One telemetry event emitted by the client.
export const TelemetryEvent = z.object({
  event: z.enum(["query", "index", "error"]),
  library: z.string(),
  sections_hit: z.array(z.string()),
  latency_ms: z.number(),
  error_type: z.string().optional(),
  client_version: z.string(),
  timestamp: IsoTimestamp,
});

// ── Pipeline Options ────────────────────────────

// CLI/mirror options consumed by the pipeline stages.
export const MirrorOptions = z.object({
  url: UrlString,
  name: z.string().optional(),
  condense: z.boolean().default(false),
  condenseApi: z.boolean().default(false),
  concurrency: z.number().int().min(1).max(20).default(3),
  excludePath: z.array(z.string()).default([]),
  exclude: z.array(z.number().int()).default([]),
  smart: z.string().optional(),
  filter: z.string().optional(),
  top: z.number().int().default(20),
  fetchStrategy: z.enum(["auto", "readability", "jina"]).default("auto"),
  output: z.string().optional(),
});
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bun.spawn wrappers — extracted for testability.
|
|
3
|
+
* Tests can vi.mock this module without touching the readonly Bun global.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/** Spawn a CLI subprocess with stdin input. Used by the condense pipeline. */
|
|
7
|
+
export function spawnCli(
|
|
8
|
+
cmd: string[],
|
|
9
|
+
opts: { stdin: Blob },
|
|
10
|
+
): { exited: Promise<number>; stdout: ReadableStream<Uint8Array> } {
|
|
11
|
+
const proc = Bun.spawn(cmd, opts);
|
|
12
|
+
return { exited: proc.exited, stdout: proc.stdout as ReadableStream<Uint8Array> };
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
/** Spawn a detached background process. Used by the daemon. */
|
|
16
|
+
export function spawnDetached(
|
|
17
|
+
cmd: string[],
|
|
18
|
+
opts: { stdio: ["ignore", "ignore", "inherit"]; detached: true },
|
|
19
|
+
): { pid: number; unref: () => void } {
|
|
20
|
+
const child = Bun.spawn(cmd, opts);
|
|
21
|
+
return { pid: child.pid, unref: () => child.unref() };
|
|
22
|
+
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { randomUUID } from "node:crypto";
|
|
2
|
+
import { mkdir, readFile, readdir, rename, rm, writeFile } from "node:fs/promises";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import type { z } from "zod";
|
|
5
|
+
import { SCHEMA_VERSION } from "../constants.js";
|
|
6
|
+
import { DocSetManifest } from "../schemas.js";
|
|
7
|
+
import { docSetDir, librariesDir, manifestPath } from "./paths.js";
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Read a doc set manifest and its sections from disk.
|
|
11
|
+
* Validates against current schema version.
|
|
12
|
+
*/
|
|
13
|
+
export async function readDocSet(
|
|
14
|
+
name: string,
|
|
15
|
+
version?: string,
|
|
16
|
+
): Promise<z.infer<typeof DocSetManifest>> {
|
|
17
|
+
const mPath = manifestPath(name, version ?? null);
|
|
18
|
+
const raw = await readFile(mPath, "utf-8");
|
|
19
|
+
const data = JSON.parse(raw);
|
|
20
|
+
const manifest = DocSetManifest.parse(data);
|
|
21
|
+
|
|
22
|
+
if (manifest.schema_version !== SCHEMA_VERSION) {
|
|
23
|
+
console.warn(
|
|
24
|
+
`[struth] Warning: doc set "${name}" has schema version ${manifest.schema_version}, current is ${SCHEMA_VERSION}`,
|
|
25
|
+
);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
return manifest;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Write a doc set manifest and its sections to disk.
|
|
33
|
+
* Uses atomic write: temp dir → rename on success.
|
|
34
|
+
*/
|
|
35
|
+
export async function writeDocSet(
|
|
36
|
+
manifest: z.infer<typeof DocSetManifest>,
|
|
37
|
+
pages: { slug: string; clean: string; condensed: string }[],
|
|
38
|
+
): Promise<void> {
|
|
39
|
+
const targetDir = docSetDir(manifest.name, manifest.version);
|
|
40
|
+
const tmpSuffix = randomUUID().slice(0, 8);
|
|
41
|
+
const tempDir = `${targetDir}.tmp.${tmpSuffix}`;
|
|
42
|
+
|
|
43
|
+
try {
|
|
44
|
+
const tempSectionsDir = join(tempDir, "sections");
|
|
45
|
+
await mkdir(tempSectionsDir, { recursive: true });
|
|
46
|
+
|
|
47
|
+
// Write manifest
|
|
48
|
+
await writeFile(join(tempDir, "manifest.json"), JSON.stringify(manifest, null, 2), "utf-8");
|
|
49
|
+
|
|
50
|
+
// Write section files
|
|
51
|
+
for (const page of pages) {
|
|
52
|
+
await writeFile(join(tempSectionsDir, `${page.slug}.clean.md`), page.clean, "utf-8");
|
|
53
|
+
await writeFile(join(tempSectionsDir, `${page.slug}.condensed.md`), page.condensed, "utf-8");
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// Atomic swap: remove old target, rename temp to target
|
|
57
|
+
try {
|
|
58
|
+
await rm(targetDir, { recursive: true, force: true });
|
|
59
|
+
} catch {
|
|
60
|
+
// Target didn't exist — fine
|
|
61
|
+
}
|
|
62
|
+
await rename(tempDir, targetDir);
|
|
63
|
+
} catch (err) {
|
|
64
|
+
// Cleanup temp dir on failure
|
|
65
|
+
try {
|
|
66
|
+
await rm(tempDir, { recursive: true, force: true });
|
|
67
|
+
} catch {
|
|
68
|
+
// Best effort cleanup
|
|
69
|
+
}
|
|
70
|
+
throw err;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* List all indexed doc sets.
|
|
76
|
+
*/
|
|
77
|
+
export async function listDocSets(): Promise<
|
|
78
|
+
{ name: string; version: string | null; path: string }[]
|
|
79
|
+
> {
|
|
80
|
+
const libDir = librariesDir();
|
|
81
|
+
|
|
82
|
+
let entries: string[];
|
|
83
|
+
try {
|
|
84
|
+
entries = await readdir(libDir);
|
|
85
|
+
} catch {
|
|
86
|
+
return [];
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
const results: { name: string; version: string | null; path: string }[] = [];
|
|
90
|
+
|
|
91
|
+
for (const entry of entries) {
|
|
92
|
+
const entryPath = join(libDir, entry);
|
|
93
|
+
const mPath = join(entryPath, "manifest.json");
|
|
94
|
+
try {
|
|
95
|
+
const raw = await readFile(mPath, "utf-8");
|
|
96
|
+
const data = JSON.parse(raw);
|
|
97
|
+
results.push({
|
|
98
|
+
name: data.name ?? entry,
|
|
99
|
+
version: data.version ?? null,
|
|
100
|
+
path: entryPath,
|
|
101
|
+
});
|
|
102
|
+
} catch {
|
|
103
|
+
// Skip entries without valid manifest
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
return results;
|
|
108
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { homedir } from "node:os";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { LIBRARIES_DIR } from "../constants.js";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Resolve the struth home directory.
|
|
7
|
+
* Expands ~ to the user's home directory.
|
|
8
|
+
*/
|
|
9
|
+
export function struthHome(): string {
|
|
10
|
+
return join(homedir(), ".struth");
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Resolve the libraries directory.
|
|
15
|
+
*/
|
|
16
|
+
export function librariesDir(): string {
|
|
17
|
+
return join(struthHome(), LIBRARIES_DIR);
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Resolve the path for a specific doc set.
|
|
22
|
+
*/
|
|
23
|
+
export function docSetDir(name: string, version?: string | null): string {
|
|
24
|
+
const slug = version ? `${name}-${version}` : name;
|
|
25
|
+
return join(librariesDir(), slug);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* Resolve the manifest path for a doc set.
|
|
30
|
+
*/
|
|
31
|
+
export function manifestPath(name: string, version?: string | null): string {
|
|
32
|
+
return join(docSetDir(name, version), "manifest.json");
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Resolve the sections directory for a doc set.
|
|
37
|
+
*/
|
|
38
|
+
export function sectionsDir(name: string, version?: string | null): string {
|
|
39
|
+
return join(docSetDir(name, version), "sections");
|
|
40
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import type { z } from "zod";
|
|
2
|
+
import type * as S from "./schemas.js";
|
|
3
|
+
|
|
4
|
+
// Convenience aliases: plain TypeScript types inferred from the zod schemas
// in schemas.ts, so call sites can write `CleanedPage` instead of
// `z.infer<typeof S.CleanedPage>`. Type-only — erased at compile time.

// Pipeline stage types
export type DiscoveredUrl = z.infer<typeof S.DiscoveredUrl>;
export type DiscoverResult = z.infer<typeof S.DiscoverResult>;
export type CleanedPage = z.infer<typeof S.CleanedPage>;
export type CondensedPage = z.infer<typeof S.CondensedPage>;
export type PageRef = z.infer<typeof S.PageRef>;
export type Section = z.infer<typeof S.Section>;

// Integrity types
export type ContentIntegrity = z.infer<typeof S.ContentIntegrity>;
export type StructuralMetrics = z.infer<typeof S.StructuralMetrics>;

// Coverage & provenance types
export type Coverage = z.infer<typeof S.Coverage>;
export type Provenance = z.infer<typeof S.Provenance>;
export type TrustMetadata = z.infer<typeof S.TrustMetadata>;

// Storage types
export type DocSetManifest = z.infer<typeof S.DocSetManifest>;

// MCP types
export type GetDocsRequest = z.infer<typeof S.GetDocsRequest>;
export type GetDocsResponse = z.infer<typeof S.GetDocsResponse>;
export type ServedSection = z.infer<typeof S.ServedSection>;
export type ListLibrariesRequest = z.infer<typeof S.ListLibrariesRequest>;
export type ListLibrariesResponse = z.infer<typeof S.ListLibrariesResponse>;
export type LibrarySummary = z.infer<typeof S.LibrarySummary>;

// Telemetry types
export type TelemetryEvent = z.infer<typeof S.TelemetryEvent>;

// Options types
export type MirrorOptions = z.infer<typeof S.MirrorOptions>;
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { readFile, rm, writeFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { spawnDetached } from "../core/spawn.js";
|
|
4
|
+
import { struthHome } from "../core/storage/paths.js";
|
|
5
|
+
|
|
6
|
+
function pidFilePath(): string {
|
|
7
|
+
return join(struthHome(), "daemon.pid");
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Check whether the daemon process is currently running.
|
|
12
|
+
*/
|
|
13
|
+
export async function isDaemonRunning(): Promise<boolean> {
|
|
14
|
+
try {
|
|
15
|
+
const raw = await readFile(pidFilePath(), "utf-8");
|
|
16
|
+
const pid = Number.parseInt(raw.trim(), 10);
|
|
17
|
+
process.kill(pid, 0);
|
|
18
|
+
return true;
|
|
19
|
+
} catch {
|
|
20
|
+
// No PID file or process not running — clean up stale file
|
|
21
|
+
try {
|
|
22
|
+
await rm(pidFilePath());
|
|
23
|
+
} catch {
|
|
24
|
+
// No file to remove
|
|
25
|
+
}
|
|
26
|
+
return false;
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Start the daemon as a detached background process.
|
|
32
|
+
*/
|
|
33
|
+
export async function startDaemon(intervalHours = 24): Promise<void> {
|
|
34
|
+
if (await isDaemonRunning()) {
|
|
35
|
+
process.stderr.write("Daemon already running\n");
|
|
36
|
+
return;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
const cliPath = join(import.meta.dir ?? ".", "../cli/index.ts");
|
|
40
|
+
const child = spawnDetached(
|
|
41
|
+
["bun", "run", cliPath, "daemon", "run", "--interval", String(intervalHours)],
|
|
42
|
+
{ stdio: ["ignore", "ignore", "inherit"], detached: true },
|
|
43
|
+
);
|
|
44
|
+
|
|
45
|
+
child.unref();
|
|
46
|
+
|
|
47
|
+
await writeFile(pidFilePath(), String(child.pid), "utf-8");
|
|
48
|
+
process.stderr.write(`Daemon started (PID: ${child.pid}, interval: ${intervalHours}h)\n`);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Stop the running daemon process.
|
|
53
|
+
*/
|
|
54
|
+
export async function stopDaemon(): Promise<void> {
|
|
55
|
+
let pid: number;
|
|
56
|
+
try {
|
|
57
|
+
const raw = await readFile(pidFilePath(), "utf-8");
|
|
58
|
+
pid = Number.parseInt(raw.trim(), 10);
|
|
59
|
+
} catch {
|
|
60
|
+
process.stderr.write("No daemon running\n");
|
|
61
|
+
return;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
try {
|
|
65
|
+
process.kill(pid, 0);
|
|
66
|
+
// Process is alive — send SIGTERM
|
|
67
|
+
process.kill(pid, "SIGTERM");
|
|
68
|
+
process.stderr.write(`Daemon stopped (PID: ${pid})\n`);
|
|
69
|
+
} catch {
|
|
70
|
+
// Process not alive — stale PID
|
|
71
|
+
process.stderr.write("Cleaned up stale PID file\n");
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
await rm(pidFilePath());
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Run the daemon loop (called by the spawned process).
|
|
79
|
+
*/
|
|
80
|
+
export async function runDaemonLoop(intervalHours: number): Promise<never> {
|
|
81
|
+
const { cleanupOrphans, refreshAll } = await import("./refresh.js");
|
|
82
|
+
|
|
83
|
+
await cleanupOrphans();
|
|
84
|
+
process.stderr.write(`Daemon starting, interval: ${intervalHours}h\n`);
|
|
85
|
+
|
|
86
|
+
process.on("SIGTERM", () => {
|
|
87
|
+
process.stderr.write("Shutting down...\n");
|
|
88
|
+
process.exit(0);
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
while (true) {
|
|
92
|
+
await refreshAll();
|
|
93
|
+
await Bun.sleep(intervalHours * 3600 * 1000);
|
|
94
|
+
}
|
|
95
|
+
}
|