npm - struth - Versions diffs - 1.0.0 - Mend

struth 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/COMPLIANCE.md +41 -0
package/LICENSE +21 -0
package/README.md +135 -0
package/package.json +54 -0
package/src/cli/index.ts +244 -0
package/src/core/constants.ts +32 -0
package/src/core/pipeline/clean.ts +246 -0
package/src/core/pipeline/condense.ts +249 -0
package/src/core/pipeline/discover.ts +448 -0
package/src/core/pipeline/integrity.ts +214 -0
package/src/core/pipeline/organize.ts +184 -0
package/src/core/schemas.ts +204 -0
package/src/core/spawn.ts +22 -0
package/src/core/storage/index.ts +108 -0
package/src/core/storage/paths.ts +40 -0
package/src/core/types.ts +36 -0
package/src/daemon/process.ts +95 -0
package/src/daemon/refresh.ts +254 -0
package/src/mcp/fts5-index.ts +114 -0
package/src/mcp/fts5-search.ts +150 -0
package/src/mcp/lockfile.ts +135 -0
package/src/mcp/retrieval.ts +141 -0
package/src/mcp/schemas.ts +12 -0
package/src/mcp/server.ts +293 -0
package/src/telemetry/client.ts +36 -0
package/src/telemetry/schemas.ts +5 -0

package/COMPLIANCE.md ADDED Viewed

@@ -0,0 +1,41 @@
+# Compliance
+## Content Attribution
+Struth indexes publicly available documentation. Each served section includes full provenance metadata: the source URL, fetch timestamp, and content hash. Attribution is automatic and immutable -- it is embedded at index time and cannot be stripped from query results.
+## robots.txt Compliance
+Struth respects robots.txt directives. The discover stage checks each site's robots.txt before crawling any pages.
+Struth identifies itself with the User-Agent `Struth-Bot`. Site owners can block indexing by adding the following to their robots.txt:
+```
+User-agent: Struth-Bot
+Disallow: /
+```
+## Takedown Process
+Content owners can request removal of their indexed documentation through any of the following channels:
+1. **GitHub issue** -- Open an issue at [github.com/tzioup/struth](https://github.com/tzioup/struth) with the subject line "Takedown Request" and the URLs to be removed.
+2. **Email** -- Send a request to takedown@struth.dev with the affected URLs.
+Struth will remove indexed content within 48 hours of a valid request.
+## Data Storage
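+All indexed data is stored locally on the user's machine at `~/.struth/`. No content is uploaded to any central server. After the initial crawl, queries are served entirely from the local index. Network access is needed again only for features that contact the outside world by design, such as the optional freshness daemon re-checking indexed sources or opt-in telemetry.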
+## Telemetry
+Telemetry is optional and opt-in only. It is disabled by default.
+To enable telemetry, set the environment variable:
+```bash
+export STRUTH_TELEMETRY=on
+```
+When enabled, Struth collects anonymous usage events: query count, library name, and latency, together with the event type, the slugs of the sections involved, the client version, and a timestamp. No documentation content is ever transmitted. Telemetry can be disabled at any time by unsetting the variable or setting it to any value other than `on`.

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Leo Gester
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md ADDED Viewed

@@ -0,0 +1,135 @@
+# Struth
+> A trust layer between LLMs and the living web.
+## What it does
+Struth pre-indexes documentation sites into an LLM-optimised format and serves them via the Model Context Protocol (MCP). It crawls, cleans, optionally condenses, and organises documentation into searchable sections stored locally on your machine. LLMs can then query this indexed documentation through an MCP server with full-text search, provenance tracking, and freshness guarantees.
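+## Installation
+The `struth` executable is a TypeScript entry point run by [Bun](https://bun.sh) (its shebang is `#!/usr/bin/env bun`), so Bun must be installed and on your `PATH` even when you install or run Struth through npm or npx.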
+```bash
+# npm
+npm install -g struth
+# npx (no install)
+npx struth mirror <url>
+# bun
+bun install -g struth
+```
+## Quick Start
+```bash
+# Index a documentation site
+struth mirror https://docs.example.com --name example
+# See what you've indexed
+struth list
+```
+## Usage
+### `struth mirror <url>`
+Index a documentation site.
+```bash
+struth mirror https://docs.example.com --name example --top 50 --condense
+```
+| Flag | Description |
+|------|-------------|
+| `--name NAME` | Human-readable name for the doc set |
+| `--condense` | Enable LLM condensing to reduce token count |
+| `--api` | Use API instead of CLI for condensing |
+| `--top N` | Max pages to index (default: 20) |
+| `--concurrency N` | Max concurrent operations (default: 3) |
+| `--filter QUERY` | Keep only URLs matching this query |
+| `--exclude-path PATHS` | Comma-separated path patterns to exclude |
+| `--fetch-strategy STR` | `auto`, `readability`, or `jina` (default: `auto`) |
+### `struth list`
+List all indexed doc sets.
+```bash
+struth list
+```
+### `struth daemon start|stop|status`
+Manage the freshness daemon, which periodically re-checks indexed documentation for updates.
+```bash
+# Start with default 24-hour interval
+struth daemon start
+# Start with custom interval
+struth daemon start --interval 12
+# Check status
+struth daemon status
+# Stop
+struth daemon stop
+```
+| Flag | Description |
+|------|-------------|
+| `--interval HOURS` | Check interval in hours (default: 24) |
+### `struth mcp serve`
+Start the MCP server (stdio transport) for use with LLM clients.
+```bash
+struth mcp serve
+```
+## MCP Setup
+Add Struth to your Claude Desktop configuration:
+```json
+{
+  "mcpServers": {
+    "struth": {
+      "command": "npx",
+      "args": ["-y", "struth", "mcp", "serve"]
+    }
+  }
+}
+```
+## Architecture
+Struth processes documentation through a five-stage pipeline:
+1. **Discover** -- Crawl the target site, detect platform (e.g. Docusaurus, GitBook), extract URLs, deduplicate, and respect robots.txt.
+2. **Clean** -- Fetch each page, strip navigation and boilerplate, convert to clean Markdown via Readability + Turndown.
+3. **Condense** -- Optionally compress content using an LLM to reduce token count while preserving meaning.
+4. **Organize** -- Group pages into logical sections and generate a manifest with metadata and provenance.
+5. **Storage** -- Write sections to `~/.struth/libraries/` with content hashes and timestamps.
+**Search** uses FTS5 with BM25 scoring; `struth mirror` builds the FTS5 index as a final step after writing to storage. Cross-library queries use Reciprocal Rank Fusion (RRF) to merge results from multiple indexed doc sets.
+**Freshness** is maintained by a background daemon that periodically checks indexed sources. It uses HTTP HEAD requests as a pre-filter, only re-fetching pages whose headers indicate a change, then compares content hashes to detect actual modifications.
+## Contributing
+1. Fork the repository
+2. Create a feature branch (`git checkout -b feature/my-feature`)
+3. Make your changes
+4. Run the test suite and checks:
+   ```bash
+   bun test
+   bun run typecheck
+   bun run lint
+   ```
+5. Commit and open a pull request
+## License
+MIT

package/package.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+  "name": "struth",
+  "version": "1.0.0",
+  "type": "module",
+  "description": "LLM-optimised documentation indexing and serving via MCP",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/tzioup/struth.git"
+  },
+  "homepage": "https://github.com/tzioup/struth",
+  "keywords": [
+    "mcp",
+    "documentation",
+    "llm",
+    "indexing",
+    "model-context-protocol",
+    "ai",
+    "search",
+    "fts5"
+  ],
+  "bin": {
+    "struth": "./src/cli/index.ts"
+  },
+  "files": [
+    "src/",
+    "LICENSE",
+    "README.md",
+    "COMPLIANCE.md"
+  ],
+  "scripts": {
+    "dev": "bun run src/cli/index.ts",
+    "test": "bun test",
+    "typecheck": "tsc --noEmit",
+    "lint": "biome check src/",
+    "build": "bun build --compile --outfile=struth src/cli/index.ts"
+  },
+  "dependencies": {
+    "@anthropic-ai/sdk": "^0.88.0",
+    "@modelcontextprotocol/sdk": "^1",
+    "@mozilla/readability": "^0.5",
+    "linkedom": "^0.16",
+    "turndown": "^7",
+    "yaml": "^2.8.3",
+    "zod": "^3"
+  },
+  "devDependencies": {
+    "@biomejs/biome": "^1",
+    "@types/turndown": "^5",
+    "bun-types": "^1.3.12",
+    "typescript": "^5",
+    "vitest": "^3"
+  }
+}

package/src/cli/index.ts ADDED Viewed

@@ -0,0 +1,244 @@
+#!/usr/bin/env bun
+/**
+ * Struth CLI entry point.
+ *
+ * Commands:
+ *   struth mirror <url> [flags]  — Index a documentation site
+ *   struth mcp serve             — Start MCP server
+ *   struth daemon start|stop|status — Manage freshness daemon
+ *   struth list                  — List indexed doc sets
+ */
+import { CLIENT_VERSION } from "../core/constants.js";
+import { cleanPages } from "../core/pipeline/clean.js";
+import { condensePages } from "../core/pipeline/condense.js";
+import { discover } from "../core/pipeline/discover.js";
+import { organize } from "../core/pipeline/organize.js";
+import { listDocSets, writeDocSet } from "../core/storage/index.js";
+import { sectionsDir } from "../core/storage/paths.js";
+import { isDaemonRunning, runDaemonLoop, startDaemon, stopDaemon } from "../daemon/process.js";
+import { buildFts5Index } from "../mcp/fts5-index.js";
+import { startMcpServer } from "../mcp/server.js";
+import { sendEvent } from "../telemetry/client.js";
+/**
+ * Parse raw CLI arguments into a command, an optional positional argument, and flags.
+ *
+ * - `args[0]` is the command; it defaults to `"help"` when absent.
+ * - `args[1]` is returned as `url` when it is present and does not start with `--`
+ *   (callers also use this slot for subcommands such as `serve` or `start`).
+ * - `--key value` yields a string flag, `--key=value` is split on the first `=`,
+ *   and a `--key` that is followed by another flag (or by nothing) yields `true`.
+ * - Tokens that do not start with `--` and were not consumed as a flag value are ignored.
+ *
+ * @param args - Arguments after the executable and script path.
+ * @returns The parsed command, positional argument, and flag map.
+ */
+function parseArgs(args: string[]): {
+	command: string;
+	url?: string;
+	flags: Record<string, string | boolean>;
+} {
+	const command = args[0] ?? "help";
+	const url = args[1] && !args[1].startsWith("--") ? args[1] : undefined;
+	const flags: Record<string, string | boolean> = {};
+	for (let i = url ? 2 : 1; i < args.length; i++) {
+		const arg = args[i];
+		if (arg === undefined || !arg.startsWith("--")) continue;
+		const key = arg.slice(2);
+		const eq = key.indexOf("=");
+		if (eq > 0) {
+			// Inline form (`--top=50`): without this split the whole token would be
+			// stored as a boolean flag named "top=50" and the value silently ignored.
+			flags[key.slice(0, eq)] = key.slice(eq + 1);
+			continue;
+		}
+		const next = args[i + 1];
+		if (next && !next.startsWith("--")) {
+			flags[key] = next;
+			i++; // the following token was consumed as this flag's value
+		} else {
+			flags[key] = true;
+		}
+	}
+	return { command, url, flags };
+}
+/** Fetch strategies accepted by `--fetch-strategy`. */
+const FETCH_STRATEGIES = ["auto", "readability", "jina"] as const;
+/**
+ * Read a flag that must be an integer >= 1.
+ *
+ * @param flags - Parsed CLI flags.
+ * @param key - Flag name without the leading `--`.
+ * @param fallback - Value used when the flag is absent or was given without a value.
+ * @returns The parsed integer, or `fallback`.
+ * @throws Error when a value was supplied but does not parse to an integer >= 1.
+ */
+function positiveIntFlag(
+	flags: Record<string, string | boolean>,
+	key: string,
+	fallback: number,
+): number {
+	const raw = flags[key];
+	if (typeof raw !== "string") return fallback;
+	const parsed = Number.parseInt(raw, 10);
+	if (Number.isNaN(parsed) || parsed < 1) {
+		throw new Error(`Invalid value for --${key}: "${raw}" (expected a positive integer)`);
+	}
+	return parsed;
+}
+/**
+ * Resolve `--fetch-strategy`, rejecting values outside the supported set instead of
+ * asserting the type of unchecked user input.
+ *
+ * @param flags - Parsed CLI flags.
+ * @returns The chosen strategy, or `"auto"` when the flag is absent.
+ * @throws Error when the supplied value is not a supported strategy.
+ */
+function fetchStrategyFlag(
+	flags: Record<string, string | boolean>,
+): (typeof FETCH_STRATEGIES)[number] {
+	const raw = flags["fetch-strategy"];
+	if (typeof raw !== "string") return "auto";
+	const strategy = FETCH_STRATEGIES.find((candidate) => candidate === raw);
+	if (strategy === undefined) {
+		throw new Error(
+			`Invalid value for --fetch-strategy: "${raw}" (expected one of ${FETCH_STRATEGIES.join(", ")})`,
+		);
+	}
+	return strategy;
+}
+/**
+ * Index a documentation site: discover URLs, clean pages, optionally condense them,
+ * organize them into a manifest, write the doc set to storage, build the FTS5 index,
+ * and finally send an `index` telemetry event. Progress is logged to stderr.
+ *
+ * Exits the process with status 1 when discovery returns no URLs.
+ *
+ * @param url - Root URL of the documentation site.
+ * @param flags - Parsed CLI flags (`name`, `condense`, `api`, `concurrency`, `top`,
+ *   `filter`, `exclude-path`, `fetch-strategy`).
+ * @throws Error when `--top`, `--concurrency`, or `--fetch-strategy` has an invalid value.
+ */
+async function mirror(url: string, flags: Record<string, string | boolean>): Promise<void> {
+	const mirrorStart = Date.now();
+	const name = typeof flags.name === "string" ? flags.name : undefined;
+	const condense = flags.condense === true;
+	const condenseApi = flags.api === true;
+	// Validate numeric flags up front so NaN or non-positive values never reach the pipeline.
+	const concurrency = positiveIntFlag(flags, "concurrency", 3);
+	const top = positiveIntFlag(flags, "top", 20);
+	const filter = typeof flags.filter === "string" ? flags.filter : undefined;
+	// Drop empty entries left by stray commas ("a,,b" or "a,"); an empty pattern is never
+	// a meaningful exclusion. NOTE(review): depending on how discover matches patterns,
+	// an empty one could match every path — confirm.
+	const excludePath =
+		typeof flags["exclude-path"] === "string"
+			? flags["exclude-path"]
+					.split(",")
+					.map((s) => s.trim())
+					.filter((s) => s.length > 0)
+			: [];
+	const fetchStrategy = fetchStrategyFlag(flags);
+	// Stage 1: Discover
+	console.error("[struth] Stage 1: Discovering pages...");
+	const discoverResult = await discover(url, {
+		filter,
+		excludePath,
+		exclude: [],
+		top,
+	});
+	console.error(
+		`[struth] Found ${discoverResult.total_found} URLs (${discoverResult.after_dedup} after dedup) via ${discoverResult.source_method}`,
+	);
+	if (discoverResult.platform_detected) {
+		console.error(`[struth] Platform detected: ${discoverResult.platform_detected}`);
+	}
+	if (discoverResult.urls.length === 0) {
+		console.error("[struth] No pages found. Exiting.");
+		process.exit(1);
+	}
+	// Stage 2: Clean
+	console.error(`[struth] Stage 2: Cleaning ${discoverResult.urls.length} pages...`);
+	const urls = discoverResult.urls.map((u) => u.url);
+	const cleanedPages = await cleanPages(urls, { fetchStrategy, concurrency });
+	console.error(`[struth] Cleaned ${cleanedPages.length} pages`);
+	// Stage 3: Condense (optional)
+	let condensedPages: Awaited<ReturnType<typeof condensePages>>;
+	if (condense) {
+		console.error(`[struth] Stage 3: Condensing ${cleanedPages.length} pages...`);
+		condensedPages = await condensePages(cleanedPages, {
+			condense: true,
+			condenseApi,
+			concurrency,
+		});
+		const condensedCount = condensedPages.filter((p) => p.condensed).length;
+		console.error(`[struth] Condensed ${condensedCount}/${condensedPages.length} pages`);
+	} else {
+		// Skip condense — pass the cleaned content through in the condensed-page shape
+		// so the later stages see one uniform type.
+		condensedPages = cleanedPages.map((page) => ({
+			url: page.url,
+			slug: page.slug,
+			section: page.section,
+			content_condensed: page.content_clean,
+			content_clean: page.content_clean,
+			word_count_clean: page.word_count,
+			word_count_condensed: page.word_count,
+			condensed: false,
+			condense_method: "skipped" as const,
+		}));
+		console.error("[struth] Stage 3: Condense skipped (use --condense to enable)");
+	}
+	// Stage 4: Organize
+	console.error("[struth] Stage 4: Organizing into sections...");
+	const manifest = await organize(condensedPages, url, { name });
+	console.error(
+		`[struth] Organized into ${manifest.sections.length} sections, ${manifest.pages.length} pages`,
+	);
+	// Stage 5: Write to storage
+	console.error("[struth] Stage 5: Writing to storage...");
+	const pages = condensedPages.map((p) => ({
+		slug: p.slug,
+		clean: p.content_clean,
+		condensed: p.content_condensed,
+	}));
+	await writeDocSet(manifest, pages);
+	console.error(`[struth] Written to ~/.struth/libraries/${manifest.name}/`);
+	// Stage 6: Build FTS5 search index
+	console.error("[struth] Stage 6: Building search index...");
+	const sDir = sectionsDir(manifest.name, manifest.version);
+	await buildFts5Index(manifest.name, manifest.version, manifest, sDir);
+	console.error("[struth] Search index built.");
+	// Telemetry: send index event (await to prevent dropped events on exit)
+	await sendEvent({
+		event: "index",
+		library: manifest.name,
+		sections_hit: manifest.sections.map((s) => s.slug),
+		latency_ms: Date.now() - mirrorStart,
+		client_version: CLIENT_VERSION,
+		timestamp: new Date().toISOString(),
+	});
+	console.error("[struth] Done.");
+}
+/**
+ * Print every indexed doc set to stdout as `name[ vVERSION]  path`, or a hint
+ * pointing at `struth mirror` when nothing has been indexed yet.
+ */
+async function list(): Promise<void> {
+	const indexed = await listDocSets();
+	if (indexed.length === 0) {
+		console.log("No indexed doc sets. Use 'struth mirror <url>' to index one.");
+		return;
+	}
+	console.log("Indexed doc sets:");
+	indexed.forEach((entry) => {
+		// The version suffix is omitted entirely when the doc set has no version.
+		const suffix = entry.version ? ` v${entry.version}` : "";
+		console.log(`  ${entry.name}${suffix}  ${entry.path}`);
+	});
+}
+/**
+ * Resolve the daemon check interval, in hours, from `--interval`.
+ *
+ * @param flags - Parsed CLI flags.
+ * @returns The parsed interval, or 24 when the flag is absent or was given without a value.
+ * @throws Error when a value was supplied but does not parse to an integer >= 1.
+ */
+function intervalHoursFlag(flags: Record<string, string | boolean>): number {
+	const raw = flags.interval;
+	if (typeof raw !== "string") return 24;
+	const hours = Number.parseInt(raw, 10);
+	if (Number.isNaN(hours) || hours < 1) {
+		throw new Error(
+			`Invalid value for --interval: "${raw}" (expected a positive integer number of hours)`,
+		);
+	}
+	return hours;
+}
+/**
+ * CLI dispatcher: parses `process.argv` and runs the matching command
+ * (`mirror`, `mcp serve`, `daemon start|stop|status|run`, `list`).
+ *
+ * Any other command prints the usage text. The process then exits with status 0 when
+ * help was requested explicitly (`help`, `--help`, `-h`) and with status 1 when the
+ * command is missing or unknown. Malformed subcommands also exit with status 1.
+ *
+ * @throws Error when `--interval` has an invalid value (propagated from the helper),
+ *   plus whatever the invoked command rejects with.
+ */
+async function main(): Promise<void> {
+	const argv = process.argv.slice(2);
+	const { command, url, flags } = parseArgs(argv);
+	switch (command) {
+		case "mirror": {
+			if (!url) {
+				console.error("Usage: struth mirror <url> [--name NAME] [--condense] [--top N]");
+				process.exit(1);
+			}
+			await mirror(url, flags);
+			break;
+		}
+		case "mcp": {
+			const subcommand = url; // "serve" expected as second arg
+			if (subcommand !== "serve") {
+				console.error("Usage: struth mcp serve");
+				process.exit(1);
+			}
+			await startMcpServer();
+			break;
+		}
+		case "daemon": {
+			const daemonSub = url;
+			if (daemonSub === "start") {
+				await startDaemon(intervalHoursFlag(flags));
+			} else if (daemonSub === "stop") {
+				await stopDaemon();
+			} else if (daemonSub === "status") {
+				const running = await isDaemonRunning();
+				console.log(running ? "Daemon is running" : "Daemon is not running");
+			} else if (daemonSub === "run") {
+				// Internal: called by startDaemon as detached child
+				await runDaemonLoop(intervalHoursFlag(flags));
+			} else {
+				console.error("Usage: struth daemon start|stop|status [--interval HOURS]");
+				process.exit(1);
+			}
+			break;
+		}
+		case "list":
+			await list();
+			break;
+		default: {
+			console.log("Usage: struth <mirror|mcp|daemon|list> [options]");
+			console.log("");
+			console.log("Commands:");
+			console.log("  mirror <url>   Index a documentation site");
+			console.log("  list           List indexed doc sets");
+			console.log("  mcp serve      Start MCP server (stdio transport)");
+			console.log("  daemon start   Start freshness daemon [--interval HOURS]");
+			console.log("  daemon stop    Stop freshness daemon");
+			console.log("  daemon status  Check if daemon is running");
+			console.log("");
+			console.log("Mirror flags:");
+			console.log("  --name NAME           Human-readable name");
+			console.log("  --condense            Enable LLM condensing");
+			console.log("  --api                 Use API instead of CLI for condensing");
+			console.log("  --top N               Max pages to index (default: 20)");
+			console.log("  --concurrency N       Max concurrent operations (default: 3)");
+			console.log("  --filter QUERY        Keep only matching URLs");
+			console.log("  --exclude-path PATHS  Comma-separated path patterns to exclude");
+			console.log("  --fetch-strategy STR  auto|readability|jina (default: auto)");
+			// Explicitly asking for help is a success; a missing or unknown command is an error.
+			const helpRequested = argv[0] === "help" || argv[0] === "--help" || argv[0] === "-h";
+			process.exit(helpRequested ? 0 : 1);
+		}
+	}
+}
+// Entry point: run the CLI; any rejection is reported on stderr and the
+// process exits with status 1.
+main().catch((failure) => {
+	console.error(failure);
+	process.exit(1);
+});

package/src/core/constants.ts ADDED Viewed

@@ -0,0 +1,32 @@
+export { SCHEMA_VERSION } from "./schemas.js";
+// package.json is the single source for the version string, so CLIENT_VERSION
+// and USER_AGENT cannot drift from the published version.
+import pkg from "../../package.json";
+
+// --- Storage ---
+/**
+ * Default storage location for indexed doc sets.
+ * NOTE(review): the leading `~` is a literal character here; presumably it is
+ * expanded to the user's home directory elsewhere — confirm.
+ */
+export const STRUTH_HOME = "~/.struth";
+/** Default library storage path */
+export const LIBRARIES_DIR = "libraries";
+
+// --- Pipeline defaults ---
+/** Default concurrency for pipeline operations */
+export const DEFAULT_CONCURRENCY = 3;
+/** Max concurrency for pipeline operations */
+export const MAX_CONCURRENCY = 20;
+/** Default number of top pages to index */
+export const DEFAULT_TOP = 20;
+
+// --- MCP ---
+/** Default max sections returned by MCP get_docs */
+export const DEFAULT_MAX_SECTIONS = 5;
+
+// --- Telemetry and identification ---
+/** Telemetry endpoint */
+export const TELEMETRY_ENDPOINT = "https://struth-telemetry.workers.dev";
+/** Client version, taken from package.json */
+export const CLIENT_VERSION: string = pkg.version;
+/** User agent for fetch requests: `Struth-Bot/<package version>` */
+export const USER_AGENT = `Struth-Bot/${pkg.version}`;
+
+// --- Freshness daemon ---
+/** Freshness check interval (hours) */
+export const DEFAULT_CHECK_INTERVAL_HOURS = 24;