struth 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/COMPLIANCE.md ADDED
@@ -0,0 +1,41 @@
1
+ # Compliance
2
+
3
+ ## Content Attribution
4
+
5
+ Struth indexes publicly available documentation. Each served section includes full provenance metadata: the source URL, fetch timestamp, and content hash. Attribution is automatic and immutable -- it is embedded at index time and cannot be stripped from query results.
6
+
7
+ ## robots.txt Compliance
8
+
9
+ Struth respects robots.txt directives. The discover stage checks each site's robots.txt before crawling any pages.
10
+
11
+ Struth identifies itself with the User-Agent `Struth-Bot`. Site owners can block indexing by adding the following to their robots.txt:
12
+
13
+ ```
14
+ User-agent: Struth-Bot
15
+ Disallow: /
16
+ ```
17
+
18
+ ## Takedown Process
19
+
20
+ Content owners can request removal of their indexed documentation through any of the following channels:
21
+
22
+ 1. **GitHub issue** -- Open an issue at [github.com/tzioup/struth](https://github.com/tzioup/struth) with the subject line "Takedown Request" and the URLs to be removed.
23
+ 2. **Email** -- Send a request to takedown@struth.dev with the affected URLs.
24
+
25
+ Struth will remove indexed content within 48 hours of a valid request.
26
+
27
+ ## Data Storage
28
+
29
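+ All indexed data is stored locally on the user's machine at `~/.struth/`. No content is uploaded to any central server. After the initial crawl, queries are served from the local index; network access is needed again only when a site is re-indexed or when the optional freshness daemon re-checks sources for updates.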
30
+
31
+ ## Telemetry
32
+
33
+ Telemetry is optional and opt-in only. It is disabled by default.
34
+
35
+ To enable telemetry, set the environment variable:
36
+
37
+ ```bash
38
+ export STRUTH_TELEMETRY=on
39
+ ```
40
+
41
+ When enabled, Struth collects anonymous usage events: query count, library name, and latency. Index events additionally include the slugs of the indexed sections, the client version, and a timestamp. No documentation content is ever transmitted. Telemetry can be disabled at any time by unsetting the variable or setting it to any value other than `on`.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Leo Gester
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,135 @@
1
+ # Struth
2
+
3
+ > A trust layer between LLMs and the living web.
4
+
5
+ ## What it does
6
+
7
+ Struth pre-indexes documentation sites into an LLM-optimised format and serves them via the Model Context Protocol (MCP). It crawls, cleans, optionally condenses, and organises documentation into searchable sections stored locally on your machine. LLMs can then query this indexed documentation through an MCP server with full-text search, provenance tracking, and freshness guarantees.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ # npm (the `struth` executable is run by Bun, so Bun must be installed)
13
+ npm install -g struth
14
+
15
+ # npx (no install)
16
+ npx struth mirror <url>
17
+
18
+ # bun
19
+ bun install -g struth
20
+ ```
21
+
22
+ ## Quick Start
23
+
24
+ ```bash
25
+ # Index a documentation site
26
+ struth mirror https://docs.example.com --name example
27
+
28
+ # See what you've indexed
29
+ struth list
30
+ ```
31
+
32
+ ## Usage
33
+
34
+ ### `struth mirror <url>`
35
+
36
+ Index a documentation site.
37
+
38
+ ```bash
39
+ struth mirror https://docs.example.com --name example --top 50 --condense
40
+ ```
41
+
42
+ | Flag | Description |
43
+ |------|-------------|
44
+ | `--name NAME` | Human-readable name for the doc set |
45
+ | `--condense` | Enable LLM condensing to reduce token count |
46
+ | `--api` | Use API instead of CLI for condensing |
47
+ | `--top N` | Max pages to index (default: 20) |
48
+ | `--concurrency N` | Max concurrent operations (default: 3) |
49
+ | `--filter QUERY` | Keep only URLs matching this query |
50
+ | `--exclude-path PATHS` | Comma-separated path patterns to exclude |
51
+ | `--fetch-strategy STR` | `auto`, `readability`, or `jina` (default: `auto`) |
52
+
53
+ ### `struth list`
54
+
55
+ List all indexed doc sets.
56
+
57
+ ```bash
58
+ struth list
59
+ ```
60
+
61
+ ### `struth daemon start|stop|status`
62
+
63
+ Manage the freshness daemon, which periodically re-checks indexed documentation for updates.
64
+
65
+ ```bash
66
+ # Start with default 24-hour interval
67
+ struth daemon start
68
+
69
+ # Start with custom interval
70
+ struth daemon start --interval 12
71
+
72
+ # Check status
73
+ struth daemon status
74
+
75
+ # Stop
76
+ struth daemon stop
77
+ ```
78
+
79
+ | Flag | Description |
80
+ |------|-------------|
81
+ | `--interval HOURS` | Check interval in hours (default: 24) |
82
+
83
+ ### `struth mcp serve`
84
+
85
+ Start the MCP server (stdio transport) for use with LLM clients.
86
+
87
+ ```bash
88
+ struth mcp serve
89
+ ```
90
+
91
+ ## MCP Setup
92
+
93
+ Add Struth to your Claude Desktop configuration:
94
+
95
+ ```json
96
+ {
97
+ "mcpServers": {
98
+ "struth": {
99
+ "command": "npx",
100
+ "args": ["-y", "struth", "mcp", "serve"]
101
+ }
102
+ }
103
+ }
104
+ ```
105
+
106
+ ## Architecture
107
+
108
+ Struth processes documentation through a five-stage pipeline:
109
+
110
+ 1. **Discover** -- Crawl the target site, detect platform (e.g. Docusaurus, GitBook), extract URLs, deduplicate, and respect robots.txt.
111
+ 2. **Clean** -- Fetch each page, strip navigation and boilerplate, convert to clean Markdown via Readability + Turndown.
112
+ 3. **Condense** -- Optionally compress content using an LLM to reduce token count while preserving meaning.
113
+ 4. **Organize** -- Group pages into logical sections and generate a manifest with metadata and provenance.
114
+ 5. **Storage** -- Write sections to `~/.struth/libraries/` with content hashes and timestamps.
115
+
116
+ **Search** uses FTS5 with BM25 scoring. Cross-library queries use Reciprocal Rank Fusion (RRF) to merge results from multiple indexed doc sets.
117
+
118
+ **Freshness** is maintained by a background daemon that periodically checks indexed sources. It uses HTTP HEAD requests as a pre-filter, only re-fetching pages whose headers indicate a change, then compares content hashes to detect actual modifications.
119
+
120
+ ## Contributing
121
+
122
+ 1. Fork the repository
123
+ 2. Create a feature branch (`git checkout -b feature/my-feature`)
124
+ 3. Make your changes
125
+ 4. Run the test suite and checks:
126
+ ```bash
127
+ bun test
128
+ bun run typecheck
129
+ bun run lint
130
+ ```
131
+ 5. Commit and open a pull request
132
+
133
+ ## License
134
+
135
+ MIT
package/package.json ADDED
@@ -0,0 +1,54 @@
1
+ {
2
+ "name": "struth",
3
+ "version": "1.0.0",
4
+ "type": "module",
5
+ "description": "LLM-optimised documentation indexing and serving via MCP",
6
+ "license": "MIT",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "https://github.com/tzioup/struth.git"
10
+ },
11
+ "homepage": "https://github.com/tzioup/struth",
12
+ "keywords": [
13
+ "mcp",
14
+ "documentation",
15
+ "llm",
16
+ "indexing",
17
+ "model-context-protocol",
18
+ "ai",
19
+ "search",
20
+ "fts5"
21
+ ],
22
+ "bin": {
23
+ "struth": "./src/cli/index.ts"
24
+ },
25
+ "files": [
26
+ "src/",
27
+ "LICENSE",
28
+ "README.md",
29
+ "COMPLIANCE.md"
30
+ ],
31
+ "scripts": {
32
+ "dev": "bun run src/cli/index.ts",
33
+ "test": "bun test",
34
+ "typecheck": "tsc --noEmit",
35
+ "lint": "biome check src/",
36
+ "build": "bun build --compile --outfile=struth src/cli/index.ts"
37
+ },
38
+ "dependencies": {
39
+ "@anthropic-ai/sdk": "^0.88.0",
40
+ "@modelcontextprotocol/sdk": "^1",
41
+ "@mozilla/readability": "^0.5",
42
+ "linkedom": "^0.16",
43
+ "turndown": "^7",
44
+ "yaml": "^2.8.3",
45
+ "zod": "^3"
46
+ },
47
+ "devDependencies": {
48
+ "@biomejs/biome": "^1",
49
+ "@types/turndown": "^5",
50
+ "bun-types": "^1.3.12",
51
+ "typescript": "^5",
52
+ "vitest": "^3"
53
+ }
54
+ }
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * Struth CLI entry point.
5
+ *
6
+ * Commands:
7
+ * struth mirror <url> [flags] — Index a documentation site
8
+ * struth mcp serve — Start MCP server
9
+ * struth daemon start|stop|status — Manage freshness daemon
10
+ * struth list — List indexed doc sets
11
+ */
12
+
13
+ import { CLIENT_VERSION } from "../core/constants.js";
14
+ import { cleanPages } from "../core/pipeline/clean.js";
15
+ import { condensePages } from "../core/pipeline/condense.js";
16
+ import { discover } from "../core/pipeline/discover.js";
17
+ import { organize } from "../core/pipeline/organize.js";
18
+ import { listDocSets, writeDocSet } from "../core/storage/index.js";
19
+ import { sectionsDir } from "../core/storage/paths.js";
20
+ import { isDaemonRunning, runDaemonLoop, startDaemon, stopDaemon } from "../daemon/process.js";
21
+ import { buildFts5Index } from "../mcp/fts5-index.js";
22
+ import { startMcpServer } from "../mcp/server.js";
23
+ import { sendEvent } from "../telemetry/client.js";
24
+
25
/**
 * Parse the raw CLI arguments (everything after the runtime and script path).
 *
 * Expected layout: `<command> [positional] [--flag [value] | --flag=value]...`
 *
 * - `command` is the first argument, or `"help"` when there are no arguments.
 * - `url` is the second argument when it is non-empty and does not start with
 *   `--`. Callers also reuse it as the subcommand for `mcp` and `daemon`.
 * - A `--flag` followed by a token that does not start with `--` takes that
 *   token as its string value; otherwise the flag is recorded as `true`.
 * - `--flag=value` is split on the first `=`, so the value may itself
 *   contain `=`. The following token is not consumed in this form.
 * - Tokens that are neither flags nor flag values are ignored.
 *
 * @param args Argument vector, typically `process.argv.slice(2)`.
 * @returns The command, the optional positional argument, and the flag map.
 */
function parseArgs(args: string[]): {
  command: string;
  url?: string;
  flags: Record<string, string | boolean>;
} {
  const command = args[0] ?? "help";
  const positional = args[1];
  const url = positional && !positional.startsWith("--") ? positional : undefined;
  const flags: Record<string, string | boolean> = {};

  // Flags start after the positional argument when one was consumed.
  for (let i = url ? 2 : 1; i < args.length; i++) {
    const arg = args[i];
    if (arg === undefined || !arg.startsWith("--")) {
      continue;
    }

    const key = arg.slice(2);

    // Inline form: `--key=value`. Requires a non-empty key before the `=`.
    const eq = key.indexOf("=");
    if (eq > 0) {
      flags[key.slice(0, eq)] = key.slice(eq + 1);
      continue;
    }

    // Separated form: `--key value`, or a bare boolean `--key`.
    const next = args[i + 1];
    if (next && !next.startsWith("--")) {
      flags[key] = next;
      i++; // the value token has been consumed
    } else {
      flags[key] = true;
    }
  }

  return { command, url, flags };
}
50
+
51
/**
 * Read an optional integer-valued flag, insisting on a positive whole number.
 *
 * `Number.parseInt` alone would let `"abc"` through as NaN and `"5x"` through
 * as 5, so the raw text is checked to be digits only before conversion.
 *
 * @param flags Parsed CLI flags.
 * @param key Flag name without the leading `--`.
 * @param fallback Value used when the flag is absent or was given without a value.
 * @returns The parsed integer, or `fallback`.
 * @throws Error when a value was supplied but is not a positive integer.
 */
function parsePositiveIntFlag(
  flags: Record<string, string | boolean>,
  key: string,
  fallback: number,
): number {
  const raw = flags[key];
  if (typeof raw !== "string") {
    return fallback;
  }
  const text = raw.trim();
  const value = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isSafeInteger(value) || value < 1) {
    throw new Error(`Invalid value for --${key}: "${raw}" (expected a positive integer)`);
  }
  return value;
}

/**
 * Validate the `--fetch-strategy` flag instead of blindly casting it.
 *
 * @param raw Raw flag value as produced by the argument parser.
 * @returns The strategy, defaulting to `"auto"` when no string value was given.
 * @throws Error when the value is not one of `auto`, `readability`, `jina`.
 */
function parseFetchStrategyFlag(
  raw: string | boolean | undefined,
): "auto" | "readability" | "jina" {
  if (typeof raw !== "string") {
    return "auto";
  }
  if (raw === "auto" || raw === "readability" || raw === "jina") {
    return raw;
  }
  throw new Error(
    `Invalid value for --fetch-strategy: "${raw}" (expected auto, readability, or jina)`,
  );
}

/**
 * Run the `mirror` command: discover, clean, optionally condense, organize,
 * store, and index a documentation site, reporting progress on stderr.
 *
 * Exits the process with status 1 when discovery yields no URLs.
 *
 * @param url Root URL of the documentation site to index.
 * @param flags Parsed CLI flags (`name`, `condense`, `api`, `concurrency`,
 *   `top`, `filter`, `exclude-path`, `fetch-strategy`).
 * @throws Error when `--concurrency`, `--top` or `--fetch-strategy` carries an
 *   invalid value; errors from the pipeline stages propagate unchanged.
 */
async function mirror(url: string, flags: Record<string, string | boolean>): Promise<void> {
  const mirrorStart = Date.now();
  const name = typeof flags.name === "string" ? flags.name : undefined;
  // Boolean switches only count when given without a value.
  const condense = flags.condense === true;
  const condenseApi = flags.api === true;
  const concurrency = parsePositiveIntFlag(flags, "concurrency", 3);
  const top = parsePositiveIntFlag(flags, "top", 20);
  const filter = typeof flags.filter === "string" ? flags.filter : undefined;
  // Drop empty entries (e.g. from a trailing comma) so that no blank pattern
  // is handed to the discover stage.
  const excludePath =
    typeof flags["exclude-path"] === "string"
      ? flags["exclude-path"]
          .split(",")
          .map((s) => s.trim())
          .filter((s) => s.length > 0)
      : [];
  const fetchStrategy = parseFetchStrategyFlag(flags["fetch-strategy"]);

  // Stage 1: Discover
  console.error("[struth] Stage 1: Discovering pages...");
  const discoverResult = await discover(url, {
    filter,
    excludePath,
    exclude: [],
    top,
  });
  console.error(
    `[struth] Found ${discoverResult.total_found} URLs (${discoverResult.after_dedup} after dedup) via ${discoverResult.source_method}`,
  );
  if (discoverResult.platform_detected) {
    console.error(`[struth] Platform detected: ${discoverResult.platform_detected}`);
  }

  if (discoverResult.urls.length === 0) {
    console.error("[struth] No pages found. Exiting.");
    process.exit(1);
  }

  // Stage 2: Clean
  console.error(`[struth] Stage 2: Cleaning ${discoverResult.urls.length} pages...`);
  const urls = discoverResult.urls.map((u) => u.url);
  const cleanedPages = await cleanPages(urls, { fetchStrategy, concurrency });
  console.error(`[struth] Cleaned ${cleanedPages.length} pages`);

  // Stage 3: Condense (optional)
  let condensedPages: Awaited<ReturnType<typeof condensePages>>;
  if (condense) {
    console.error(`[struth] Stage 3: Condensing ${cleanedPages.length} pages...`);
    condensedPages = await condensePages(cleanedPages, {
      condense: true,
      condenseApi,
      concurrency,
    });
    const condensedCount = condensedPages.filter((p) => p.condensed).length;
    console.error(`[struth] Condensed ${condensedCount}/${condensedPages.length} pages`);
  } else {
    // Skip condense — pass the cleaned content through in the condensed-page
    // shape so the later stages do not need a separate code path.
    condensedPages = cleanedPages.map((page) => ({
      url: page.url,
      slug: page.slug,
      section: page.section,
      content_condensed: page.content_clean,
      content_clean: page.content_clean,
      word_count_clean: page.word_count,
      word_count_condensed: page.word_count,
      condensed: false,
      condense_method: "skipped" as const,
    }));
    console.error("[struth] Stage 3: Condense skipped (use --condense to enable)");
  }

  // Stage 4: Organize
  console.error("[struth] Stage 4: Organizing into sections...");
  const manifest = await organize(condensedPages, url, { name });
  console.error(
    `[struth] Organized into ${manifest.sections.length} sections, ${manifest.pages.length} pages`,
  );

  // Stage 5: Write to storage
  console.error("[struth] Stage 5: Writing to storage...");
  const pages = condensedPages.map((p) => ({
    slug: p.slug,
    clean: p.content_clean,
    condensed: p.content_condensed,
  }));
  await writeDocSet(manifest, pages);
  console.error(`[struth] Written to ~/.struth/libraries/${manifest.name}/`);

  // Stage 6: Build FTS5 search index
  console.error("[struth] Stage 6: Building search index...");
  const sDir = sectionsDir(manifest.name, manifest.version);
  await buildFts5Index(manifest.name, manifest.version, manifest, sDir);
  console.error("[struth] Search index built.");

  // Telemetry: send index event (await to prevent dropped events on exit)
  await sendEvent({
    event: "index",
    library: manifest.name,
    sections_hit: manifest.sections.map((s) => s.slug),
    latency_ms: Date.now() - mirrorStart,
    client_version: CLIENT_VERSION,
    timestamp: new Date().toISOString(),
  });

  console.error("[struth] Done.");
}
157
+
158
/**
 * Print every indexed doc set to stdout, one per line, with its version
 * (when it has one) and its path. Prints a hint instead when nothing has
 * been indexed yet.
 */
async function list(): Promise<void> {
  const indexed = await listDocSets();

  if (indexed.length > 0) {
    console.log("Indexed doc sets:");
    for (const entry of indexed) {
      // Only show a version suffix when the doc set actually carries one.
      let versionSuffix = "";
      if (entry.version) {
        versionSuffix = ` v${entry.version}`;
      }
      console.log(` ${entry.name}${versionSuffix} ${entry.path}`);
    }
    return;
  }

  console.log("No indexed doc sets. Use 'struth mirror <url>' to index one.");
}
170
+
171
/**
 * Resolve the daemon check interval from `--interval`.
 *
 * A bare `Number.parseInt` would turn `"0.5"` into 0 and `"abc"` into NaN and
 * hand those to the daemon, so the value must be digits only and at least 1.
 *
 * @param flags Parsed CLI flags.
 * @returns The interval in hours; 24 when the flag is absent or has no value.
 * @throws Error when a value was supplied but is not a positive integer.
 */
function parseIntervalHours(flags: Record<string, string | boolean>): number {
  const raw = flags.interval;
  if (typeof raw !== "string") {
    return 24;
  }
  const text = raw.trim();
  const hours = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isSafeInteger(hours) || hours < 1) {
    throw new Error(`Invalid value for --interval: "${raw}" (expected a positive integer)`);
  }
  return hours;
}

/**
 * CLI dispatcher: parse `process.argv` and run the requested command.
 *
 * Prints usage and exits with status 1 for an unknown or missing command,
 * and for a known command whose required positional argument is missing.
 */
async function main(): Promise<void> {
  const { command, url, flags } = parseArgs(process.argv.slice(2));

  switch (command) {
    case "mirror": {
      if (!url) {
        console.error("Usage: struth mirror <url> [--name NAME] [--condense] [--top N]");
        process.exit(1);
      }
      await mirror(url, flags);
      break;
    }
    case "mcp": {
      const subcommand = url; // "serve" expected as second arg
      if (subcommand !== "serve") {
        console.error("Usage: struth mcp serve");
        process.exit(1);
      }
      await startMcpServer();
      break;
    }
    case "daemon": {
      // The positional slot carries the daemon subcommand here, not a URL.
      const daemonSub = url;
      if (daemonSub === "start") {
        await startDaemon(parseIntervalHours(flags));
      } else if (daemonSub === "stop") {
        await stopDaemon();
      } else if (daemonSub === "status") {
        const running = await isDaemonRunning();
        console.log(running ? "Daemon is running" : "Daemon is not running");
      } else if (daemonSub === "run") {
        // Internal: called by startDaemon as detached child
        await runDaemonLoop(parseIntervalHours(flags));
      } else {
        console.error("Usage: struth daemon start|stop|status [--interval HOURS]");
        process.exit(1);
      }
      break;
    }
    case "list":
      await list();
      break;
    default:
      console.log("Usage: struth <mirror|mcp|daemon|list> [options]");
      console.log("");
      console.log("Commands:");
      console.log(" mirror <url> Index a documentation site");
      console.log(" list List indexed doc sets");
      console.log(" mcp serve Start MCP server (stdio transport)");
      console.log(" daemon start Start freshness daemon [--interval HOURS]");
      console.log(" daemon stop Stop freshness daemon");
      console.log(" daemon status Check if daemon is running");
      console.log("");
      console.log("Mirror flags:");
      console.log(" --name NAME Human-readable name");
      console.log(" --condense Enable LLM condensing");
      console.log(" --api Use API instead of CLI for condensing");
      console.log(" --top N Max pages to index (default: 20)");
      console.log(" --concurrency N Max concurrent operations (default: 3)");
      console.log(" --filter QUERY Keep only matching URLs");
      console.log(" --exclude-path PATHS Comma-separated path patterns to exclude");
      console.log(" --fetch-strategy STR auto|readability|jina (default: auto)");
      process.exit(1);
  }
}
240
+
241
/** Last-resort handler: report the failure on stderr and exit with status 1. */
const exitWithFailure = (failure: unknown): never => {
  console.error(failure);
  process.exit(1);
};

// Kick off the CLI; any rejection from main() ends the process with a non-zero status.
main().then(undefined, exitWithFailure);
@@ -0,0 +1,32 @@
1
// The package manifest is the single source for the version string, so the
// client version and the user agent cannot drift from the published version.
import pkg from "../../package.json";

export { SCHEMA_VERSION } from "./schemas.js";

// ── Storage ────────────────────────────────────────────────────────────────

/**
 * Root directory under which indexed doc sets are kept.
 * NOTE(review): the leading `~` is a literal character here; presumably it is
 * expanded by whichever module builds real paths — confirm.
 */
export const STRUTH_HOME = "~/.struth";

/** Name of the default directory that holds the indexed libraries. */
export const LIBRARIES_DIR = "libraries";

// ── Pipeline defaults ──────────────────────────────────────────────────────

/** Number of pipeline operations run concurrently unless overridden. */
export const DEFAULT_CONCURRENCY = 3;

/** Upper bound on pipeline concurrency. */
export const MAX_CONCURRENCY = 20;

/** How many top pages are indexed unless overridden. */
export const DEFAULT_TOP = 20;

// ── MCP ────────────────────────────────────────────────────────────────────

/** How many sections MCP `get_docs` returns at most by default. */
export const DEFAULT_MAX_SECTIONS = 5;

// ── Telemetry and identity ─────────────────────────────────────────────────

/** URL of the telemetry endpoint. */
export const TELEMETRY_ENDPOINT = "https://struth-telemetry.workers.dev";

/** Version of this client, taken from package.json. */
export const CLIENT_VERSION: string = pkg.version;

/** User-Agent string for fetch requests: `Struth-Bot/<package version>`. */
export const USER_AGENT = `Struth-Bot/${pkg.version}`;

// ── Freshness ──────────────────────────────────────────────────────────────

/** Default interval between freshness checks, in hours. */
export const DEFAULT_CHECK_INTERVAL_HOURS = 24;