struth 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,448 @@
1
+ import type { z } from "zod";
2
+ import { USER_AGENT } from "../constants.js";
3
+ import { DiscoverResult, type MirrorOptions } from "../schemas.js";
4
+
5
// Subset of the mirror options that influence URL discovery:
// smart mode, include/exclude filtering, and the result-count cap.
type DiscoverOpts = Pick<
  z.infer<typeof MirrorOptions>,
  "smart" | "filter" | "excludePath" | "exclude" | "top"
>;
9
+
10
// ── Non-doc file extensions to filter ───────────────────────────────

// Asset/binary extensions that never hold documentation prose.
// isDocUrl drops any URL whose pathname ends in one of these
// (matched case-insensitively against the last ".ext" segment).
const NON_DOC_EXTENSIONS = new Set([
  ".json",
  ".xml",
  ".txt",
  ".png",
  ".jpg",
  ".svg",
  ".css",
  ".js",
  ".ico",
  ".gif",
  ".woff",
  ".woff2",
  ".ttf",
  ".eot",
  ".pdf",
]);
29
+
30
// ── Platform detection patterns ─────────────────────────────────────

// Ordered docs-platform fingerprints. detectPlatform returns the NAME of
// the first entry whose `test` matches, so order matters when a page could
// match several (e.g. a Sphinx site hosted on ReadTheDocs reports
// "ReadTheDocs"). Tests are plain substring probes against the root page's
// HTML; only GitBook also inspects the response headers.
const PLATFORM_PATTERNS: Array<{
  name: string;
  test: (html: string, headers: Headers) => boolean;
}> = [
  {
    name: "ReadTheDocs",
    test: (html) =>
      html.includes("readthedocs") || html.includes("Read the Docs") || html.includes("rtd.css"),
  },
  {
    name: "GitBook",
    // `=== true` collapses the `boolean | undefined` from optional chaining.
    test: (html, headers) =>
      html.includes("gitbook") || headers.get("x-served-by")?.includes("gitbook") === true,
  },
  {
    name: "Docusaurus",
    test: (html) => html.includes("docusaurus") || html.includes("__docusaurus"),
  },
  {
    name: "MkDocs",
    test: (html) => html.includes("mkdocs") || html.includes("MkDocs"),
  },
  {
    name: "Sphinx",
    test: (html) =>
      html.includes("sphinx") || html.includes("Sphinx") || html.includes("_sphinx_javascript"),
  },
  {
    name: "Mintlify",
    test: (html) => html.includes("mintlify"),
  },
  {
    name: "Nextra",
    test: (html) => html.includes("nextra"),
  },
  {
    name: "VitePress",
    test: (html) => html.includes("vitepress") || html.includes("VitePress"),
  },
  {
    name: "mdBook",
    test: (html) => html.includes("mdbook") || html.includes("mdBook"),
  },
];
76
+
77
// ── Timeouts ────────────────────────────────────────────────────────

// Milliseconds before aborting a request: short for quick existence
// probes (llms.txt, robots.txt), longer for full page/sitemap fetches.
const PROBE_TIMEOUT = 5_000;
const WALK_TIMEOUT = 10_000;
81
+
82
+ // ── Fetch helpers ───────────────────────────────────────────────────
83
+
84
+ async function probe(url: string, timeout = PROBE_TIMEOUT): Promise<Response | null> {
85
+ try {
86
+ const controller = new AbortController();
87
+ const timer = setTimeout(() => controller.abort(), timeout);
88
+ const resp = await fetch(url, {
89
+ headers: { "User-Agent": USER_AGENT },
90
+ signal: controller.signal,
91
+ redirect: "follow",
92
+ });
93
+ clearTimeout(timer);
94
+ return resp.ok ? resp : null;
95
+ } catch {
96
+ return null;
97
+ }
98
+ }
99
+
100
+ async function fetchText(url: string, timeout = WALK_TIMEOUT): Promise<string | null> {
101
+ try {
102
+ const controller = new AbortController();
103
+ const timer = setTimeout(() => controller.abort(), timeout);
104
+ const resp = await fetch(url, {
105
+ headers: { "User-Agent": USER_AGENT },
106
+ signal: controller.signal,
107
+ redirect: "follow",
108
+ });
109
+ clearTimeout(timer);
110
+ if (!resp.ok) return null;
111
+ return await resp.text();
112
+ } catch {
113
+ return null;
114
+ }
115
+ }
116
+
117
+ // ── URL processing ──────────────────────────────────────────────────
118
+
119
+ function normalizeUrl(raw: string): string {
120
+ try {
121
+ const parsed = new URL(raw);
122
+ parsed.hash = ""; // strip fragments
123
+ // normalize trailing slash: remove if path has content beyond /
124
+ if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) {
125
+ parsed.pathname = parsed.pathname.slice(0, -1);
126
+ }
127
+ return parsed.href;
128
+ } catch {
129
+ return raw;
130
+ }
131
+ }
132
+
133
+ function isDocUrl(urlStr: string): boolean {
134
+ try {
135
+ const parsed = new URL(urlStr);
136
+ const ext = parsed.pathname.match(/\.[a-z0-9]+$/i)?.[0]?.toLowerCase();
137
+ if (ext && NON_DOC_EXTENSIONS.has(ext)) return false;
138
+ return true;
139
+ } catch {
140
+ return false;
141
+ }
142
+ }
143
+
144
+ function isSameOrigin(urlStr: string, origin: string): boolean {
145
+ try {
146
+ const parsed = new URL(urlStr);
147
+ return parsed.origin === origin;
148
+ } catch {
149
+ return false;
150
+ }
151
+ }
152
+
153
// A discovered URL plus which strategy produced it (the schema's `source` enum).
type UrlEntry = { url: string; source: z.infer<typeof DiscoverResult>["urls"][number]["source"] };
154
+
155
+ function processUrls(
156
+ raw: UrlEntry[],
157
+ origin: string,
158
+ opts: DiscoverOpts,
159
+ ): { urls: UrlEntry[]; totalFound: number; afterDedup: number } {
160
+ const totalFound = raw.length;
161
+
162
+ // Normalize, filter non-doc, filter external
163
+ let urls = raw
164
+ .map((e) => ({ ...e, url: normalizeUrl(e.url) }))
165
+ .filter((e) => isDocUrl(e.url))
166
+ .filter((e) => isSameOrigin(e.url, origin));
167
+
168
+ // Deduplicate
169
+ const seen = new Set<string>();
170
+ urls = urls.filter((e) => {
171
+ if (seen.has(e.url)) return false;
172
+ seen.add(e.url);
173
+ return true;
174
+ });
175
+
176
+ const afterDedup = urls.length;
177
+
178
+ // Apply excludePath patterns
179
+ if (opts.excludePath && opts.excludePath.length > 0) {
180
+ urls = urls.filter((e) => {
181
+ const path = new URL(e.url).pathname;
182
+ return !opts.excludePath.some((pat) => path.includes(pat));
183
+ });
184
+ }
185
+
186
+ // Apply filter (keep ONLY matching)
187
+ if (opts.filter) {
188
+ const filterStr = opts.filter;
189
+ urls = urls.filter((e) => e.url.includes(filterStr));
190
+ }
191
+
192
+ // Apply top limit
193
+ if (opts.top && opts.top > 0) {
194
+ urls = urls.slice(0, opts.top);
195
+ }
196
+
197
+ return { urls, totalFound, afterDedup };
198
+ }
199
+
200
+ // ── Robots.txt check ────────────────────────────────────────────────
201
+
202
+ async function checkRobots(origin: string): Promise<"allowed" | "blocked" | "no_robots_txt"> {
203
+ const text = await fetchText(`${origin}/robots.txt`, PROBE_TIMEOUT);
204
+ if (text === null) return "no_robots_txt";
205
+
206
+ // Parse robots.txt for Struth-Bot rules
207
+ const lines = text.split("\n").map((l) => l.trim());
208
+ let inStruthBlock = false;
209
+
210
+ for (const line of lines) {
211
+ const lower = line.toLowerCase();
212
+ if (lower.startsWith("user-agent:")) {
213
+ const agent = lower.slice("user-agent:".length).trim();
214
+ inStruthBlock = agent === "struth-bot" || agent === "struth-bot/0.1";
215
+ } else if (inStruthBlock && lower.startsWith("disallow:")) {
216
+ const path = lower.slice("disallow:".length).trim();
217
+ if (path === "/" || path === "/*") return "blocked";
218
+ }
219
+ }
220
+
221
+ return "allowed";
222
+ }
223
+
224
+ // ── Platform detection ──────────────────────────────────────────────
225
+
226
+ function detectPlatform(html: string, headers: Headers): string | null {
227
+ for (const { name, test } of PLATFORM_PATTERNS) {
228
+ if (test(html, headers)) return name;
229
+ }
230
+ return null;
231
+ }
232
+
233
+ // ── Discovery strategies ────────────────────────────────────────────
234
+
235
+ function extractUrlsFromText(text: string, origin: string): string[] {
236
+ // Extract URLs that look like they belong to the same docs site
237
+ const urlRegex = /https?:\/\/[^\s<>"')\]]+/g;
238
+ const matches = text.match(urlRegex) || [];
239
+ return matches.filter((u) => {
240
+ try {
241
+ return new URL(u).origin === origin;
242
+ } catch {
243
+ return false;
244
+ }
245
+ });
246
+ }
247
+
248
+ async function tryLlmsFullTxt(origin: string): Promise<UrlEntry[] | null> {
249
+ const resp = await probe(`${origin}/llms-full.txt`);
250
+ if (!resp) return null;
251
+ const text = await resp.text();
252
+ const urls = extractUrlsFromText(text, origin);
253
+ if (urls.length === 0) return null;
254
+ return urls.map((url) => ({ url, source: "llms_full_txt" as const }));
255
+ }
256
+
257
+ async function tryLlmsTxt(origin: string): Promise<UrlEntry[] | null> {
258
+ const resp = await probe(`${origin}/llms.txt`);
259
+ if (!resp) return null;
260
+ const text = await resp.text();
261
+ const urls = extractUrlsFromText(text, origin);
262
+ if (urls.length === 0) return null;
263
+ return urls.map((url) => ({ url, source: "llms_txt" as const }));
264
+ }
265
+
266
+ async function tryMdSuffix(url: string): Promise<UrlEntry[] | null> {
267
+ const mdUrl = url.endsWith("/") ? `${url.slice(0, -1)}.md` : `${url}.md`;
268
+ const resp = await probe(mdUrl);
269
+ if (!resp) return null;
270
+ return [{ url: mdUrl, source: "md_suffix" as const }];
271
+ }
272
+
273
+ async function trySitemap(origin: string): Promise<UrlEntry[] | null> {
274
+ const text = await fetchText(`${origin}/sitemap.xml`, WALK_TIMEOUT);
275
+ if (!text) return null;
276
+
277
+ // Parse <loc> elements from XML
278
+ const locRegex = /<loc>\s*(.*?)\s*<\/loc>/g;
279
+ const urls: string[] = [];
280
+ let match: RegExpExecArray | null;
281
+ match = locRegex.exec(text);
282
+ while (match !== null) {
283
+ urls.push(match[1]);
284
+ match = locRegex.exec(text);
285
+ }
286
+
287
+ if (urls.length === 0) return null;
288
+ return urls.map((url) => ({ url, source: "sitemap" as const }));
289
+ }
290
+
291
+ async function tryFirecrawl(url: string, top: number): Promise<UrlEntry[] | null> {
292
+ const apiKey = process.env.FIRECRAWL_API_KEY;
293
+ if (!apiKey) return null;
294
+
295
+ try {
296
+ const controller = new AbortController();
297
+ const timer = setTimeout(() => controller.abort(), WALK_TIMEOUT);
298
+ const resp = await fetch("https://api.firecrawl.dev/v1/map", {
299
+ method: "POST",
300
+ headers: {
301
+ "Content-Type": "application/json",
302
+ Authorization: `Bearer ${apiKey}`,
303
+ },
304
+ body: JSON.stringify({ url, limit: top }),
305
+ signal: controller.signal,
306
+ });
307
+ clearTimeout(timer);
308
+
309
+ if (!resp.ok) return null;
310
+ const data = (await resp.json()) as { links?: string[] };
311
+ if (!data.links || data.links.length === 0) return null;
312
+ return data.links.map((u: string) => ({ url: u, source: "firecrawl" as const }));
313
+ } catch {
314
+ return null;
315
+ }
316
+ }
317
+
318
+ async function tryLinkWalk(url: string, origin: string): Promise<UrlEntry[] | null> {
319
+ const text = await fetchText(url, WALK_TIMEOUT);
320
+ if (!text) return null;
321
+
322
+ // Extract href values from anchor tags
323
+ const hrefRegex = /href=["']([^"']+)["']/g;
324
+ const urls: string[] = [];
325
+ let match: RegExpExecArray | null;
326
+ match = hrefRegex.exec(text);
327
+ while (match !== null) {
328
+ const href = match[1];
329
+ try {
330
+ // Resolve relative URLs
331
+ const resolved = new URL(href, url).href;
332
+ if (new URL(resolved).origin === origin) {
333
+ urls.push(resolved);
334
+ }
335
+ } catch {
336
+ // skip malformed hrefs (mailto:, javascript:, etc.)
337
+ }
338
+ match = hrefRegex.exec(text);
339
+ }
340
+
341
+ if (urls.length === 0) return null;
342
+ return urls.map((u) => ({ url: u, source: "link_walk" as const }));
343
+ }
344
+
345
// ── Main discover function ──────────────────────────────────────────

/**
 * Discover documentation pages from a URL.
 * Waterfall: llms-full.txt -> llms.txt -> .md suffix -> sitemap.xml -> Firecrawl /map -> link walk
 *
 * The first strategy returning a non-null list wins; its raw URLs are then
 * normalized/filtered via processUrls. robots.txt is checked concurrently,
 * and the root page is fetched (lazily, once) for platform detection.
 *
 * @param url - Page or site root to discover from; its origin anchors the
 *   well-known-file probes and same-origin filtering.
 * @param opts - Discovery options (filter, excludePath, top, ...).
 * @returns A schema-validated DiscoverResult (throws if validation fails).
 */
export async function discover(
  url: string,
  opts: DiscoverOpts,
): Promise<z.infer<typeof DiscoverResult>> {
  const parsed = new URL(url);
  const origin = parsed.origin;

  // Check robots.txt in parallel with discovery
  const robotsPromise = checkRobots(origin);

  // Fetch root page for platform detection (we may need it later for link_walk too)
  let rootHtml: string | null = null;
  let rootHeaders: Headers | null = null;

  // Fetches the page at most once; subsequent calls return the pair cached
  // in the closure variables above. Returns null on HTTP/network failure.
  async function getRootPage(): Promise<{ html: string; headers: Headers } | null> {
    if (rootHtml !== null && rootHeaders !== null) {
      return { html: rootHtml, headers: rootHeaders };
    }
    try {
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), WALK_TIMEOUT);
      const resp = await fetch(url, {
        headers: { "User-Agent": USER_AGENT },
        signal: controller.signal,
        redirect: "follow",
      });
      clearTimeout(timer);
      if (!resp.ok) return null;
      rootHtml = await resp.text();
      rootHeaders = resp.headers;
      return { html: rootHtml, headers: rootHeaders };
    } catch {
      return null;
    }
  }

  // Waterfall strategy
  type SourceMethod = z.infer<typeof DiscoverResult>["urls"][number]["source"];
  let rawUrls: UrlEntry[] | null = null;
  // Default stands when every strategy fails (rawUrls stays null), so an
  // empty result is still reported with source_method "link_walk".
  let sourceMethod: SourceMethod = "link_walk";

  // 1. llms-full.txt
  rawUrls = await tryLlmsFullTxt(origin);
  if (rawUrls) {
    sourceMethod = "llms_full_txt";
  }

  // 2. llms.txt
  if (!rawUrls) {
    rawUrls = await tryLlmsTxt(origin);
    if (rawUrls) sourceMethod = "llms_txt";
  }

  // 3. .md suffix
  if (!rawUrls) {
    rawUrls = await tryMdSuffix(url);
    if (rawUrls) sourceMethod = "md_suffix";
  }

  // 4. sitemap.xml
  if (!rawUrls) {
    rawUrls = await trySitemap(origin);
    if (rawUrls) sourceMethod = "sitemap";
  }

  // 5. Firecrawl
  if (!rawUrls) {
    rawUrls = await tryFirecrawl(url, opts.top ?? 20);
    if (rawUrls) sourceMethod = "firecrawl";
  }

  // 6. link_walk
  if (!rawUrls) {
    rawUrls = await tryLinkWalk(url, origin);
    if (rawUrls) sourceMethod = "link_walk";
  }

  // Process URLs (normalize, dedupe, apply filter/excludePath/top)
  const { urls, totalFound, afterDedup } = processUrls(rawUrls || [], origin, opts);

  // Platform detection
  const root = await getRootPage();
  const platform = root ? detectPlatform(root.html, root.headers) : null;

  // Robots.txt
  const robotsStatus = await robotsPromise;

  const result = {
    urls: urls.map((e) => ({ url: e.url, source: e.source })),
    source_method: sourceMethod,
    total_found: totalFound,
    after_dedup: afterDedup,
    platform_detected: platform,
    robots_txt_status: robotsStatus,
  };

  // Validate against the schema before handing the result to callers.
  return DiscoverResult.parse(result);
}
@@ -0,0 +1,214 @@
1
+ import type { z } from "zod";
2
+ import { SCHEMA_VERSION } from "../constants.js";
3
+ import type { ContentIntegrity, StructuralMetrics } from "../schemas.js";
4
+
5
+ /**
6
+ * Run the Content Integrity Pipeline on a page's content.
7
+ * Steps: Unicode NFC normalization, structural anomaly detection,
8
+ * OWASP LLM01 alignment check, structural isolation.
9
+ */
10
+ export async function assessIntegrity(
11
+ content: string,
12
+ _sourceUrl: string,
13
+ ): Promise<z.infer<typeof ContentIntegrity>> {
14
+ // Unicode NFC normalization
15
+ const _normalized = content.normalize("NFC");
16
+
17
+ // Anomaly detection deferred to Sprint 3
18
+ const flaggedAnomalies: string[] = [];
19
+
20
+ // Structural baseline: 1 - (flagged_anomalies.length * 0.1), clamped [0, 1]
21
+ const structuralBaseline = Math.max(0, Math.min(1, 1 - flaggedAnomalies.length * 0.1));
22
+
23
+ return {
24
+ unicode_normalized: true,
25
+ structural_baseline: structuralBaseline,
26
+ flagged_anomalies: flaggedAnomalies,
27
+ owasp_llm01_checked: true,
28
+ pipeline_version: SCHEMA_VERSION,
29
+ };
30
+ }
31
+
32
/** Imperative verbs that commonly start doc instruction sentences. */
// Compared (lowercased, non-letters stripped) against the FIRST word of each
// sentence by imperativeSentenceRatio; a hit marks the sentence imperative.
const IMPERATIVE_VERBS = new Set([
  "run",
  "install",
  "set",
  "create",
  "add",
  "use",
  "configure",
  "enable",
  "click",
  "open",
  "copy",
  "move",
  "delete",
  "remove",
  "update",
  "check",
  "verify",
  "ensure",
  "start",
  "stop",
  "build",
  "deploy",
  "test",
  "import",
  "export",
  "define",
  "specify",
  "select",
  "enter",
  "type",
  "navigate",
  "go",
  "download",
  "upload",
  "save",
  "load",
  "execute",
  "apply",
  "include",
  "exclude",
  "pass",
  "return",
  "call",
  "send",
  "fetch",
  "get",
  "put",
  "post",
  "patch",
  "replace",
  "merge",
  "wrap",
  "mount",
  "bind",
  "attach",
  "listen",
  "emit",
  "register",
  "subscribe",
  "publish",
  "connect",
  "disconnect",
  "initialize",
  "setup",
  "reset",
  "clear",
  "flush",
  "close",
  "shutdown",
  "restart",
  "log",
  "print",
  "debug",
  "trace",
  "monitor",
  "watch",
  "observe",
  "inspect",
  "try",
  "catch",
  "throw",
  "handle",
  "retry",
  "note",
  "see",
  "refer",
  "visit",
  "read",
  "write",
  "append",
  "prepend",
  "insert",
  "override",
  "extend",
  "implement",
  "inherit",
  "compose",
]);
132
+
133
+ /**
134
+ * Calculate structural metrics for a page's content.
135
+ */
136
+ export function calculateStructuralMetrics(content: string): z.infer<typeof StructuralMetrics> {
137
+ return {
138
+ char_entropy: charEntropy(content),
139
+ code_block_ratio: codeBlockRatio(content),
140
+ avg_section_words: avgSectionWords(content),
141
+ imperative_sentence_ratio: imperativeSentenceRatio(content),
142
+ total_tokens: Math.ceil(content.length / 4),
143
+ };
144
+ }
145
+
146
+ /** Shannon entropy of character distribution. */
147
+ function charEntropy(text: string): number {
148
+ if (text.length === 0) return 0;
149
+
150
+ const freq = new Map<string, number>();
151
+ for (const ch of text) {
152
+ freq.set(ch, (freq.get(ch) ?? 0) + 1);
153
+ }
154
+
155
+ let entropy = 0;
156
+ const len = text.length;
157
+ for (const count of freq.values()) {
158
+ const p = count / len;
159
+ if (p > 0) {
160
+ entropy -= p * Math.log2(p);
161
+ }
162
+ }
163
+ return entropy;
164
+ }
165
+
166
+ /** Ratio of characters inside ``` blocks to total characters. */
167
+ function codeBlockRatio(text: string): number {
168
+ if (text.length === 0) return 0;
169
+
170
+ let insideCode = 0;
171
+ const parts = text.split("```");
172
+ // Odd-indexed parts are inside code blocks
173
+ for (let i = 1; i < parts.length; i += 2) {
174
+ insideCode += parts[i].length;
175
+ }
176
+
177
+ return Math.max(0, Math.min(1, insideCode / text.length));
178
+ }
179
+
180
+ /** Average words per section (## or higher heading). */
181
+ function avgSectionWords(text: string): number {
182
+ const words = text.split(/\s+/).filter((w) => w.length > 0);
183
+ if (words.length === 0) return 0;
184
+
185
+ // Count headings: lines starting with # or ## (## or higher = # and ##)
186
+ const headingCount = text.split("\n").filter((line) => /^#{1,2}\s/.test(line)).length;
187
+
188
+ if (headingCount === 0) return words.length;
189
+ return words.length / headingCount;
190
+ }
191
+
192
+ /** Fraction of sentences starting with an imperative verb. */
193
+ function imperativeSentenceRatio(text: string): number {
194
+ // Split on sentence boundaries: period, exclamation, question mark followed by space or end
195
+ const sentences = text
196
+ .split(/[.!?](?:\s|$)/)
197
+ .map((s) => s.trim())
198
+ .filter((s) => s.length > 0);
199
+
200
+ if (sentences.length === 0) return 0;
201
+
202
+ let imperativeCount = 0;
203
+ for (const sentence of sentences) {
204
+ const firstWord = sentence
205
+ .split(/\s+/)[0]
206
+ ?.toLowerCase()
207
+ .replace(/[^a-z]/g, "");
208
+ if (firstWord && IMPERATIVE_VERBS.has(firstWord)) {
209
+ imperativeCount++;
210
+ }
211
+ }
212
+
213
+ return Math.max(0, Math.min(1, imperativeCount / sentences.length));
214
+ }