struth 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/COMPLIANCE.md +41 -0
- package/LICENSE +21 -0
- package/README.md +135 -0
- package/package.json +54 -0
- package/src/cli/index.ts +244 -0
- package/src/core/constants.ts +32 -0
- package/src/core/pipeline/clean.ts +246 -0
- package/src/core/pipeline/condense.ts +249 -0
- package/src/core/pipeline/discover.ts +448 -0
- package/src/core/pipeline/integrity.ts +214 -0
- package/src/core/pipeline/organize.ts +184 -0
- package/src/core/schemas.ts +204 -0
- package/src/core/spawn.ts +22 -0
- package/src/core/storage/index.ts +108 -0
- package/src/core/storage/paths.ts +40 -0
- package/src/core/types.ts +36 -0
- package/src/daemon/process.ts +95 -0
- package/src/daemon/refresh.ts +254 -0
- package/src/mcp/fts5-index.ts +114 -0
- package/src/mcp/fts5-search.ts +150 -0
- package/src/mcp/lockfile.ts +135 -0
- package/src/mcp/retrieval.ts +141 -0
- package/src/mcp/schemas.ts +12 -0
- package/src/mcp/server.ts +293 -0
- package/src/telemetry/client.ts +36 -0
- package/src/telemetry/schemas.ts +5 -0
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
import type { z } from "zod";
|
|
2
|
+
import { USER_AGENT } from "../constants.js";
|
|
3
|
+
import { DiscoverResult, type MirrorOptions } from "../schemas.js";
|
|
4
|
+
|
|
5
|
+
/**
 * The subset of MirrorOptions that drives discovery: "smart" mode plus the
 * URL filtering knobs (filter, excludePath, exclude) and the result cap (top).
 */
type DiscoverOpts = Pick<
  z.infer<typeof MirrorOptions>,
  "smart" | "filter" | "excludePath" | "exclude" | "top"
>;
|
|
9
|
+
|
|
10
|
+
// ── Non-doc file extensions to filter ───────────────────────────────
|
|
11
|
+
|
|
12
|
+
const NON_DOC_EXTENSIONS = new Set([
|
|
13
|
+
".json",
|
|
14
|
+
".xml",
|
|
15
|
+
".txt",
|
|
16
|
+
".png",
|
|
17
|
+
".jpg",
|
|
18
|
+
".svg",
|
|
19
|
+
".css",
|
|
20
|
+
".js",
|
|
21
|
+
".ico",
|
|
22
|
+
".gif",
|
|
23
|
+
".woff",
|
|
24
|
+
".woff2",
|
|
25
|
+
".ttf",
|
|
26
|
+
".eot",
|
|
27
|
+
".pdf",
|
|
28
|
+
]);
|
|
29
|
+
|
|
30
|
+
// ── Platform detection patterns ─────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
const PLATFORM_PATTERNS: Array<{
|
|
33
|
+
name: string;
|
|
34
|
+
test: (html: string, headers: Headers) => boolean;
|
|
35
|
+
}> = [
|
|
36
|
+
{
|
|
37
|
+
name: "ReadTheDocs",
|
|
38
|
+
test: (html) =>
|
|
39
|
+
html.includes("readthedocs") || html.includes("Read the Docs") || html.includes("rtd.css"),
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
name: "GitBook",
|
|
43
|
+
test: (html, headers) =>
|
|
44
|
+
html.includes("gitbook") || headers.get("x-served-by")?.includes("gitbook") === true,
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
name: "Docusaurus",
|
|
48
|
+
test: (html) => html.includes("docusaurus") || html.includes("__docusaurus"),
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
name: "MkDocs",
|
|
52
|
+
test: (html) => html.includes("mkdocs") || html.includes("MkDocs"),
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
name: "Sphinx",
|
|
56
|
+
test: (html) =>
|
|
57
|
+
html.includes("sphinx") || html.includes("Sphinx") || html.includes("_sphinx_javascript"),
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
name: "Mintlify",
|
|
61
|
+
test: (html) => html.includes("mintlify"),
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
name: "Nextra",
|
|
65
|
+
test: (html) => html.includes("nextra"),
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
name: "VitePress",
|
|
69
|
+
test: (html) => html.includes("vitepress") || html.includes("VitePress"),
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
name: "mdBook",
|
|
73
|
+
test: (html) => html.includes("mdbook") || html.includes("mdBook"),
|
|
74
|
+
},
|
|
75
|
+
];
|
|
76
|
+
|
|
77
|
+
// ── Timeouts ────────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
// Milliseconds before aborting a lightweight existence probe (llms.txt, .md twin).
const PROBE_TIMEOUT = 5_000;
// Milliseconds before aborting a heavier fetch (pages, sitemaps, Firecrawl).
const WALK_TIMEOUT = 10_000;
|
|
81
|
+
|
|
82
|
+
// ── Fetch helpers ───────────────────────────────────────────────────
|
|
83
|
+
|
|
84
|
+
async function probe(url: string, timeout = PROBE_TIMEOUT): Promise<Response | null> {
|
|
85
|
+
try {
|
|
86
|
+
const controller = new AbortController();
|
|
87
|
+
const timer = setTimeout(() => controller.abort(), timeout);
|
|
88
|
+
const resp = await fetch(url, {
|
|
89
|
+
headers: { "User-Agent": USER_AGENT },
|
|
90
|
+
signal: controller.signal,
|
|
91
|
+
redirect: "follow",
|
|
92
|
+
});
|
|
93
|
+
clearTimeout(timer);
|
|
94
|
+
return resp.ok ? resp : null;
|
|
95
|
+
} catch {
|
|
96
|
+
return null;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
async function fetchText(url: string, timeout = WALK_TIMEOUT): Promise<string | null> {
|
|
101
|
+
try {
|
|
102
|
+
const controller = new AbortController();
|
|
103
|
+
const timer = setTimeout(() => controller.abort(), timeout);
|
|
104
|
+
const resp = await fetch(url, {
|
|
105
|
+
headers: { "User-Agent": USER_AGENT },
|
|
106
|
+
signal: controller.signal,
|
|
107
|
+
redirect: "follow",
|
|
108
|
+
});
|
|
109
|
+
clearTimeout(timer);
|
|
110
|
+
if (!resp.ok) return null;
|
|
111
|
+
return await resp.text();
|
|
112
|
+
} catch {
|
|
113
|
+
return null;
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// ── URL processing ──────────────────────────────────────────────────
|
|
118
|
+
|
|
119
|
+
function normalizeUrl(raw: string): string {
|
|
120
|
+
try {
|
|
121
|
+
const parsed = new URL(raw);
|
|
122
|
+
parsed.hash = ""; // strip fragments
|
|
123
|
+
// normalize trailing slash: remove if path has content beyond /
|
|
124
|
+
if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) {
|
|
125
|
+
parsed.pathname = parsed.pathname.slice(0, -1);
|
|
126
|
+
}
|
|
127
|
+
return parsed.href;
|
|
128
|
+
} catch {
|
|
129
|
+
return raw;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
function isDocUrl(urlStr: string): boolean {
|
|
134
|
+
try {
|
|
135
|
+
const parsed = new URL(urlStr);
|
|
136
|
+
const ext = parsed.pathname.match(/\.[a-z0-9]+$/i)?.[0]?.toLowerCase();
|
|
137
|
+
if (ext && NON_DOC_EXTENSIONS.has(ext)) return false;
|
|
138
|
+
return true;
|
|
139
|
+
} catch {
|
|
140
|
+
return false;
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
function isSameOrigin(urlStr: string, origin: string): boolean {
|
|
145
|
+
try {
|
|
146
|
+
const parsed = new URL(urlStr);
|
|
147
|
+
return parsed.origin === origin;
|
|
148
|
+
} catch {
|
|
149
|
+
return false;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/** A discovered URL tagged with the strategy that found it. */
type UrlEntry = { url: string; source: z.infer<typeof DiscoverResult>["urls"][number]["source"] };
|
|
154
|
+
|
|
155
|
+
function processUrls(
|
|
156
|
+
raw: UrlEntry[],
|
|
157
|
+
origin: string,
|
|
158
|
+
opts: DiscoverOpts,
|
|
159
|
+
): { urls: UrlEntry[]; totalFound: number; afterDedup: number } {
|
|
160
|
+
const totalFound = raw.length;
|
|
161
|
+
|
|
162
|
+
// Normalize, filter non-doc, filter external
|
|
163
|
+
let urls = raw
|
|
164
|
+
.map((e) => ({ ...e, url: normalizeUrl(e.url) }))
|
|
165
|
+
.filter((e) => isDocUrl(e.url))
|
|
166
|
+
.filter((e) => isSameOrigin(e.url, origin));
|
|
167
|
+
|
|
168
|
+
// Deduplicate
|
|
169
|
+
const seen = new Set<string>();
|
|
170
|
+
urls = urls.filter((e) => {
|
|
171
|
+
if (seen.has(e.url)) return false;
|
|
172
|
+
seen.add(e.url);
|
|
173
|
+
return true;
|
|
174
|
+
});
|
|
175
|
+
|
|
176
|
+
const afterDedup = urls.length;
|
|
177
|
+
|
|
178
|
+
// Apply excludePath patterns
|
|
179
|
+
if (opts.excludePath && opts.excludePath.length > 0) {
|
|
180
|
+
urls = urls.filter((e) => {
|
|
181
|
+
const path = new URL(e.url).pathname;
|
|
182
|
+
return !opts.excludePath.some((pat) => path.includes(pat));
|
|
183
|
+
});
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// Apply filter (keep ONLY matching)
|
|
187
|
+
if (opts.filter) {
|
|
188
|
+
const filterStr = opts.filter;
|
|
189
|
+
urls = urls.filter((e) => e.url.includes(filterStr));
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// Apply top limit
|
|
193
|
+
if (opts.top && opts.top > 0) {
|
|
194
|
+
urls = urls.slice(0, opts.top);
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
return { urls, totalFound, afterDedup };
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// ── Robots.txt check ────────────────────────────────────────────────
|
|
201
|
+
|
|
202
|
+
async function checkRobots(origin: string): Promise<"allowed" | "blocked" | "no_robots_txt"> {
|
|
203
|
+
const text = await fetchText(`${origin}/robots.txt`, PROBE_TIMEOUT);
|
|
204
|
+
if (text === null) return "no_robots_txt";
|
|
205
|
+
|
|
206
|
+
// Parse robots.txt for Struth-Bot rules
|
|
207
|
+
const lines = text.split("\n").map((l) => l.trim());
|
|
208
|
+
let inStruthBlock = false;
|
|
209
|
+
|
|
210
|
+
for (const line of lines) {
|
|
211
|
+
const lower = line.toLowerCase();
|
|
212
|
+
if (lower.startsWith("user-agent:")) {
|
|
213
|
+
const agent = lower.slice("user-agent:".length).trim();
|
|
214
|
+
inStruthBlock = agent === "struth-bot" || agent === "struth-bot/0.1";
|
|
215
|
+
} else if (inStruthBlock && lower.startsWith("disallow:")) {
|
|
216
|
+
const path = lower.slice("disallow:".length).trim();
|
|
217
|
+
if (path === "/" || path === "/*") return "blocked";
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
return "allowed";
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
// ── Platform detection ──────────────────────────────────────────────
|
|
225
|
+
|
|
226
|
+
function detectPlatform(html: string, headers: Headers): string | null {
|
|
227
|
+
for (const { name, test } of PLATFORM_PATTERNS) {
|
|
228
|
+
if (test(html, headers)) return name;
|
|
229
|
+
}
|
|
230
|
+
return null;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
// ── Discovery strategies ────────────────────────────────────────────
|
|
234
|
+
|
|
235
|
+
function extractUrlsFromText(text: string, origin: string): string[] {
|
|
236
|
+
// Extract URLs that look like they belong to the same docs site
|
|
237
|
+
const urlRegex = /https?:\/\/[^\s<>"')\]]+/g;
|
|
238
|
+
const matches = text.match(urlRegex) || [];
|
|
239
|
+
return matches.filter((u) => {
|
|
240
|
+
try {
|
|
241
|
+
return new URL(u).origin === origin;
|
|
242
|
+
} catch {
|
|
243
|
+
return false;
|
|
244
|
+
}
|
|
245
|
+
});
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
async function tryLlmsFullTxt(origin: string): Promise<UrlEntry[] | null> {
|
|
249
|
+
const resp = await probe(`${origin}/llms-full.txt`);
|
|
250
|
+
if (!resp) return null;
|
|
251
|
+
const text = await resp.text();
|
|
252
|
+
const urls = extractUrlsFromText(text, origin);
|
|
253
|
+
if (urls.length === 0) return null;
|
|
254
|
+
return urls.map((url) => ({ url, source: "llms_full_txt" as const }));
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
async function tryLlmsTxt(origin: string): Promise<UrlEntry[] | null> {
|
|
258
|
+
const resp = await probe(`${origin}/llms.txt`);
|
|
259
|
+
if (!resp) return null;
|
|
260
|
+
const text = await resp.text();
|
|
261
|
+
const urls = extractUrlsFromText(text, origin);
|
|
262
|
+
if (urls.length === 0) return null;
|
|
263
|
+
return urls.map((url) => ({ url, source: "llms_txt" as const }));
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
async function tryMdSuffix(url: string): Promise<UrlEntry[] | null> {
|
|
267
|
+
const mdUrl = url.endsWith("/") ? `${url.slice(0, -1)}.md` : `${url}.md`;
|
|
268
|
+
const resp = await probe(mdUrl);
|
|
269
|
+
if (!resp) return null;
|
|
270
|
+
return [{ url: mdUrl, source: "md_suffix" as const }];
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
async function trySitemap(origin: string): Promise<UrlEntry[] | null> {
|
|
274
|
+
const text = await fetchText(`${origin}/sitemap.xml`, WALK_TIMEOUT);
|
|
275
|
+
if (!text) return null;
|
|
276
|
+
|
|
277
|
+
// Parse <loc> elements from XML
|
|
278
|
+
const locRegex = /<loc>\s*(.*?)\s*<\/loc>/g;
|
|
279
|
+
const urls: string[] = [];
|
|
280
|
+
let match: RegExpExecArray | null;
|
|
281
|
+
match = locRegex.exec(text);
|
|
282
|
+
while (match !== null) {
|
|
283
|
+
urls.push(match[1]);
|
|
284
|
+
match = locRegex.exec(text);
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
if (urls.length === 0) return null;
|
|
288
|
+
return urls.map((url) => ({ url, source: "sitemap" as const }));
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
async function tryFirecrawl(url: string, top: number): Promise<UrlEntry[] | null> {
|
|
292
|
+
const apiKey = process.env.FIRECRAWL_API_KEY;
|
|
293
|
+
if (!apiKey) return null;
|
|
294
|
+
|
|
295
|
+
try {
|
|
296
|
+
const controller = new AbortController();
|
|
297
|
+
const timer = setTimeout(() => controller.abort(), WALK_TIMEOUT);
|
|
298
|
+
const resp = await fetch("https://api.firecrawl.dev/v1/map", {
|
|
299
|
+
method: "POST",
|
|
300
|
+
headers: {
|
|
301
|
+
"Content-Type": "application/json",
|
|
302
|
+
Authorization: `Bearer ${apiKey}`,
|
|
303
|
+
},
|
|
304
|
+
body: JSON.stringify({ url, limit: top }),
|
|
305
|
+
signal: controller.signal,
|
|
306
|
+
});
|
|
307
|
+
clearTimeout(timer);
|
|
308
|
+
|
|
309
|
+
if (!resp.ok) return null;
|
|
310
|
+
const data = (await resp.json()) as { links?: string[] };
|
|
311
|
+
if (!data.links || data.links.length === 0) return null;
|
|
312
|
+
return data.links.map((u: string) => ({ url: u, source: "firecrawl" as const }));
|
|
313
|
+
} catch {
|
|
314
|
+
return null;
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
async function tryLinkWalk(url: string, origin: string): Promise<UrlEntry[] | null> {
|
|
319
|
+
const text = await fetchText(url, WALK_TIMEOUT);
|
|
320
|
+
if (!text) return null;
|
|
321
|
+
|
|
322
|
+
// Extract href values from anchor tags
|
|
323
|
+
const hrefRegex = /href=["']([^"']+)["']/g;
|
|
324
|
+
const urls: string[] = [];
|
|
325
|
+
let match: RegExpExecArray | null;
|
|
326
|
+
match = hrefRegex.exec(text);
|
|
327
|
+
while (match !== null) {
|
|
328
|
+
const href = match[1];
|
|
329
|
+
try {
|
|
330
|
+
// Resolve relative URLs
|
|
331
|
+
const resolved = new URL(href, url).href;
|
|
332
|
+
if (new URL(resolved).origin === origin) {
|
|
333
|
+
urls.push(resolved);
|
|
334
|
+
}
|
|
335
|
+
} catch {
|
|
336
|
+
// skip malformed hrefs (mailto:, javascript:, etc.)
|
|
337
|
+
}
|
|
338
|
+
match = hrefRegex.exec(text);
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
if (urls.length === 0) return null;
|
|
342
|
+
return urls.map((u) => ({ url: u, source: "link_walk" as const }));
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
// ── Main discover function ──────────────────────────────────────────
|
|
346
|
+
|
|
347
|
+
/**
|
|
348
|
+
* Discover documentation pages from a URL.
|
|
349
|
+
* Waterfall: llms-full.txt -> llms.txt -> .md suffix -> sitemap.xml -> Firecrawl /map -> link walk
|
|
350
|
+
*/
|
|
351
|
+
export async function discover(
|
|
352
|
+
url: string,
|
|
353
|
+
opts: DiscoverOpts,
|
|
354
|
+
): Promise<z.infer<typeof DiscoverResult>> {
|
|
355
|
+
const parsed = new URL(url);
|
|
356
|
+
const origin = parsed.origin;
|
|
357
|
+
|
|
358
|
+
// Check robots.txt in parallel with discovery
|
|
359
|
+
const robotsPromise = checkRobots(origin);
|
|
360
|
+
|
|
361
|
+
// Fetch root page for platform detection (we may need it later for link_walk too)
|
|
362
|
+
let rootHtml: string | null = null;
|
|
363
|
+
let rootHeaders: Headers | null = null;
|
|
364
|
+
|
|
365
|
+
async function getRootPage(): Promise<{ html: string; headers: Headers } | null> {
|
|
366
|
+
if (rootHtml !== null && rootHeaders !== null) {
|
|
367
|
+
return { html: rootHtml, headers: rootHeaders };
|
|
368
|
+
}
|
|
369
|
+
try {
|
|
370
|
+
const controller = new AbortController();
|
|
371
|
+
const timer = setTimeout(() => controller.abort(), WALK_TIMEOUT);
|
|
372
|
+
const resp = await fetch(url, {
|
|
373
|
+
headers: { "User-Agent": USER_AGENT },
|
|
374
|
+
signal: controller.signal,
|
|
375
|
+
redirect: "follow",
|
|
376
|
+
});
|
|
377
|
+
clearTimeout(timer);
|
|
378
|
+
if (!resp.ok) return null;
|
|
379
|
+
rootHtml = await resp.text();
|
|
380
|
+
rootHeaders = resp.headers;
|
|
381
|
+
return { html: rootHtml, headers: rootHeaders };
|
|
382
|
+
} catch {
|
|
383
|
+
return null;
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
// Waterfall strategy
|
|
388
|
+
type SourceMethod = z.infer<typeof DiscoverResult>["urls"][number]["source"];
|
|
389
|
+
let rawUrls: UrlEntry[] | null = null;
|
|
390
|
+
let sourceMethod: SourceMethod = "link_walk";
|
|
391
|
+
|
|
392
|
+
// 1. llms-full.txt
|
|
393
|
+
rawUrls = await tryLlmsFullTxt(origin);
|
|
394
|
+
if (rawUrls) {
|
|
395
|
+
sourceMethod = "llms_full_txt";
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// 2. llms.txt
|
|
399
|
+
if (!rawUrls) {
|
|
400
|
+
rawUrls = await tryLlmsTxt(origin);
|
|
401
|
+
if (rawUrls) sourceMethod = "llms_txt";
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
// 3. .md suffix
|
|
405
|
+
if (!rawUrls) {
|
|
406
|
+
rawUrls = await tryMdSuffix(url);
|
|
407
|
+
if (rawUrls) sourceMethod = "md_suffix";
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
// 4. sitemap.xml
|
|
411
|
+
if (!rawUrls) {
|
|
412
|
+
rawUrls = await trySitemap(origin);
|
|
413
|
+
if (rawUrls) sourceMethod = "sitemap";
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
// 5. Firecrawl
|
|
417
|
+
if (!rawUrls) {
|
|
418
|
+
rawUrls = await tryFirecrawl(url, opts.top ?? 20);
|
|
419
|
+
if (rawUrls) sourceMethod = "firecrawl";
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
// 6. link_walk
|
|
423
|
+
if (!rawUrls) {
|
|
424
|
+
rawUrls = await tryLinkWalk(url, origin);
|
|
425
|
+
if (rawUrls) sourceMethod = "link_walk";
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
// Process URLs
|
|
429
|
+
const { urls, totalFound, afterDedup } = processUrls(rawUrls || [], origin, opts);
|
|
430
|
+
|
|
431
|
+
// Platform detection
|
|
432
|
+
const root = await getRootPage();
|
|
433
|
+
const platform = root ? detectPlatform(root.html, root.headers) : null;
|
|
434
|
+
|
|
435
|
+
// Robots.txt
|
|
436
|
+
const robotsStatus = await robotsPromise;
|
|
437
|
+
|
|
438
|
+
const result = {
|
|
439
|
+
urls: urls.map((e) => ({ url: e.url, source: e.source })),
|
|
440
|
+
source_method: sourceMethod,
|
|
441
|
+
total_found: totalFound,
|
|
442
|
+
after_dedup: afterDedup,
|
|
443
|
+
platform_detected: platform,
|
|
444
|
+
robots_txt_status: robotsStatus,
|
|
445
|
+
};
|
|
446
|
+
|
|
447
|
+
return DiscoverResult.parse(result);
|
|
448
|
+
}
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import type { z } from "zod";
|
|
2
|
+
import { SCHEMA_VERSION } from "../constants.js";
|
|
3
|
+
import type { ContentIntegrity, StructuralMetrics } from "../schemas.js";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Run the Content Integrity Pipeline on a page's content.
|
|
7
|
+
* Steps: Unicode NFC normalization, structural anomaly detection,
|
|
8
|
+
* OWASP LLM01 alignment check, structural isolation.
|
|
9
|
+
*/
|
|
10
|
+
export async function assessIntegrity(
|
|
11
|
+
content: string,
|
|
12
|
+
_sourceUrl: string,
|
|
13
|
+
): Promise<z.infer<typeof ContentIntegrity>> {
|
|
14
|
+
// Unicode NFC normalization
|
|
15
|
+
const _normalized = content.normalize("NFC");
|
|
16
|
+
|
|
17
|
+
// Anomaly detection deferred to Sprint 3
|
|
18
|
+
const flaggedAnomalies: string[] = [];
|
|
19
|
+
|
|
20
|
+
// Structural baseline: 1 - (flagged_anomalies.length * 0.1), clamped [0, 1]
|
|
21
|
+
const structuralBaseline = Math.max(0, Math.min(1, 1 - flaggedAnomalies.length * 0.1));
|
|
22
|
+
|
|
23
|
+
return {
|
|
24
|
+
unicode_normalized: true,
|
|
25
|
+
structural_baseline: structuralBaseline,
|
|
26
|
+
flagged_anomalies: flaggedAnomalies,
|
|
27
|
+
owasp_llm01_checked: true,
|
|
28
|
+
pipeline_version: SCHEMA_VERSION,
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/** Imperative verbs that commonly start doc instruction sentences. */
|
|
33
|
+
const IMPERATIVE_VERBS = new Set([
|
|
34
|
+
"run",
|
|
35
|
+
"install",
|
|
36
|
+
"set",
|
|
37
|
+
"create",
|
|
38
|
+
"add",
|
|
39
|
+
"use",
|
|
40
|
+
"configure",
|
|
41
|
+
"enable",
|
|
42
|
+
"click",
|
|
43
|
+
"open",
|
|
44
|
+
"copy",
|
|
45
|
+
"move",
|
|
46
|
+
"delete",
|
|
47
|
+
"remove",
|
|
48
|
+
"update",
|
|
49
|
+
"check",
|
|
50
|
+
"verify",
|
|
51
|
+
"ensure",
|
|
52
|
+
"start",
|
|
53
|
+
"stop",
|
|
54
|
+
"build",
|
|
55
|
+
"deploy",
|
|
56
|
+
"test",
|
|
57
|
+
"import",
|
|
58
|
+
"export",
|
|
59
|
+
"define",
|
|
60
|
+
"specify",
|
|
61
|
+
"select",
|
|
62
|
+
"enter",
|
|
63
|
+
"type",
|
|
64
|
+
"navigate",
|
|
65
|
+
"go",
|
|
66
|
+
"download",
|
|
67
|
+
"upload",
|
|
68
|
+
"save",
|
|
69
|
+
"load",
|
|
70
|
+
"execute",
|
|
71
|
+
"apply",
|
|
72
|
+
"include",
|
|
73
|
+
"exclude",
|
|
74
|
+
"pass",
|
|
75
|
+
"return",
|
|
76
|
+
"call",
|
|
77
|
+
"send",
|
|
78
|
+
"fetch",
|
|
79
|
+
"get",
|
|
80
|
+
"put",
|
|
81
|
+
"post",
|
|
82
|
+
"patch",
|
|
83
|
+
"replace",
|
|
84
|
+
"merge",
|
|
85
|
+
"wrap",
|
|
86
|
+
"mount",
|
|
87
|
+
"bind",
|
|
88
|
+
"attach",
|
|
89
|
+
"listen",
|
|
90
|
+
"emit",
|
|
91
|
+
"register",
|
|
92
|
+
"subscribe",
|
|
93
|
+
"publish",
|
|
94
|
+
"connect",
|
|
95
|
+
"disconnect",
|
|
96
|
+
"initialize",
|
|
97
|
+
"setup",
|
|
98
|
+
"reset",
|
|
99
|
+
"clear",
|
|
100
|
+
"flush",
|
|
101
|
+
"close",
|
|
102
|
+
"shutdown",
|
|
103
|
+
"restart",
|
|
104
|
+
"log",
|
|
105
|
+
"print",
|
|
106
|
+
"debug",
|
|
107
|
+
"trace",
|
|
108
|
+
"monitor",
|
|
109
|
+
"watch",
|
|
110
|
+
"observe",
|
|
111
|
+
"inspect",
|
|
112
|
+
"try",
|
|
113
|
+
"catch",
|
|
114
|
+
"throw",
|
|
115
|
+
"handle",
|
|
116
|
+
"retry",
|
|
117
|
+
"note",
|
|
118
|
+
"see",
|
|
119
|
+
"refer",
|
|
120
|
+
"visit",
|
|
121
|
+
"read",
|
|
122
|
+
"write",
|
|
123
|
+
"append",
|
|
124
|
+
"prepend",
|
|
125
|
+
"insert",
|
|
126
|
+
"override",
|
|
127
|
+
"extend",
|
|
128
|
+
"implement",
|
|
129
|
+
"inherit",
|
|
130
|
+
"compose",
|
|
131
|
+
]);
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Calculate structural metrics for a page's content.
|
|
135
|
+
*/
|
|
136
|
+
export function calculateStructuralMetrics(content: string): z.infer<typeof StructuralMetrics> {
|
|
137
|
+
return {
|
|
138
|
+
char_entropy: charEntropy(content),
|
|
139
|
+
code_block_ratio: codeBlockRatio(content),
|
|
140
|
+
avg_section_words: avgSectionWords(content),
|
|
141
|
+
imperative_sentence_ratio: imperativeSentenceRatio(content),
|
|
142
|
+
total_tokens: Math.ceil(content.length / 4),
|
|
143
|
+
};
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/** Shannon entropy of character distribution. */
|
|
147
|
+
function charEntropy(text: string): number {
|
|
148
|
+
if (text.length === 0) return 0;
|
|
149
|
+
|
|
150
|
+
const freq = new Map<string, number>();
|
|
151
|
+
for (const ch of text) {
|
|
152
|
+
freq.set(ch, (freq.get(ch) ?? 0) + 1);
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
let entropy = 0;
|
|
156
|
+
const len = text.length;
|
|
157
|
+
for (const count of freq.values()) {
|
|
158
|
+
const p = count / len;
|
|
159
|
+
if (p > 0) {
|
|
160
|
+
entropy -= p * Math.log2(p);
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
return entropy;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
/** Ratio of characters inside ``` blocks to total characters. */
|
|
167
|
+
function codeBlockRatio(text: string): number {
|
|
168
|
+
if (text.length === 0) return 0;
|
|
169
|
+
|
|
170
|
+
let insideCode = 0;
|
|
171
|
+
const parts = text.split("```");
|
|
172
|
+
// Odd-indexed parts are inside code blocks
|
|
173
|
+
for (let i = 1; i < parts.length; i += 2) {
|
|
174
|
+
insideCode += parts[i].length;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
return Math.max(0, Math.min(1, insideCode / text.length));
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/** Average words per section (## or higher heading). */
|
|
181
|
+
function avgSectionWords(text: string): number {
|
|
182
|
+
const words = text.split(/\s+/).filter((w) => w.length > 0);
|
|
183
|
+
if (words.length === 0) return 0;
|
|
184
|
+
|
|
185
|
+
// Count headings: lines starting with # or ## (## or higher = # and ##)
|
|
186
|
+
const headingCount = text.split("\n").filter((line) => /^#{1,2}\s/.test(line)).length;
|
|
187
|
+
|
|
188
|
+
if (headingCount === 0) return words.length;
|
|
189
|
+
return words.length / headingCount;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/** Fraction of sentences starting with an imperative verb. */
|
|
193
|
+
function imperativeSentenceRatio(text: string): number {
|
|
194
|
+
// Split on sentence boundaries: period, exclamation, question mark followed by space or end
|
|
195
|
+
const sentences = text
|
|
196
|
+
.split(/[.!?](?:\s|$)/)
|
|
197
|
+
.map((s) => s.trim())
|
|
198
|
+
.filter((s) => s.length > 0);
|
|
199
|
+
|
|
200
|
+
if (sentences.length === 0) return 0;
|
|
201
|
+
|
|
202
|
+
let imperativeCount = 0;
|
|
203
|
+
for (const sentence of sentences) {
|
|
204
|
+
const firstWord = sentence
|
|
205
|
+
.split(/\s+/)[0]
|
|
206
|
+
?.toLowerCase()
|
|
207
|
+
.replace(/[^a-z]/g, "");
|
|
208
|
+
if (firstWord && IMPERATIVE_VERBS.has(firstWord)) {
|
|
209
|
+
imperativeCount++;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
return Math.max(0, Math.min(1, imperativeCount / sentences.length));
|
|
214
|
+
}
|