@pseolint/core 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/adapters/index.d.ts +53 -0
- package/dist/ai/adapters/index.d.ts.map +1 -0
- package/dist/ai/adapters/index.js +158 -0
- package/dist/ai/adapters/index.js.map +1 -0
- package/dist/ai/cache.d.ts +11 -0
- package/dist/ai/cache.d.ts.map +1 -0
- package/dist/ai/cache.js +40 -0
- package/dist/ai/cache.js.map +1 -0
- package/dist/ai/cost.d.ts +3 -0
- package/dist/ai/cost.d.ts.map +1 -0
- package/dist/ai/cost.js +22 -0
- package/dist/ai/cost.js.map +1 -0
- package/dist/ai/feedback-prompt.d.ts +22 -0
- package/dist/ai/feedback-prompt.d.ts.map +1 -0
- package/dist/ai/feedback-prompt.js +39 -0
- package/dist/ai/feedback-prompt.js.map +1 -0
- package/dist/ai/prompt.d.ts +10 -0
- package/dist/ai/prompt.d.ts.map +1 -0
- package/dist/ai/prompt.js +51 -0
- package/dist/ai/prompt.js.map +1 -0
- package/dist/ai/triage.d.ts +28 -0
- package/dist/ai/triage.d.ts.map +1 -0
- package/dist/ai/triage.js +136 -0
- package/dist/ai/triage.js.map +1 -0
- package/dist/ai/types.d.ts +27 -0
- package/dist/ai/types.d.ts.map +1 -0
- package/dist/ai/types.js +2 -0
- package/dist/ai/types.js.map +1 -0
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +399 -83
- package/dist/auditor.js.map +1 -1
- package/dist/cache.d.ts +44 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +182 -0
- package/dist/cache.js.map +1 -0
- package/dist/data-source-loader.d.ts +14 -0
- package/dist/data-source-loader.d.ts.map +1 -0
- package/dist/data-source-loader.js +76 -0
- package/dist/data-source-loader.js.map +1 -0
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +4 -0
- package/dist/enrich-findings.js.map +1 -1
- package/dist/formatters/console.d.ts.map +1 -1
- package/dist/formatters/console.js +30 -0
- package/dist/formatters/console.js.map +1 -1
- package/dist/formatters/html.d.ts.map +1 -1
- package/dist/formatters/html.js +92 -70
- package/dist/formatters/html.js.map +1 -1
- package/dist/formatters/markdown.d.ts.map +1 -1
- package/dist/formatters/markdown.js +29 -0
- package/dist/formatters/markdown.js.map +1 -1
- package/dist/index.d.ts +20 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +11 -0
- package/dist/index.js.map +1 -1
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +3 -0
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/data/data-binding.d.ts +4 -0
- package/dist/rules/data/data-binding.d.ts.map +1 -0
- package/dist/rules/data/data-binding.js +107 -0
- package/dist/rules/data/data-binding.js.map +1 -0
- package/dist/rules/tech/robots-sitemap-presence.d.ts +2 -1
- package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.js +101 -0
- package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
- package/dist/rules/tech/sitemap-completeness.d.ts.map +1 -1
- package/dist/rules/tech/sitemap-completeness.js +15 -0
- package/dist/rules/tech/sitemap-completeness.js.map +1 -1
- package/dist/state.d.ts +35 -0
- package/dist/state.d.ts.map +1 -0
- package/dist/state.js +64 -0
- package/dist/state.js.map +1 -0
- package/dist/stratified-sample.d.ts +3 -0
- package/dist/stratified-sample.d.ts.map +1 -0
- package/dist/stratified-sample.js +88 -0
- package/dist/stratified-sample.js.map +1 -0
- package/dist/telemetry/aggregator.d.ts +47 -0
- package/dist/telemetry/aggregator.d.ts.map +1 -0
- package/dist/telemetry/aggregator.js +77 -0
- package/dist/telemetry/aggregator.js.map +1 -0
- package/dist/telemetry/index.d.ts +5 -0
- package/dist/telemetry/index.d.ts.map +1 -0
- package/dist/telemetry/index.js +5 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/telemetry/reader.d.ts +12 -0
- package/dist/telemetry/reader.d.ts.map +1 -0
- package/dist/telemetry/reader.js +35 -0
- package/dist/telemetry/reader.js.map +1 -0
- package/dist/telemetry/types.d.ts +126 -0
- package/dist/telemetry/types.d.ts.map +1 -0
- package/dist/telemetry/types.js +75 -0
- package/dist/telemetry/types.js.map +1 -0
- package/dist/telemetry/writer.d.ts +12 -0
- package/dist/telemetry/writer.d.ts.map +1 -0
- package/dist/telemetry/writer.js +38 -0
- package/dist/telemetry/writer.js.map +1 -0
- package/dist/types.d.ts +96 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +26 -6
- package/dist/algorithms/entity-mask.test.d.ts +0 -2
- package/dist/algorithms/entity-mask.test.d.ts.map +0 -1
- package/dist/algorithms/entity-mask.test.js +0 -23
- package/dist/algorithms/entity-mask.test.js.map +0 -1
- package/dist/algorithms/simhash.test.d.ts +0 -2
- package/dist/algorithms/simhash.test.d.ts.map +0 -1
- package/dist/algorithms/simhash.test.js +0 -23
- package/dist/algorithms/simhash.test.js.map +0 -1
- package/dist/auditor.test.d.ts +0 -2
- package/dist/auditor.test.d.ts.map +0 -1
- package/dist/auditor.test.js +0 -134
- package/dist/auditor.test.js.map +0 -1
- package/dist/parser.test.d.ts +0 -2
- package/dist/parser.test.d.ts.map +0 -1
- package/dist/parser.test.js +0 -37
- package/dist/parser.test.js.map +0 -1
package/dist/auditor.js
CHANGED
|
@@ -26,6 +26,7 @@ import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
|
26
26
|
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
27
27
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
28
28
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
29
|
+
import { robotsComplianceRule } from "./rules/tech/robots-sitemap-presence.js";
|
|
29
30
|
import { redirectChainRule } from "./rules/tech/redirect-chain.js";
|
|
30
31
|
import { soft404Rule } from "./rules/tech/soft-404.js";
|
|
31
32
|
import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
|
|
@@ -35,9 +36,17 @@ import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
|
|
|
35
36
|
import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
|
|
36
37
|
import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
|
|
37
38
|
import { templateCoverageRule } from "./rules/spam/template-coverage.js";
|
|
39
|
+
import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
|
|
38
40
|
import { classifyPages, isRuleEnabled } from "./page-classifier.js";
|
|
39
41
|
import { RULE_REFERENCES } from "./rule-references.js";
|
|
40
42
|
import { enrichFindings } from "./enrich-findings.js";
|
|
43
|
+
import { triageFindings } from "./ai/triage.js";
|
|
44
|
+
import { createLanguageModel } from "./ai/adapters/index.js";
|
|
45
|
+
import { promptTriageFeedback } from "./ai/feedback-prompt.js";
|
|
46
|
+
import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
|
|
47
|
+
import { cachedFetch } from "./cache.js";
|
|
48
|
+
import { stratifiedSample } from "./stratified-sample.js";
|
|
49
|
+
import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
|
|
41
50
|
const DEFAULTS = {
|
|
42
51
|
nearDuplicateThreshold: 0.85,
|
|
43
52
|
entitySwapThreshold: 0.95,
|
|
@@ -84,13 +93,17 @@ function resolveGroupRules(baseRules, overrides) {
|
|
|
84
93
|
}
|
|
85
94
|
return result;
|
|
86
95
|
}
|
|
87
|
-
function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns) {
|
|
96
|
+
function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides) {
|
|
88
97
|
const findings = [];
|
|
89
|
-
const tag = (results) => results.map((r) =>
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
98
|
+
const tag = (results) => results.map((r) => {
|
|
99
|
+
const override = overrides?.[r.ruleId];
|
|
100
|
+
return {
|
|
101
|
+
...r,
|
|
102
|
+
group: groupName === "__default" ? undefined : groupName,
|
|
103
|
+
ref: r.ref ?? RULE_REFERENCES[r.ruleId],
|
|
104
|
+
...(override?.severity ? { severity: override.severity } : {}),
|
|
105
|
+
};
|
|
106
|
+
});
|
|
94
107
|
// Spam rules — always compute cross-page data, only push findings if enabled
|
|
95
108
|
const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
|
|
96
109
|
if (isEnabled("spam/near-duplicate")) {
|
|
@@ -257,79 +270,57 @@ async function collectHtmlFiles(directory) {
|
|
|
257
270
|
}));
|
|
258
271
|
return files.flat();
|
|
259
272
|
}
|
|
260
|
-
async function fetchWithRetry(url, timeoutMs) {
|
|
273
|
+
async function fetchWithRetry(url, timeoutMs, cache, stats) {
|
|
261
274
|
try {
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
275
|
+
stats.total += 1;
|
|
276
|
+
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
277
|
+
if (r.fromCache) {
|
|
278
|
+
stats.hits += 1;
|
|
279
|
+
stats.bytesSavedEstimate += r.body.length;
|
|
265
280
|
}
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
};
|
|
281
|
+
if (r.status < 200 || r.status >= 300)
|
|
282
|
+
return null;
|
|
283
|
+
return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
|
|
270
284
|
}
|
|
271
285
|
catch {
|
|
272
286
|
return null;
|
|
273
287
|
}
|
|
274
288
|
}
|
|
275
|
-
async function fetchPageWithMeta(url, timeoutMs) {
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
redirect: "manual",
|
|
283
|
-
signal: AbortSignal.timeout(timeoutMs),
|
|
284
|
-
});
|
|
285
|
-
}
|
|
286
|
-
catch {
|
|
287
|
-
return null;
|
|
288
|
-
}
|
|
289
|
-
const status = response.status;
|
|
290
|
-
if (status >= 300 && status < 400) {
|
|
291
|
-
const location = response.headers.get("location");
|
|
292
|
-
if (!location)
|
|
293
|
-
break;
|
|
294
|
-
redirectChain.push(currentUrl);
|
|
295
|
-
try {
|
|
296
|
-
currentUrl = new URL(location, currentUrl).href;
|
|
297
|
-
}
|
|
298
|
-
catch {
|
|
299
|
-
break;
|
|
300
|
-
}
|
|
301
|
-
continue;
|
|
302
|
-
}
|
|
303
|
-
let html;
|
|
304
|
-
try {
|
|
305
|
-
html = await response.text();
|
|
306
|
-
}
|
|
307
|
-
catch {
|
|
308
|
-
return null;
|
|
289
|
+
async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
|
|
290
|
+
try {
|
|
291
|
+
stats.total += 1;
|
|
292
|
+
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
293
|
+
if (r.fromCache) {
|
|
294
|
+
stats.hits += 1;
|
|
295
|
+
stats.bytesSavedEstimate += r.body.length;
|
|
309
296
|
}
|
|
310
297
|
return {
|
|
311
298
|
url,
|
|
312
|
-
html,
|
|
299
|
+
html: r.body,
|
|
313
300
|
httpMeta: {
|
|
314
|
-
statusCode: status,
|
|
315
|
-
finalUrl:
|
|
316
|
-
redirectChain,
|
|
317
|
-
xRobotsTag:
|
|
318
|
-
linkHeader:
|
|
301
|
+
statusCode: r.status,
|
|
302
|
+
finalUrl: r.url,
|
|
303
|
+
redirectChain: r.redirectChain,
|
|
304
|
+
xRobotsTag: r.headers["x-robots-tag"] ?? "",
|
|
305
|
+
linkHeader: r.headers.link ?? "",
|
|
319
306
|
},
|
|
320
307
|
};
|
|
321
308
|
}
|
|
322
|
-
|
|
309
|
+
catch {
|
|
310
|
+
return null;
|
|
311
|
+
}
|
|
323
312
|
}
|
|
324
|
-
async function fetchTextStrict(url, timeoutMs) {
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
313
|
+
async function fetchTextStrict(url, timeoutMs, cache, stats) {
|
|
314
|
+
stats.total += 1;
|
|
315
|
+
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
316
|
+
if (r.fromCache) {
|
|
317
|
+
stats.hits += 1;
|
|
318
|
+
stats.bytesSavedEstimate += r.body.length;
|
|
328
319
|
}
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
};
|
|
320
|
+
if (r.status < 200 || r.status >= 300) {
|
|
321
|
+
throw new Error(`Failed to fetch source: ${r.status}`);
|
|
322
|
+
}
|
|
323
|
+
return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
|
|
333
324
|
}
|
|
334
325
|
async function runWithConcurrency(items, limit, fn) {
|
|
335
326
|
let index = 0;
|
|
@@ -418,7 +409,7 @@ function fisherYatesSample(items, n) {
|
|
|
418
409
|
}
|
|
419
410
|
return arr.slice(arr.length - n);
|
|
420
411
|
}
|
|
421
|
-
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs) {
|
|
412
|
+
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats) {
|
|
422
413
|
visited.add(sitemapUrl);
|
|
423
414
|
const locs = parseSitemapUrls(sitemapText);
|
|
424
415
|
if (!isSitemapIndex(sitemapText)) {
|
|
@@ -428,24 +419,24 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
428
419
|
for (const childUrl of locs) {
|
|
429
420
|
if (visited.has(childUrl))
|
|
430
421
|
continue;
|
|
431
|
-
const child = await fetchWithRetry(childUrl, timeoutMs);
|
|
422
|
+
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats);
|
|
432
423
|
if (!child)
|
|
433
424
|
continue;
|
|
434
425
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
435
426
|
if (!childLike)
|
|
436
427
|
continue;
|
|
437
|
-
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs);
|
|
428
|
+
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats);
|
|
438
429
|
allUrls.push(...childUrls);
|
|
439
430
|
}
|
|
440
431
|
return allUrls;
|
|
441
432
|
}
|
|
442
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery) {
|
|
433
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats) {
|
|
443
434
|
if (/^https?:\/\//i.test(source)) {
|
|
444
435
|
let text;
|
|
445
436
|
let contentType;
|
|
446
437
|
let sourceStatus = 200;
|
|
447
438
|
try {
|
|
448
|
-
const fetched = await fetchTextStrict(source, timeoutMs);
|
|
439
|
+
const fetched = await fetchTextStrict(source, timeoutMs, cache, stats);
|
|
449
440
|
text = fetched.text;
|
|
450
441
|
contentType = fetched.contentType;
|
|
451
442
|
}
|
|
@@ -454,7 +445,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
454
445
|
if (source.includes("sitemap")) {
|
|
455
446
|
try {
|
|
456
447
|
const origin = new URL(source).origin;
|
|
457
|
-
const fallback = await fetchTextStrict(origin, timeoutMs);
|
|
448
|
+
const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats);
|
|
458
449
|
text = fallback.text;
|
|
459
450
|
contentType = fallback.contentType;
|
|
460
451
|
sourceStatus = -1; // flag that we fell back
|
|
@@ -470,17 +461,21 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
470
461
|
const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
|
|
471
462
|
if (isXml) {
|
|
472
463
|
const visited = new Set();
|
|
473
|
-
const
|
|
464
|
+
const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats);
|
|
465
|
+
// If we have a budget, sample from sitemap URLs before fetching
|
|
466
|
+
const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
|
|
467
|
+
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
468
|
+
: allSitemapUrls;
|
|
474
469
|
const pages = [];
|
|
475
|
-
await runWithConcurrency(
|
|
476
|
-
const result = await fetchPageWithMeta(url, timeoutMs);
|
|
470
|
+
await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
|
|
471
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
477
472
|
if (result) {
|
|
478
473
|
pages.push(result);
|
|
479
474
|
}
|
|
480
475
|
});
|
|
481
|
-
//
|
|
482
|
-
if (crawlDiscovery) {
|
|
483
|
-
const sitemapUrlSet = new Set(
|
|
476
|
+
// Skip additional crawl discovery when budget is active — sitemap is authoritative
|
|
477
|
+
if (crawlDiscovery && discoveryBudget === 0) {
|
|
478
|
+
const sitemapUrlSet = new Set(allSitemapUrls);
|
|
484
479
|
const discoveredUrls = new Set();
|
|
485
480
|
let sourceOrigin;
|
|
486
481
|
try {
|
|
@@ -516,14 +511,14 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
516
511
|
}
|
|
517
512
|
if (discoveredUrls.size > 0) {
|
|
518
513
|
await runWithConcurrency(Array.from(discoveredUrls), concurrency, async (url) => {
|
|
519
|
-
const result = await fetchPageWithMeta(url, timeoutMs);
|
|
514
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
520
515
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
521
516
|
pages.push(result);
|
|
522
517
|
}
|
|
523
518
|
});
|
|
524
519
|
}
|
|
525
520
|
}
|
|
526
|
-
return { pages, sitemapUrls: new Set(
|
|
521
|
+
return { pages, sitemapUrls: new Set(allSitemapUrls), discoveredUrlCount: allSitemapUrls.length };
|
|
527
522
|
}
|
|
528
523
|
if (contentType.includes("html") || looksLikeHtml(text)) {
|
|
529
524
|
const initialPage = { url: source, html: text };
|
|
@@ -537,8 +532,12 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
537
532
|
sourceOrigin = "";
|
|
538
533
|
}
|
|
539
534
|
const knownCrawled = new Set([source]);
|
|
535
|
+
const allDiscoveredUrls = new Set([source]);
|
|
540
536
|
const maxDepth = 3;
|
|
541
537
|
for (let depth = 0; depth < maxDepth; depth += 1) {
|
|
538
|
+
// Stop if we've hit the discovery budget
|
|
539
|
+
if (discoveryBudget > 0 && pages.length >= discoveryBudget)
|
|
540
|
+
break;
|
|
542
541
|
const frontier = new Set();
|
|
543
542
|
for (const page of pages) {
|
|
544
543
|
if (depth > 0 && !knownCrawled.has("__depth_" + depth + "_" + page.url))
|
|
@@ -568,11 +567,25 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
568
567
|
}
|
|
569
568
|
}
|
|
570
569
|
}
|
|
570
|
+
// Track all discovered URLs even if we don't fetch them
|
|
571
|
+
for (const url of frontier) {
|
|
572
|
+
allDiscoveredUrls.add(url);
|
|
573
|
+
}
|
|
571
574
|
if (frontier.size === 0)
|
|
572
575
|
break;
|
|
576
|
+
// If budget active, only fetch up to budget
|
|
577
|
+
let urlsToFetch = Array.from(frontier);
|
|
578
|
+
if (discoveryBudget > 0) {
|
|
579
|
+
const remaining = discoveryBudget - pages.length;
|
|
580
|
+
if (remaining <= 0)
|
|
581
|
+
break;
|
|
582
|
+
if (urlsToFetch.length > remaining) {
|
|
583
|
+
urlsToFetch = urlsToFetch.slice(0, remaining);
|
|
584
|
+
}
|
|
585
|
+
}
|
|
573
586
|
const newPages = [];
|
|
574
|
-
await runWithConcurrency(
|
|
575
|
-
const result = await fetchPageWithMeta(url, timeoutMs);
|
|
587
|
+
await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
|
|
588
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
576
589
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
577
590
|
newPages.push(result);
|
|
578
591
|
knownCrawled.add(url);
|
|
@@ -586,6 +599,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
586
599
|
if (newPages.length === 0)
|
|
587
600
|
break;
|
|
588
601
|
}
|
|
602
|
+
return { pages, discoveredUrlCount: allDiscoveredUrls.size };
|
|
589
603
|
}
|
|
590
604
|
return { pages };
|
|
591
605
|
}
|
|
@@ -613,6 +627,8 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
613
627
|
return { pages: [] };
|
|
614
628
|
}
|
|
615
629
|
export async function auditSource(source, options) {
|
|
630
|
+
const runId = generateRunId();
|
|
631
|
+
const runStartedAt = Date.now();
|
|
616
632
|
const concurrency = options?.concurrency ?? 5;
|
|
617
633
|
const timeoutMs = options?.timeout ?? 30000;
|
|
618
634
|
const ignorePatterns = options?.ignore ?? [];
|
|
@@ -638,7 +654,63 @@ export async function auditSource(source, options) {
|
|
|
638
654
|
stripWwwHost: options?.rules?.stripWwwHost ?? false
|
|
639
655
|
});
|
|
640
656
|
const crawlDiscovery = /^https?:\/\//i.test(source) && (options?.crawlDiscovery ?? true);
|
|
641
|
-
|
|
657
|
+
// Discovery budget: when sampleSize is set, cap discovery at 2x (min 50) to avoid
|
|
658
|
+
// fetching far more pages than we'll sample. First-run egress is bounded by sampleSize;
|
|
659
|
+
// re-runs hit the cache. Remove adaptive 200-cap: users get full crawl by default,
|
|
660
|
+
// repeated audits stay cheap via --cache.
|
|
661
|
+
const discoveryBudget = options?.sampleSize && options.sampleSize > 0
|
|
662
|
+
? Math.max(50, options.sampleSize * 2)
|
|
663
|
+
: 0;
|
|
664
|
+
const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0 };
|
|
665
|
+
const cacheConfig = options?.cache
|
|
666
|
+
? {
|
|
667
|
+
dir: options.cache.dir ?? ".pseolint/cache",
|
|
668
|
+
ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
|
|
669
|
+
}
|
|
670
|
+
: null;
|
|
671
|
+
const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats);
|
|
672
|
+
const loadedPages = [...loadedPagesRaw];
|
|
673
|
+
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
674
|
+
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
675
|
+
}
|
|
676
|
+
// State read + delta filtering
|
|
677
|
+
let priorState = null;
|
|
678
|
+
const skippedUrls = [];
|
|
679
|
+
if (options?.state?.since || options?.state?.exitOnRegression) {
|
|
680
|
+
const statePath = options.state.path ?? ".pseolint/state.json";
|
|
681
|
+
priorState = await readState(statePath);
|
|
682
|
+
const currentRenderMode = options.render ? "rendered" : "static";
|
|
683
|
+
if (priorState && priorState.renderMode !== currentRenderMode) {
|
|
684
|
+
console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
|
|
685
|
+
priorState = null;
|
|
686
|
+
}
|
|
687
|
+
if (priorState && options.state.since) {
|
|
688
|
+
const kept = [];
|
|
689
|
+
for (const p of loadedPages) {
|
|
690
|
+
const prior = priorState.urls[p.url];
|
|
691
|
+
if (prior && prior.contentHash === computeContentHash(p.html)) {
|
|
692
|
+
skippedUrls.push(p.url);
|
|
693
|
+
}
|
|
694
|
+
else {
|
|
695
|
+
kept.push(p);
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
loadedPages.splice(0, loadedPages.length, ...kept);
|
|
699
|
+
}
|
|
700
|
+
else if (!priorState && options.state.since) {
|
|
701
|
+
console.error("no prior state found — performing full baseline audit");
|
|
702
|
+
}
|
|
703
|
+
}
|
|
704
|
+
let robotsTxtContent = "";
|
|
705
|
+
if (/^https?:\/\//i.test(source)) {
|
|
706
|
+
try {
|
|
707
|
+
const origin = new URL(source).origin;
|
|
708
|
+
const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats);
|
|
709
|
+
if (result)
|
|
710
|
+
robotsTxtContent = result.text;
|
|
711
|
+
}
|
|
712
|
+
catch { /* ignore */ }
|
|
713
|
+
}
|
|
642
714
|
const deduped = [];
|
|
643
715
|
const urlHashes = new Map();
|
|
644
716
|
const duplicateUrlFindings = [];
|
|
@@ -665,8 +737,15 @@ export async function auditSource(source, options) {
|
|
|
665
737
|
const filtered = ignorePatterns.length > 0
|
|
666
738
|
? deduped.filter((page) => !shouldIgnore(page.url, ignorePatterns))
|
|
667
739
|
: deduped;
|
|
740
|
+
const strategy = options?.samplingStrategy ?? "stratified";
|
|
668
741
|
const sampled = sampleSize > 0 && sampleSize < filtered.length
|
|
669
|
-
?
|
|
742
|
+
? (strategy === "stratified"
|
|
743
|
+
? (() => {
|
|
744
|
+
const urlsMap = new Map(filtered.map(p => [p.url, p]));
|
|
745
|
+
const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize);
|
|
746
|
+
return sampledUrls.map(u => urlsMap.get(u));
|
|
747
|
+
})()
|
|
748
|
+
: fisherYatesSample(filtered, sampleSize))
|
|
670
749
|
: filtered;
|
|
671
750
|
const parsedPages = sampled.map((page) => {
|
|
672
751
|
const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
|
|
@@ -686,6 +765,29 @@ export async function auditSource(source, options) {
|
|
|
686
765
|
inbound.set(link, (inbound.get(link) ?? 0) + 1);
|
|
687
766
|
}
|
|
688
767
|
}
|
|
768
|
+
// Build entity patterns, merging user-supplied config patterns with defaults.
|
|
769
|
+
// Flags are restricted to known-safe characters to prevent ReDoS via crafted flags;
|
|
770
|
+
// each pattern is compiled eagerly so bad regexes fail at config time, not mid-audit.
|
|
771
|
+
const SAFE_FLAGS_RE = /^[gimsuy]*$/;
|
|
772
|
+
const entityPatterns = options?.entityPatterns
|
|
773
|
+
? [
|
|
774
|
+
...DEFAULT_ENTITY_PATTERNS,
|
|
775
|
+
...options.entityPatterns.map((p) => {
|
|
776
|
+
const rawFlags = p.flags ?? "gi";
|
|
777
|
+
if (!SAFE_FLAGS_RE.test(rawFlags)) {
|
|
778
|
+
throw new Error(`Invalid regex flags "${rawFlags}" in entityPatterns for placeholder "${p.placeholder}". ` +
|
|
779
|
+
`Only the flags g, i, m, s, u, y are permitted.`);
|
|
780
|
+
}
|
|
781
|
+
try {
|
|
782
|
+
// Flags validated against SAFE_FLAGS_RE above; pattern is from trusted local config, not HTTP input.
|
|
783
|
+
return { placeholder: p.placeholder, pattern: new RegExp(p.pattern, rawFlags) }; // nosemgrep
|
|
784
|
+
}
|
|
785
|
+
catch (err) {
|
|
786
|
+
throw new Error(`Invalid regex pattern for placeholder "${p.placeholder}": ${err.message}`);
|
|
787
|
+
}
|
|
788
|
+
}),
|
|
789
|
+
]
|
|
790
|
+
: DEFAULT_ENTITY_PATTERNS;
|
|
689
791
|
// Classify pages into groups and run only enabled rules per group
|
|
690
792
|
const classified = classifyPages(parsedPages, options?.pageGroups);
|
|
691
793
|
const allFindings = [...duplicateUrlFindings];
|
|
@@ -695,6 +797,18 @@ export async function auditSource(source, options) {
|
|
|
695
797
|
if (sitemapUrlSet && sitemapUrlSet.size > 0) {
|
|
696
798
|
const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
|
|
697
799
|
allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
800
|
+
if (robotsTxtContent) {
|
|
801
|
+
const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
|
|
802
|
+
allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
// Data source comparison rules
|
|
806
|
+
if (options?.dataSource?.records && options.dataSource.records.length > 0) {
|
|
807
|
+
const dataFindings = [
|
|
808
|
+
...dataBindingRule(parsedPages, options.dataSource.records),
|
|
809
|
+
...dataIdenticalRule(parsedPages, options.dataSource.records),
|
|
810
|
+
];
|
|
811
|
+
allFindings.push(...dataFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
698
812
|
}
|
|
699
813
|
for (const [groupName, groupPages] of classified) {
|
|
700
814
|
if (groupPages.length === 0)
|
|
@@ -704,7 +818,7 @@ export async function auditSource(source, options) {
|
|
|
704
818
|
continue;
|
|
705
819
|
const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
|
|
706
820
|
const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
|
|
707
|
-
const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS);
|
|
821
|
+
const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides);
|
|
708
822
|
allFindings.push(...findings);
|
|
709
823
|
groupPageCounts[groupName] = groupPages.length;
|
|
710
824
|
const { score } = scoreFromFindings(findings);
|
|
@@ -716,7 +830,7 @@ export async function auditSource(source, options) {
|
|
|
716
830
|
});
|
|
717
831
|
const { score, categoryScores } = scoreFromFindings(enriched.findings);
|
|
718
832
|
const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
|
|
719
|
-
|
|
833
|
+
const summary = {
|
|
720
834
|
score,
|
|
721
835
|
categoryScores,
|
|
722
836
|
groupScores: options?.pageGroups ? groupScores : undefined,
|
|
@@ -726,5 +840,207 @@ export async function auditSource(source, options) {
|
|
|
726
840
|
templateDetected: enriched.templateDetected,
|
|
727
841
|
rawFindingCount: enriched.rawFindingCount,
|
|
728
842
|
};
|
|
843
|
+
if (cacheConfig) {
|
|
844
|
+
summary.cacheStats = cacheStats;
|
|
845
|
+
}
|
|
846
|
+
if (skippedUrls.length > 0) {
|
|
847
|
+
summary.skippedUrls = skippedUrls;
|
|
848
|
+
}
|
|
849
|
+
if (priorState && options?.state?.exitOnRegression) {
|
|
850
|
+
let hasRegression = false;
|
|
851
|
+
const currentFindings = new Map();
|
|
852
|
+
for (const f of summary.findings) {
|
|
853
|
+
if (!f.pageUrl)
|
|
854
|
+
continue;
|
|
855
|
+
const set = currentFindings.get(f.pageUrl) ?? new Set();
|
|
856
|
+
set.add(f.ruleId);
|
|
857
|
+
currentFindings.set(f.pageUrl, set);
|
|
858
|
+
}
|
|
859
|
+
for (const [url, entry] of Object.entries(priorState.urls)) {
|
|
860
|
+
const cur = currentFindings.get(url);
|
|
861
|
+
if (!cur)
|
|
862
|
+
continue;
|
|
863
|
+
const priorIds = new Set(entry.findingIds);
|
|
864
|
+
for (const ruleId of cur) {
|
|
865
|
+
if (!priorIds.has(ruleId)) {
|
|
866
|
+
hasRegression = true;
|
|
867
|
+
break;
|
|
868
|
+
}
|
|
869
|
+
}
|
|
870
|
+
if (hasRegression)
|
|
871
|
+
break;
|
|
872
|
+
}
|
|
873
|
+
summary.hasRegression = hasRegression;
|
|
874
|
+
}
|
|
875
|
+
if (options?.state) {
|
|
876
|
+
const statePath = options.state.path ?? ".pseolint/state.json";
|
|
877
|
+
const renderMode = options.render ? "rendered" : "static";
|
|
878
|
+
const urls = {};
|
|
879
|
+
const findingsByUrl = new Map();
|
|
880
|
+
for (const f of summary.findings) {
|
|
881
|
+
if (!f.pageUrl)
|
|
882
|
+
continue;
|
|
883
|
+
const list = findingsByUrl.get(f.pageUrl) ?? [];
|
|
884
|
+
if (!list.includes(f.ruleId))
|
|
885
|
+
list.push(f.ruleId);
|
|
886
|
+
findingsByUrl.set(f.pageUrl, list);
|
|
887
|
+
}
|
|
888
|
+
// Preserve prior entries for URLs skipped by --since (they didn't change).
|
|
889
|
+
// Without this, delta runs would lose state for unchanged URLs.
|
|
890
|
+
if (priorState && skippedUrls.length > 0) {
|
|
891
|
+
for (const url of skippedUrls) {
|
|
892
|
+
const prior = priorState.urls[url];
|
|
893
|
+
if (prior)
|
|
894
|
+
urls[url] = prior;
|
|
895
|
+
}
|
|
896
|
+
}
|
|
897
|
+
for (const p of loadedPages) {
|
|
898
|
+
urls[p.url] = {
|
|
899
|
+
contentHash: computeContentHash(p.html),
|
|
900
|
+
fetchedAt: new Date().toISOString(),
|
|
901
|
+
status: p.httpMeta?.statusCode ?? 200,
|
|
902
|
+
findingIds: findingsByUrl.get(p.url) ?? [],
|
|
903
|
+
};
|
|
904
|
+
}
|
|
905
|
+
const newState = {
|
|
906
|
+
version: STATE_SCHEMA_VERSION,
|
|
907
|
+
lastRun: new Date().toISOString(),
|
|
908
|
+
source,
|
|
909
|
+
renderMode,
|
|
910
|
+
urls,
|
|
911
|
+
summary: {
|
|
912
|
+
score: summary.score,
|
|
913
|
+
totalFindings: summary.findings.length,
|
|
914
|
+
byCategory: Object.fromEntries(Object.entries(summary.categoryScores).map(([k, v]) => [k, v])),
|
|
915
|
+
},
|
|
916
|
+
};
|
|
917
|
+
await writeState(statePath, newState);
|
|
918
|
+
}
|
|
919
|
+
// Captured for telemetry even when triage is skipped, so users can diagnose
|
|
920
|
+
// model/provider reliability from their local stats.jsonl.
|
|
921
|
+
let triageAttempt;
|
|
922
|
+
if (options?.ai?.enabled) {
|
|
923
|
+
if (options.ai.apiKey) {
|
|
924
|
+
console.error("[ai-triage] warning: ai.apiKey is set in options. Prefer env vars (ANTHROPIC_API_KEY, OPENAI_API_KEY, etc.) — never commit an apiKey to a config file.");
|
|
925
|
+
}
|
|
926
|
+
try {
|
|
927
|
+
const resolved = await createLanguageModel({
|
|
928
|
+
provider: options.ai.provider,
|
|
929
|
+
model: options.ai.model,
|
|
930
|
+
endpoint: options.ai.endpoint,
|
|
931
|
+
apiKey: options.ai.apiKey,
|
|
932
|
+
});
|
|
933
|
+
const cacheConfig = options.ai.cache === false
|
|
934
|
+
? false
|
|
935
|
+
: {
|
|
936
|
+
dir: options.ai.cache?.dir ?? ".pseolint/ai-cache",
|
|
937
|
+
ttlMs: options.ai.cache?.ttlMs ?? 30 * 24 * 60 * 60 * 1000,
|
|
938
|
+
};
|
|
939
|
+
// Daily-budget pre-flight read (best-effort — missing file is fine).
|
|
940
|
+
let spentTodayUsd;
|
|
941
|
+
if (options.ai.dailyBudgetUsd !== undefined) {
|
|
942
|
+
const telemetryPath = options.telemetry?.path ?? ".pseolint/telemetry.jsonl";
|
|
943
|
+
try {
|
|
944
|
+
spentTodayUsd = await todayTriageSpendUsd(telemetryPath);
|
|
945
|
+
}
|
|
946
|
+
catch {
|
|
947
|
+
spentTodayUsd = 0;
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
const outcome = await triageFindings(summary.findings, summary.pageCount, {
|
|
951
|
+
enabled: true,
|
|
952
|
+
model: resolved.model,
|
|
953
|
+
providerId: resolved.providerId,
|
|
954
|
+
modelId: resolved.modelId,
|
|
955
|
+
maxInputTokens: options.ai.maxInputTokens,
|
|
956
|
+
maxOutputTokens: options.ai.maxOutputTokens,
|
|
957
|
+
maxCostUsd: options.ai.maxCostUsd,
|
|
958
|
+
dailyBudgetUsd: options.ai.dailyBudgetUsd,
|
|
959
|
+
spentTodayUsd,
|
|
960
|
+
cache: cacheConfig,
|
|
961
|
+
});
|
|
962
|
+
if (outcome.skipReason) {
|
|
963
|
+
console.error(`[ai-triage] skipped: ${outcome.skipReason}`);
|
|
964
|
+
triageAttempt = { providerId: resolved.providerId, modelId: resolved.modelId, skipReason: outcome.skipReason };
|
|
965
|
+
}
|
|
966
|
+
else {
|
|
967
|
+
summary.triage = outcome.result;
|
|
968
|
+
}
|
|
969
|
+
}
|
|
970
|
+
catch (e) {
|
|
971
|
+
const reason = e instanceof Error ? e.message : "unknown error";
|
|
972
|
+
console.error(`[ai-triage] skipped: ${reason}`);
|
|
973
|
+
// `resolved` is scoped to the try block (and may never have been created) — record the requested provider/model, blank when unset.
|
|
974
|
+
triageAttempt = { providerId: options.ai.provider ?? "", modelId: options.ai.model ?? "", skipReason: reason };
|
|
975
|
+
}
|
|
976
|
+
}
|
|
977
|
+
if (options?.telemetry?.enabled) {
|
|
978
|
+
const telemetryPath = options.telemetry.path ?? ".pseolint/telemetry.jsonl";
|
|
979
|
+
const auditRecord = {
|
|
980
|
+
type: "audit",
|
|
981
|
+
schemaVersion: 1,
|
|
982
|
+
runId,
|
|
983
|
+
timestamp: new Date().toISOString(),
|
|
984
|
+
durationMs: Date.now() - runStartedAt,
|
|
985
|
+
score: summary.score,
|
|
986
|
+
pageCount: summary.pageCount,
|
|
987
|
+
findingCount: summary.findings.length,
|
|
988
|
+
...(summary.rawFindingCount !== undefined && { rawFindingCount: summary.rawFindingCount }),
|
|
989
|
+
...(summary.templateDetected !== undefined && { templateDetected: summary.templateDetected }),
|
|
990
|
+
...(summary.cacheStats && { cacheStats: summary.cacheStats }),
|
|
991
|
+
...(summary.triage && {
|
|
992
|
+
triage: {
|
|
993
|
+
success: true,
|
|
994
|
+
rootCauseCount: summary.triage.rootCauses.length,
|
|
995
|
+
providerId: summary.triage.providerId,
|
|
996
|
+
modelId: summary.triage.modelUsed,
|
|
997
|
+
cacheHit: summary.triage.cacheHit,
|
|
998
|
+
tokenUsage: summary.triage.tokenUsage,
|
|
999
|
+
...(summary.triage.estimatedCostUsd !== undefined && {
|
|
1000
|
+
estimatedCostUsd: summary.triage.estimatedCostUsd,
|
|
1001
|
+
}),
|
|
1002
|
+
truncatedInput: summary.triage.truncatedInput,
|
|
1003
|
+
},
|
|
1004
|
+
}),
|
|
1005
|
+
...(!summary.triage && triageAttempt && {
|
|
1006
|
+
triage: {
|
|
1007
|
+
success: false,
|
|
1008
|
+
skipReason: triageAttempt.skipReason,
|
|
1009
|
+
rootCauseCount: 0,
|
|
1010
|
+
providerId: triageAttempt.providerId,
|
|
1011
|
+
modelId: triageAttempt.modelId,
|
|
1012
|
+
cacheHit: false,
|
|
1013
|
+
tokenUsage: { input: 0, output: 0 },
|
|
1014
|
+
truncatedInput: false,
|
|
1015
|
+
},
|
|
1016
|
+
}),
|
|
1017
|
+
};
|
|
1018
|
+
await appendTelemetryRecord(telemetryPath, auditRecord);
|
|
1019
|
+
// Feedback: only if triage ran
|
|
1020
|
+
if (summary.triage) {
|
|
1021
|
+
let rating;
|
|
1022
|
+
if (options.telemetry.feedback) {
|
|
1023
|
+
rating = options.telemetry.feedback;
|
|
1024
|
+
}
|
|
1025
|
+
else if (options.telemetry.prompt !== false) {
|
|
1026
|
+
rating = await promptTriageFeedback();
|
|
1027
|
+
}
|
|
1028
|
+
if (rating) {
|
|
1029
|
+
const feedbackRecord = {
|
|
1030
|
+
type: "feedback",
|
|
1031
|
+
schemaVersion: 1,
|
|
1032
|
+
runId,
|
|
1033
|
+
timestamp: new Date().toISOString(),
|
|
1034
|
+
rating,
|
|
1035
|
+
};
|
|
1036
|
+
await appendTelemetryRecord(telemetryPath, feedbackRecord);
|
|
1037
|
+
}
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
const aiHintEnabled = options?.ai?.suggest !== false;
|
|
1041
|
+
if (aiHintEnabled && !options?.ai?.enabled && process.env.ANTHROPIC_API_KEY) {
|
|
1042
|
+
console.error(`💡 AI triage available — re-run with --ai to prioritize ${summary.findings.length} findings into a fix list.`);
|
|
1043
|
+
}
|
|
1044
|
+
return summary;
|
|
729
1045
|
}
|
|
730
1046
|
//# sourceMappingURL=auditor.js.map
|