@pseolint/core 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/dist/ai/adapters/index.d.ts +53 -0
  2. package/dist/ai/adapters/index.d.ts.map +1 -0
  3. package/dist/ai/adapters/index.js +158 -0
  4. package/dist/ai/adapters/index.js.map +1 -0
  5. package/dist/ai/cache.d.ts +11 -0
  6. package/dist/ai/cache.d.ts.map +1 -0
  7. package/dist/ai/cache.js +40 -0
  8. package/dist/ai/cache.js.map +1 -0
  9. package/dist/ai/cost.d.ts +3 -0
  10. package/dist/ai/cost.d.ts.map +1 -0
  11. package/dist/ai/cost.js +22 -0
  12. package/dist/ai/cost.js.map +1 -0
  13. package/dist/ai/feedback-prompt.d.ts +22 -0
  14. package/dist/ai/feedback-prompt.d.ts.map +1 -0
  15. package/dist/ai/feedback-prompt.js +39 -0
  16. package/dist/ai/feedback-prompt.js.map +1 -0
  17. package/dist/ai/prompt.d.ts +10 -0
  18. package/dist/ai/prompt.d.ts.map +1 -0
  19. package/dist/ai/prompt.js +51 -0
  20. package/dist/ai/prompt.js.map +1 -0
  21. package/dist/ai/triage.d.ts +28 -0
  22. package/dist/ai/triage.d.ts.map +1 -0
  23. package/dist/ai/triage.js +136 -0
  24. package/dist/ai/triage.js.map +1 -0
  25. package/dist/ai/types.d.ts +27 -0
  26. package/dist/ai/types.d.ts.map +1 -0
  27. package/dist/ai/types.js +2 -0
  28. package/dist/ai/types.js.map +1 -0
  29. package/dist/auditor.d.ts.map +1 -1
  30. package/dist/auditor.js +399 -83
  31. package/dist/auditor.js.map +1 -1
  32. package/dist/cache.d.ts +44 -0
  33. package/dist/cache.d.ts.map +1 -0
  34. package/dist/cache.js +182 -0
  35. package/dist/cache.js.map +1 -0
  36. package/dist/data-source-loader.d.ts +14 -0
  37. package/dist/data-source-loader.d.ts.map +1 -0
  38. package/dist/data-source-loader.js +76 -0
  39. package/dist/data-source-loader.js.map +1 -0
  40. package/dist/enrich-findings.d.ts.map +1 -1
  41. package/dist/enrich-findings.js +4 -0
  42. package/dist/enrich-findings.js.map +1 -1
  43. package/dist/formatters/console.d.ts.map +1 -1
  44. package/dist/formatters/console.js +30 -0
  45. package/dist/formatters/console.js.map +1 -1
  46. package/dist/formatters/html.d.ts.map +1 -1
  47. package/dist/formatters/html.js +92 -70
  48. package/dist/formatters/html.js.map +1 -1
  49. package/dist/formatters/markdown.d.ts.map +1 -1
  50. package/dist/formatters/markdown.js +29 -0
  51. package/dist/formatters/markdown.js.map +1 -1
  52. package/dist/index.d.ts +20 -0
  53. package/dist/index.d.ts.map +1 -1
  54. package/dist/index.js +11 -0
  55. package/dist/index.js.map +1 -1
  56. package/dist/rule-references.d.ts.map +1 -1
  57. package/dist/rule-references.js +3 -0
  58. package/dist/rule-references.js.map +1 -1
  59. package/dist/rules/data/data-binding.d.ts +4 -0
  60. package/dist/rules/data/data-binding.d.ts.map +1 -0
  61. package/dist/rules/data/data-binding.js +107 -0
  62. package/dist/rules/data/data-binding.js.map +1 -0
  63. package/dist/rules/tech/robots-sitemap-presence.d.ts +2 -1
  64. package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
  65. package/dist/rules/tech/robots-sitemap-presence.js +101 -0
  66. package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
  67. package/dist/rules/tech/sitemap-completeness.d.ts.map +1 -1
  68. package/dist/rules/tech/sitemap-completeness.js +15 -0
  69. package/dist/rules/tech/sitemap-completeness.js.map +1 -1
  70. package/dist/state.d.ts +35 -0
  71. package/dist/state.d.ts.map +1 -0
  72. package/dist/state.js +64 -0
  73. package/dist/state.js.map +1 -0
  74. package/dist/stratified-sample.d.ts +3 -0
  75. package/dist/stratified-sample.d.ts.map +1 -0
  76. package/dist/stratified-sample.js +88 -0
  77. package/dist/stratified-sample.js.map +1 -0
  78. package/dist/telemetry/aggregator.d.ts +47 -0
  79. package/dist/telemetry/aggregator.d.ts.map +1 -0
  80. package/dist/telemetry/aggregator.js +77 -0
  81. package/dist/telemetry/aggregator.js.map +1 -0
  82. package/dist/telemetry/index.d.ts +5 -0
  83. package/dist/telemetry/index.d.ts.map +1 -0
  84. package/dist/telemetry/index.js +5 -0
  85. package/dist/telemetry/index.js.map +1 -0
  86. package/dist/telemetry/reader.d.ts +12 -0
  87. package/dist/telemetry/reader.d.ts.map +1 -0
  88. package/dist/telemetry/reader.js +35 -0
  89. package/dist/telemetry/reader.js.map +1 -0
  90. package/dist/telemetry/types.d.ts +126 -0
  91. package/dist/telemetry/types.d.ts.map +1 -0
  92. package/dist/telemetry/types.js +75 -0
  93. package/dist/telemetry/types.js.map +1 -0
  94. package/dist/telemetry/writer.d.ts +12 -0
  95. package/dist/telemetry/writer.d.ts.map +1 -0
  96. package/dist/telemetry/writer.js +38 -0
  97. package/dist/telemetry/writer.js.map +1 -0
  98. package/dist/types.d.ts +96 -0
  99. package/dist/types.d.ts.map +1 -1
  100. package/package.json +26 -6
  101. package/dist/algorithms/entity-mask.test.d.ts +0 -2
  102. package/dist/algorithms/entity-mask.test.d.ts.map +0 -1
  103. package/dist/algorithms/entity-mask.test.js +0 -23
  104. package/dist/algorithms/entity-mask.test.js.map +0 -1
  105. package/dist/algorithms/simhash.test.d.ts +0 -2
  106. package/dist/algorithms/simhash.test.d.ts.map +0 -1
  107. package/dist/algorithms/simhash.test.js +0 -23
  108. package/dist/algorithms/simhash.test.js.map +0 -1
  109. package/dist/auditor.test.d.ts +0 -2
  110. package/dist/auditor.test.d.ts.map +0 -1
  111. package/dist/auditor.test.js +0 -134
  112. package/dist/auditor.test.js.map +0 -1
  113. package/dist/parser.test.d.ts +0 -2
  114. package/dist/parser.test.d.ts.map +0 -1
  115. package/dist/parser.test.js +0 -37
  116. package/dist/parser.test.js.map +0 -1
package/dist/auditor.js CHANGED
@@ -26,6 +26,7 @@ import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
26
26
  import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
27
27
  import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
28
28
  import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
29
+ import { robotsComplianceRule } from "./rules/tech/robots-sitemap-presence.js";
29
30
  import { redirectChainRule } from "./rules/tech/redirect-chain.js";
30
31
  import { soft404Rule } from "./rules/tech/soft-404.js";
31
32
  import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
@@ -35,9 +36,17 @@ import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
35
36
  import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
36
37
  import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
37
38
  import { templateCoverageRule } from "./rules/spam/template-coverage.js";
39
+ import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
38
40
  import { classifyPages, isRuleEnabled } from "./page-classifier.js";
39
41
  import { RULE_REFERENCES } from "./rule-references.js";
40
42
  import { enrichFindings } from "./enrich-findings.js";
43
+ import { triageFindings } from "./ai/triage.js";
44
+ import { createLanguageModel } from "./ai/adapters/index.js";
45
+ import { promptTriageFeedback } from "./ai/feedback-prompt.js";
46
+ import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
47
+ import { cachedFetch } from "./cache.js";
48
+ import { stratifiedSample } from "./stratified-sample.js";
49
+ import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
41
50
  const DEFAULTS = {
42
51
  nearDuplicateThreshold: 0.85,
43
52
  entitySwapThreshold: 0.95,
@@ -84,13 +93,17 @@ function resolveGroupRules(baseRules, overrides) {
84
93
  }
85
94
  return result;
86
95
  }
87
- function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns) {
96
+ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides) {
88
97
  const findings = [];
89
- const tag = (results) => results.map((r) => ({
90
- ...r,
91
- group: groupName === "__default" ? undefined : groupName,
92
- ref: r.ref ?? RULE_REFERENCES[r.ruleId],
93
- }));
98
+ const tag = (results) => results.map((r) => {
99
+ const override = overrides?.[r.ruleId];
100
+ return {
101
+ ...r,
102
+ group: groupName === "__default" ? undefined : groupName,
103
+ ref: r.ref ?? RULE_REFERENCES[r.ruleId],
104
+ ...(override?.severity ? { severity: override.severity } : {}),
105
+ };
106
+ });
94
107
  // Spam rules — always compute cross-page data, only push findings if enabled
95
108
  const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
96
109
  if (isEnabled("spam/near-duplicate")) {
@@ -257,79 +270,57 @@ async function collectHtmlFiles(directory) {
257
270
  }));
258
271
  return files.flat();
259
272
  }
260
- async function fetchWithRetry(url, timeoutMs) {
273
+ async function fetchWithRetry(url, timeoutMs, cache, stats) {
261
274
  try {
262
- const response = await fetch(url, { signal: AbortSignal.timeout(timeoutMs) });
263
- if (!response.ok) {
264
- return null;
275
+ stats.total += 1;
276
+ const r = await cachedFetch(url, { timeoutMs, cache });
277
+ if (r.fromCache) {
278
+ stats.hits += 1;
279
+ stats.bytesSavedEstimate += r.body.length;
265
280
  }
266
- return {
267
- text: await response.text(),
268
- contentType: response.headers.get("content-type")?.toLowerCase() ?? ""
269
- };
281
+ if (r.status < 200 || r.status >= 300)
282
+ return null;
283
+ return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
270
284
  }
271
285
  catch {
272
286
  return null;
273
287
  }
274
288
  }
275
- async function fetchPageWithMeta(url, timeoutMs) {
276
- const redirectChain = [];
277
- let currentUrl = url;
278
- for (let hop = 0; hop < 10; hop += 1) {
279
- let response;
280
- try {
281
- response = await fetch(currentUrl, {
282
- redirect: "manual",
283
- signal: AbortSignal.timeout(timeoutMs),
284
- });
285
- }
286
- catch {
287
- return null;
288
- }
289
- const status = response.status;
290
- if (status >= 300 && status < 400) {
291
- const location = response.headers.get("location");
292
- if (!location)
293
- break;
294
- redirectChain.push(currentUrl);
295
- try {
296
- currentUrl = new URL(location, currentUrl).href;
297
- }
298
- catch {
299
- break;
300
- }
301
- continue;
302
- }
303
- let html;
304
- try {
305
- html = await response.text();
306
- }
307
- catch {
308
- return null;
289
+ async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
290
+ try {
291
+ stats.total += 1;
292
+ const r = await cachedFetch(url, { timeoutMs, cache });
293
+ if (r.fromCache) {
294
+ stats.hits += 1;
295
+ stats.bytesSavedEstimate += r.body.length;
309
296
  }
310
297
  return {
311
298
  url,
312
- html,
299
+ html: r.body,
313
300
  httpMeta: {
314
- statusCode: status,
315
- finalUrl: currentUrl,
316
- redirectChain,
317
- xRobotsTag: response.headers.get("x-robots-tag") ?? "",
318
- linkHeader: response.headers.get("link") ?? "",
301
+ statusCode: r.status,
302
+ finalUrl: r.url,
303
+ redirectChain: r.redirectChain,
304
+ xRobotsTag: r.headers["x-robots-tag"] ?? "",
305
+ linkHeader: r.headers.link ?? "",
319
306
  },
320
307
  };
321
308
  }
322
- return null;
309
+ catch {
310
+ return null;
311
+ }
323
312
  }
324
- async function fetchTextStrict(url, timeoutMs) {
325
- const response = await fetch(url, { signal: AbortSignal.timeout(timeoutMs) });
326
- if (!response.ok) {
327
- throw new Error(`Failed to fetch source: ${response.status} ${response.statusText}`);
313
+ async function fetchTextStrict(url, timeoutMs, cache, stats) {
314
+ stats.total += 1;
315
+ const r = await cachedFetch(url, { timeoutMs, cache });
316
+ if (r.fromCache) {
317
+ stats.hits += 1;
318
+ stats.bytesSavedEstimate += r.body.length;
328
319
  }
329
- return {
330
- text: await response.text(),
331
- contentType: response.headers.get("content-type")?.toLowerCase() ?? ""
332
- };
320
+ if (r.status < 200 || r.status >= 300) {
321
+ throw new Error(`Failed to fetch source: ${r.status}`);
322
+ }
323
+ return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
333
324
  }
334
325
  async function runWithConcurrency(items, limit, fn) {
335
326
  let index = 0;
@@ -418,7 +409,7 @@ function fisherYatesSample(items, n) {
418
409
  }
419
410
  return arr.slice(arr.length - n);
420
411
  }
421
- async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs) {
412
+ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats) {
422
413
  visited.add(sitemapUrl);
423
414
  const locs = parseSitemapUrls(sitemapText);
424
415
  if (!isSitemapIndex(sitemapText)) {
@@ -428,24 +419,24 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
428
419
  for (const childUrl of locs) {
429
420
  if (visited.has(childUrl))
430
421
  continue;
431
- const child = await fetchWithRetry(childUrl, timeoutMs);
422
+ const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats);
432
423
  if (!child)
433
424
  continue;
434
425
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
435
426
  if (!childLike)
436
427
  continue;
437
- const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs);
428
+ const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats);
438
429
  allUrls.push(...childUrls);
439
430
  }
440
431
  return allUrls;
441
432
  }
442
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery) {
433
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats) {
443
434
  if (/^https?:\/\//i.test(source)) {
444
435
  let text;
445
436
  let contentType;
446
437
  let sourceStatus = 200;
447
438
  try {
448
- const fetched = await fetchTextStrict(source, timeoutMs);
439
+ const fetched = await fetchTextStrict(source, timeoutMs, cache, stats);
449
440
  text = fetched.text;
450
441
  contentType = fetched.contentType;
451
442
  }
@@ -454,7 +445,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
454
445
  if (source.includes("sitemap")) {
455
446
  try {
456
447
  const origin = new URL(source).origin;
457
- const fallback = await fetchTextStrict(origin, timeoutMs);
448
+ const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats);
458
449
  text = fallback.text;
459
450
  contentType = fallback.contentType;
460
451
  sourceStatus = -1; // flag that we fell back
@@ -470,17 +461,21 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
470
461
  const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
471
462
  if (isXml) {
472
463
  const visited = new Set();
473
- const urls = await collectUrlsFromSitemap(text, source, visited, timeoutMs);
464
+ const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats);
465
+ // If we have a budget, sample from sitemap URLs before fetching
466
+ const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
467
+ ? fisherYatesSample(allSitemapUrls, discoveryBudget)
468
+ : allSitemapUrls;
474
469
  const pages = [];
475
- await runWithConcurrency(urls, concurrency, async (url) => {
476
- const result = await fetchPageWithMeta(url, timeoutMs);
470
+ await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
471
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
477
472
  if (result) {
478
473
  pages.push(result);
479
474
  }
480
475
  });
481
- // Crawl discovery: follow internal links to find pages not in sitemap
482
- if (crawlDiscovery) {
483
- const sitemapUrlSet = new Set(urls);
476
+ // Skip additional crawl discovery when budget is active — sitemap is authoritative
477
+ if (crawlDiscovery && discoveryBudget === 0) {
478
+ const sitemapUrlSet = new Set(allSitemapUrls);
484
479
  const discoveredUrls = new Set();
485
480
  let sourceOrigin;
486
481
  try {
@@ -516,14 +511,14 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
516
511
  }
517
512
  if (discoveredUrls.size > 0) {
518
513
  await runWithConcurrency(Array.from(discoveredUrls), concurrency, async (url) => {
519
- const result = await fetchPageWithMeta(url, timeoutMs);
514
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
520
515
  if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
521
516
  pages.push(result);
522
517
  }
523
518
  });
524
519
  }
525
520
  }
526
- return { pages, sitemapUrls: new Set(urls) };
521
+ return { pages, sitemapUrls: new Set(allSitemapUrls), discoveredUrlCount: allSitemapUrls.length };
527
522
  }
528
523
  if (contentType.includes("html") || looksLikeHtml(text)) {
529
524
  const initialPage = { url: source, html: text };
@@ -537,8 +532,12 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
537
532
  sourceOrigin = "";
538
533
  }
539
534
  const knownCrawled = new Set([source]);
535
+ const allDiscoveredUrls = new Set([source]);
540
536
  const maxDepth = 3;
541
537
  for (let depth = 0; depth < maxDepth; depth += 1) {
538
+ // Stop if we've hit the discovery budget
539
+ if (discoveryBudget > 0 && pages.length >= discoveryBudget)
540
+ break;
542
541
  const frontier = new Set();
543
542
  for (const page of pages) {
544
543
  if (depth > 0 && !knownCrawled.has("__depth_" + depth + "_" + page.url))
@@ -568,11 +567,25 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
568
567
  }
569
568
  }
570
569
  }
570
+ // Track all discovered URLs even if we don't fetch them
571
+ for (const url of frontier) {
572
+ allDiscoveredUrls.add(url);
573
+ }
571
574
  if (frontier.size === 0)
572
575
  break;
576
+ // If budget active, only fetch up to budget
577
+ let urlsToFetch = Array.from(frontier);
578
+ if (discoveryBudget > 0) {
579
+ const remaining = discoveryBudget - pages.length;
580
+ if (remaining <= 0)
581
+ break;
582
+ if (urlsToFetch.length > remaining) {
583
+ urlsToFetch = urlsToFetch.slice(0, remaining);
584
+ }
585
+ }
573
586
  const newPages = [];
574
- await runWithConcurrency(Array.from(frontier), concurrency, async (url) => {
575
- const result = await fetchPageWithMeta(url, timeoutMs);
587
+ await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
588
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
576
589
  if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
577
590
  newPages.push(result);
578
591
  knownCrawled.add(url);
@@ -586,6 +599,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
586
599
  if (newPages.length === 0)
587
600
  break;
588
601
  }
602
+ return { pages, discoveredUrlCount: allDiscoveredUrls.size };
589
603
  }
590
604
  return { pages };
591
605
  }
@@ -613,6 +627,8 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
613
627
  return { pages: [] };
614
628
  }
615
629
  export async function auditSource(source, options) {
630
+ const runId = generateRunId();
631
+ const runStartedAt = Date.now();
616
632
  const concurrency = options?.concurrency ?? 5;
617
633
  const timeoutMs = options?.timeout ?? 30000;
618
634
  const ignorePatterns = options?.ignore ?? [];
@@ -638,7 +654,63 @@ export async function auditSource(source, options) {
638
654
  stripWwwHost: options?.rules?.stripWwwHost ?? false
639
655
  });
640
656
  const crawlDiscovery = /^https?:\/\//i.test(source) && (options?.crawlDiscovery ?? true);
641
- const { pages: loadedPages, sitemapUrls: sitemapUrlSet } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery);
657
+ // Discovery budget: when sampleSize is set, cap discovery at 2x (min 50) to avoid
658
+ // fetching far more pages than we'll sample. First-run egress is bounded by sampleSize;
659
+ // re-runs hit the cache. Remove adaptive 200-cap: users get full crawl by default,
660
+ // repeated audits stay cheap via --cache.
661
+ const discoveryBudget = options?.sampleSize && options.sampleSize > 0
662
+ ? Math.max(50, options.sampleSize * 2)
663
+ : 0;
664
+ const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0 };
665
+ const cacheConfig = options?.cache
666
+ ? {
667
+ dir: options.cache.dir ?? ".pseolint/cache",
668
+ ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
669
+ }
670
+ : null;
671
+ const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats);
672
+ const loadedPages = [...loadedPagesRaw];
673
+ if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
674
+ console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
675
+ }
676
+ // State read + delta filtering
677
+ let priorState = null;
678
+ const skippedUrls = [];
679
+ if (options?.state?.since || options?.state?.exitOnRegression) {
680
+ const statePath = options.state.path ?? ".pseolint/state.json";
681
+ priorState = await readState(statePath);
682
+ const currentRenderMode = options.render ? "rendered" : "static";
683
+ if (priorState && priorState.renderMode !== currentRenderMode) {
684
+ console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
685
+ priorState = null;
686
+ }
687
+ if (priorState && options.state.since) {
688
+ const kept = [];
689
+ for (const p of loadedPages) {
690
+ const prior = priorState.urls[p.url];
691
+ if (prior && prior.contentHash === computeContentHash(p.html)) {
692
+ skippedUrls.push(p.url);
693
+ }
694
+ else {
695
+ kept.push(p);
696
+ }
697
+ }
698
+ loadedPages.splice(0, loadedPages.length, ...kept);
699
+ }
700
+ else if (!priorState && options.state.since) {
701
+ console.error("no prior state found — performing full baseline audit");
702
+ }
703
+ }
704
+ let robotsTxtContent = "";
705
+ if (/^https?:\/\//i.test(source)) {
706
+ try {
707
+ const origin = new URL(source).origin;
708
+ const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats);
709
+ if (result)
710
+ robotsTxtContent = result.text;
711
+ }
712
+ catch { /* ignore */ }
713
+ }
642
714
  const deduped = [];
643
715
  const urlHashes = new Map();
644
716
  const duplicateUrlFindings = [];
@@ -665,8 +737,15 @@ export async function auditSource(source, options) {
665
737
  const filtered = ignorePatterns.length > 0
666
738
  ? deduped.filter((page) => !shouldIgnore(page.url, ignorePatterns))
667
739
  : deduped;
740
+ const strategy = options?.samplingStrategy ?? "stratified";
668
741
  const sampled = sampleSize > 0 && sampleSize < filtered.length
669
- ? fisherYatesSample(filtered, sampleSize)
742
+ ? (strategy === "stratified"
743
+ ? (() => {
744
+ const urlsMap = new Map(filtered.map(p => [p.url, p]));
745
+ const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize);
746
+ return sampledUrls.map(u => urlsMap.get(u));
747
+ })()
748
+ : fisherYatesSample(filtered, sampleSize))
670
749
  : filtered;
671
750
  const parsedPages = sampled.map((page) => {
672
751
  const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
@@ -686,6 +765,29 @@ export async function auditSource(source, options) {
686
765
  inbound.set(link, (inbound.get(link) ?? 0) + 1);
687
766
  }
688
767
  }
768
+ // Build entity patterns, merging user-supplied config patterns with defaults.
769
+ // Flags are restricted to known-safe characters to prevent ReDoS via crafted flags;
770
+ // each pattern is compiled eagerly so bad regexes fail at config time, not mid-audit.
771
+ const SAFE_FLAGS_RE = /^[gimsuy]*$/;
772
+ const entityPatterns = options?.entityPatterns
773
+ ? [
774
+ ...DEFAULT_ENTITY_PATTERNS,
775
+ ...options.entityPatterns.map((p) => {
776
+ const rawFlags = p.flags ?? "gi";
777
+ if (!SAFE_FLAGS_RE.test(rawFlags)) {
778
+ throw new Error(`Invalid regex flags "${rawFlags}" in entityPatterns for placeholder "${p.placeholder}". ` +
779
+ `Only the flags g, i, m, s, u, y are permitted.`);
780
+ }
781
+ try {
782
+ // Flags validated against SAFE_FLAGS_RE above; pattern is from trusted local config, not HTTP input.
783
+ return { placeholder: p.placeholder, pattern: new RegExp(p.pattern, rawFlags) }; // nosemgrep
784
+ }
785
+ catch (err) {
786
+ throw new Error(`Invalid regex pattern for placeholder "${p.placeholder}": ${err.message}`);
787
+ }
788
+ }),
789
+ ]
790
+ : DEFAULT_ENTITY_PATTERNS;
689
791
  // Classify pages into groups and run only enabled rules per group
690
792
  const classified = classifyPages(parsedPages, options?.pageGroups);
691
793
  const allFindings = [...duplicateUrlFindings];
@@ -695,6 +797,18 @@ export async function auditSource(source, options) {
695
797
  if (sitemapUrlSet && sitemapUrlSet.size > 0) {
696
798
  const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
697
799
  allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
800
+ if (robotsTxtContent) {
801
+ const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
802
+ allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
803
+ }
804
+ }
805
+ // Data source comparison rules
806
+ if (options?.dataSource?.records && options.dataSource.records.length > 0) {
807
+ const dataFindings = [
808
+ ...dataBindingRule(parsedPages, options.dataSource.records),
809
+ ...dataIdenticalRule(parsedPages, options.dataSource.records),
810
+ ];
811
+ allFindings.push(...dataFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
698
812
  }
699
813
  for (const [groupName, groupPages] of classified) {
700
814
  if (groupPages.length === 0)
@@ -704,7 +818,7 @@ export async function auditSource(source, options) {
704
818
  continue;
705
819
  const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
706
820
  const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
707
- const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS);
821
+ const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides);
708
822
  allFindings.push(...findings);
709
823
  groupPageCounts[groupName] = groupPages.length;
710
824
  const { score } = scoreFromFindings(findings);
@@ -716,7 +830,7 @@ export async function auditSource(source, options) {
716
830
  });
717
831
  const { score, categoryScores } = scoreFromFindings(enriched.findings);
718
832
  const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
719
- return {
833
+ const summary = {
720
834
  score,
721
835
  categoryScores,
722
836
  groupScores: options?.pageGroups ? groupScores : undefined,
@@ -726,5 +840,207 @@ export async function auditSource(source, options) {
726
840
  templateDetected: enriched.templateDetected,
727
841
  rawFindingCount: enriched.rawFindingCount,
728
842
  };
843
+ if (cacheConfig) {
844
+ summary.cacheStats = cacheStats;
845
+ }
846
+ if (skippedUrls.length > 0) {
847
+ summary.skippedUrls = skippedUrls;
848
+ }
849
+ if (priorState && options?.state?.exitOnRegression) {
850
+ let hasRegression = false;
851
+ const currentFindings = new Map();
852
+ for (const f of summary.findings) {
853
+ if (!f.pageUrl)
854
+ continue;
855
+ const set = currentFindings.get(f.pageUrl) ?? new Set();
856
+ set.add(f.ruleId);
857
+ currentFindings.set(f.pageUrl, set);
858
+ }
859
+ for (const [url, entry] of Object.entries(priorState.urls)) {
860
+ const cur = currentFindings.get(url);
861
+ if (!cur)
862
+ continue;
863
+ const priorIds = new Set(entry.findingIds);
864
+ for (const ruleId of cur) {
865
+ if (!priorIds.has(ruleId)) {
866
+ hasRegression = true;
867
+ break;
868
+ }
869
+ }
870
+ if (hasRegression)
871
+ break;
872
+ }
873
+ summary.hasRegression = hasRegression;
874
+ }
875
+ if (options?.state) {
876
+ const statePath = options.state.path ?? ".pseolint/state.json";
877
+ const renderMode = options.render ? "rendered" : "static";
878
+ const urls = {};
879
+ const findingsByUrl = new Map();
880
+ for (const f of summary.findings) {
881
+ if (!f.pageUrl)
882
+ continue;
883
+ const list = findingsByUrl.get(f.pageUrl) ?? [];
884
+ if (!list.includes(f.ruleId))
885
+ list.push(f.ruleId);
886
+ findingsByUrl.set(f.pageUrl, list);
887
+ }
888
+ // Preserve prior entries for URLs skipped by --since (they didn't change).
889
+ // Without this, delta runs would lose state for unchanged URLs.
890
+ if (priorState && skippedUrls.length > 0) {
891
+ for (const url of skippedUrls) {
892
+ const prior = priorState.urls[url];
893
+ if (prior)
894
+ urls[url] = prior;
895
+ }
896
+ }
897
+ for (const p of loadedPages) {
898
+ urls[p.url] = {
899
+ contentHash: computeContentHash(p.html),
900
+ fetchedAt: new Date().toISOString(),
901
+ status: p.httpMeta?.statusCode ?? 200,
902
+ findingIds: findingsByUrl.get(p.url) ?? [],
903
+ };
904
+ }
905
+ const newState = {
906
+ version: STATE_SCHEMA_VERSION,
907
+ lastRun: new Date().toISOString(),
908
+ source,
909
+ renderMode,
910
+ urls,
911
+ summary: {
912
+ score: summary.score,
913
+ totalFindings: summary.findings.length,
914
+ byCategory: Object.fromEntries(Object.entries(summary.categoryScores).map(([k, v]) => [k, v])),
915
+ },
916
+ };
917
+ await writeState(statePath, newState);
918
+ }
919
+ // Captured for telemetry even when triage is skipped, so users can diagnose
920
+ // model/provider reliability from their local stats.jsonl.
921
+ let triageAttempt;
922
+ if (options?.ai?.enabled) {
923
+ if (options.ai.apiKey) {
924
+ console.error("[ai-triage] warning: ai.apiKey is set in options. Prefer env vars (ANTHROPIC_API_KEY, OPENAI_API_KEY, etc.) — never commit an apiKey to a config file.");
925
+ }
926
+ try {
927
+ const resolved = await createLanguageModel({
928
+ provider: options.ai.provider,
929
+ model: options.ai.model,
930
+ endpoint: options.ai.endpoint,
931
+ apiKey: options.ai.apiKey,
932
+ });
933
+ const cacheConfig = options.ai.cache === false
934
+ ? false
935
+ : {
936
+ dir: options.ai.cache?.dir ?? ".pseolint/ai-cache",
937
+ ttlMs: options.ai.cache?.ttlMs ?? 30 * 24 * 60 * 60 * 1000,
938
+ };
939
+ // Daily-budget pre-flight read (best-effort — missing file is fine).
940
+ let spentTodayUsd;
941
+ if (options.ai.dailyBudgetUsd !== undefined) {
942
+ const telemetryPath = options.telemetry?.path ?? ".pseolint/telemetry.jsonl";
943
+ try {
944
+ spentTodayUsd = await todayTriageSpendUsd(telemetryPath);
945
+ }
946
+ catch {
947
+ spentTodayUsd = 0;
948
+ }
949
+ }
950
+ const outcome = await triageFindings(summary.findings, summary.pageCount, {
951
+ enabled: true,
952
+ model: resolved.model,
953
+ providerId: resolved.providerId,
954
+ modelId: resolved.modelId,
955
+ maxInputTokens: options.ai.maxInputTokens,
956
+ maxOutputTokens: options.ai.maxOutputTokens,
957
+ maxCostUsd: options.ai.maxCostUsd,
958
+ dailyBudgetUsd: options.ai.dailyBudgetUsd,
959
+ spentTodayUsd,
960
+ cache: cacheConfig,
961
+ });
962
+ if (outcome.skipReason) {
963
+ console.error(`[ai-triage] skipped: ${outcome.skipReason}`);
964
+ triageAttempt = { providerId: resolved.providerId, modelId: resolved.modelId, skipReason: outcome.skipReason };
965
+ }
966
+ else {
967
+ summary.triage = outcome.result;
968
+ }
969
+ }
970
+ catch (e) {
971
+ const reason = e instanceof Error ? e.message : "unknown error";
972
+ console.error(`[ai-triage] skipped: ${reason}`);
973
+ // No resolved model — providerId/modelId blank.
974
+ triageAttempt = { providerId: options.ai.provider ?? "", modelId: options.ai.model ?? "", skipReason: reason };
975
+ }
976
+ }
977
+ if (options?.telemetry?.enabled) {
978
+ const telemetryPath = options.telemetry.path ?? ".pseolint/telemetry.jsonl";
979
+ const auditRecord = {
980
+ type: "audit",
981
+ schemaVersion: 1,
982
+ runId,
983
+ timestamp: new Date().toISOString(),
984
+ durationMs: Date.now() - runStartedAt,
985
+ score: summary.score,
986
+ pageCount: summary.pageCount,
987
+ findingCount: summary.findings.length,
988
+ ...(summary.rawFindingCount !== undefined && { rawFindingCount: summary.rawFindingCount }),
989
+ ...(summary.templateDetected !== undefined && { templateDetected: summary.templateDetected }),
990
+ ...(summary.cacheStats && { cacheStats: summary.cacheStats }),
991
+ ...(summary.triage && {
992
+ triage: {
993
+ success: true,
994
+ rootCauseCount: summary.triage.rootCauses.length,
995
+ providerId: summary.triage.providerId,
996
+ modelId: summary.triage.modelUsed,
997
+ cacheHit: summary.triage.cacheHit,
998
+ tokenUsage: summary.triage.tokenUsage,
999
+ ...(summary.triage.estimatedCostUsd !== undefined && {
1000
+ estimatedCostUsd: summary.triage.estimatedCostUsd,
1001
+ }),
1002
+ truncatedInput: summary.triage.truncatedInput,
1003
+ },
1004
+ }),
1005
+ ...(!summary.triage && triageAttempt && {
1006
+ triage: {
1007
+ success: false,
1008
+ skipReason: triageAttempt.skipReason,
1009
+ rootCauseCount: 0,
1010
+ providerId: triageAttempt.providerId,
1011
+ modelId: triageAttempt.modelId,
1012
+ cacheHit: false,
1013
+ tokenUsage: { input: 0, output: 0 },
1014
+ truncatedInput: false,
1015
+ },
1016
+ }),
1017
+ };
1018
+ await appendTelemetryRecord(telemetryPath, auditRecord);
1019
+ // Feedback: only if triage ran
1020
+ if (summary.triage) {
1021
+ let rating;
1022
+ if (options.telemetry.feedback) {
1023
+ rating = options.telemetry.feedback;
1024
+ }
1025
+ else if (options.telemetry.prompt !== false) {
1026
+ rating = await promptTriageFeedback();
1027
+ }
1028
+ if (rating) {
1029
+ const feedbackRecord = {
1030
+ type: "feedback",
1031
+ schemaVersion: 1,
1032
+ runId,
1033
+ timestamp: new Date().toISOString(),
1034
+ rating,
1035
+ };
1036
+ await appendTelemetryRecord(telemetryPath, feedbackRecord);
1037
+ }
1038
+ }
1039
+ }
1040
+ const aiHintEnabled = options?.ai?.suggest !== false;
1041
+ if (aiHintEnabled && !options?.ai?.enabled && process.env.ANTHROPIC_API_KEY) {
1042
+ console.error(`💡 AI triage available — re-run with --ai to prioritize ${summary.findings.length} findings into a fix list.`);
1043
+ }
1044
+ return summary;
729
1045
  }
730
1046
  //# sourceMappingURL=auditor.js.map