@pseolint/core 0.5.15 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/algorithms/wikipedia-paraphrase.d.ts +23 -0
  2. package/dist/algorithms/wikipedia-paraphrase.d.ts.map +1 -0
  3. package/dist/algorithms/wikipedia-paraphrase.js +82 -0
  4. package/dist/algorithms/wikipedia-paraphrase.js.map +1 -0
  5. package/dist/auditor.d.ts.map +1 -1
  6. package/dist/auditor.js +142 -5
  7. package/dist/auditor.js.map +1 -1
  8. package/dist/formatters/console.d.ts +16 -0
  9. package/dist/formatters/console.d.ts.map +1 -1
  10. package/dist/formatters/console.js +34 -7
  11. package/dist/formatters/console.js.map +1 -1
  12. package/dist/formatters/html.d.ts +7 -1
  13. package/dist/formatters/html.d.ts.map +1 -1
  14. package/dist/formatters/html.js +28 -2
  15. package/dist/formatters/html.js.map +1 -1
  16. package/dist/formatters/markdown.d.ts +10 -1
  17. package/dist/formatters/markdown.d.ts.map +1 -1
  18. package/dist/formatters/markdown.js +33 -6
  19. package/dist/formatters/markdown.js.map +1 -1
  20. package/dist/formatters/template-cards.d.ts +44 -0
  21. package/dist/formatters/template-cards.d.ts.map +1 -0
  22. package/dist/formatters/template-cards.js +191 -0
  23. package/dist/formatters/template-cards.js.map +1 -0
  24. package/dist/rules/content/common-phrase-reuse.d.ts +10 -0
  25. package/dist/rules/content/common-phrase-reuse.d.ts.map +1 -0
  26. package/dist/rules/content/common-phrase-reuse.js +132 -0
  27. package/dist/rules/content/common-phrase-reuse.js.map +1 -0
  28. package/dist/rules/content/value-add.d.ts +5 -2
  29. package/dist/rules/content/value-add.d.ts.map +1 -1
  30. package/dist/rules/content/value-add.js +29 -5
  31. package/dist/rules/content/value-add.js.map +1 -1
  32. package/dist/rules/content/wikipedia-paraphrase.d.ts +15 -0
  33. package/dist/rules/content/wikipedia-paraphrase.d.ts.map +1 -0
  34. package/dist/rules/content/wikipedia-paraphrase.js +39 -0
  35. package/dist/rules/content/wikipedia-paraphrase.js.map +1 -0
  36. package/dist/types.d.ts +23 -0
  37. package/dist/types.d.ts.map +1 -1
  38. package/package.json +92 -92
@@ -0,0 +1,23 @@
1
+ /**
2
+ * wikipedia-paraphrase.ts
3
+ *
4
+ * Loads the prebuilt Wikipedia trigram bloom filter and provides a
5
+ * paraphrase-rate estimator. Used by content/wikipedia-paraphrase rule.
6
+ *
7
+ * Bloom filter layout (data/wikipedia-trigrams.bin):
8
+ * m = 65536 bits (8192 bytes)
9
+ * k = 3 FNV-1a-32 hash functions with distinct seeds
10
+ *
11
+ * FP rate ~5% for the curated corpus of ~10 k unique trigrams.
12
+ */
13
+ /** Load and cache the bloom filter binary. */
14
+ export declare function loadWikipediaBloomFilter(): Uint8Array;
15
+ /**
16
+ * Compute the fraction of the text's trigrams that match the Wikipedia
17
+ * bloom filter. Returns a 0-1 ratio. Returns 0 for text with fewer than
18
+ * 3 words (no trigrams extractable).
19
+ */
20
+ export declare function wikipediaParaphraseRate(text: string): number;
21
+ /** Reset the in-memory cache. For testing purposes only. */
22
+ export declare function _resetBloomCache(): void;
23
+ //# sourceMappingURL=wikipedia-paraphrase.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"wikipedia-paraphrase.d.ts","sourceRoot":"","sources":["../../src/algorithms/wikipedia-paraphrase.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAyBH,8CAA8C;AAC9C,wBAAgB,wBAAwB,IAAI,UAAU,CAKrD;AAuBD;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAU5D;AAED,4DAA4D;AAC5D,wBAAgB,gBAAgB,IAAI,IAAI,CAEvC"}
@@ -0,0 +1,82 @@
1
+ /**
2
+ * wikipedia-paraphrase.ts
3
+ *
4
+ * Loads the prebuilt Wikipedia trigram bloom filter and provides a
5
+ * paraphrase-rate estimator. Used by content/wikipedia-paraphrase rule.
6
+ *
7
+ * Bloom filter layout (data/wikipedia-trigrams.bin):
8
+ * m = 65536 bits (8192 bytes)
9
+ * k = 3 FNV-1a-32 hash functions with distinct seeds
10
+ *
11
+ * FP rate ~5% for the curated corpus of ~10 k unique trigrams.
12
+ */
13
+ import { readFileSync } from "node:fs";
14
+ import { join, dirname } from "node:path";
15
+ import { fileURLToPath } from "node:url";
16
+ const __dirname = dirname(fileURLToPath(import.meta.url));
17
+ const DATA_PATH = join(__dirname, "..", "..", "data", "wikipedia-trigrams.bin");
18
+ const BLOOM_BITS = 65536;
19
+ const BLOOM_K = 3;
20
+ const FNV_PRIME = 0x01000193;
21
+ const FNV_SEEDS = [0x811c9dc5, 0x6b43a9b5, 0x29f7b4e3];
22
+ function fnv1a32(str, seed) {
23
+ let hash = seed >>> 0;
24
+ for (let i = 0; i < str.length; i++) {
25
+ hash ^= str.charCodeAt(i);
26
+ hash = Math.imul(hash, FNV_PRIME) >>> 0;
27
+ }
28
+ return hash;
29
+ }
30
+ let _cache = null;
31
+ /** Load and cache the bloom filter binary. */
32
+ export function loadWikipediaBloomFilter() {
33
+ if (_cache !== null)
34
+ return _cache;
35
+ const buf = readFileSync(DATA_PATH);
36
+ _cache = new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength);
37
+ return _cache;
38
+ }
39
+ function bloomQuery(bits, trigram) {
40
+ for (let k = 0; k < BLOOM_K; k++) {
41
+ const pos = fnv1a32(trigram, FNV_SEEDS[k]) % BLOOM_BITS;
42
+ if ((bits[pos >> 3] & (1 << (pos & 7))) === 0)
43
+ return false;
44
+ }
45
+ return true;
46
+ }
47
+ function extractTrigrams(text) {
48
+ const tokens = text
49
+ .toLowerCase()
50
+ .replace(/[^\p{L}\p{N}\s]+/gu, " ")
51
+ .split(/\s+/)
52
+ .filter(Boolean);
53
+ const trigrams = [];
54
+ for (let i = 0; i <= tokens.length - 3; i++) {
55
+ trigrams.push(`${tokens[i]} ${tokens[i + 1]} ${tokens[i + 2]}`);
56
+ }
57
+ return trigrams;
58
+ }
59
+ /**
60
+ * Compute the fraction of the text's trigrams that match the Wikipedia
61
+ * bloom filter. Returns a 0-1 ratio. Returns 0 for text with fewer than
62
+ * 3 words (no trigrams extractable).
63
+ */
64
+ export function wikipediaParaphraseRate(text) {
65
+ if (!text || text.trim().length === 0)
66
+ return 0;
67
+ const trigrams = extractTrigrams(text);
68
+ if (trigrams.length === 0)
69
+ return 0;
70
+ const bits = loadWikipediaBloomFilter();
71
+ let matched = 0;
72
+ for (const t of trigrams) {
73
+ if (bloomQuery(bits, t))
74
+ matched++;
75
+ }
76
+ return matched / trigrams.length;
77
+ }
78
+ /** Reset the in-memory cache. For testing purposes only. */
79
+ export function _resetBloomCache() {
80
+ _cache = null;
81
+ }
82
+ //# sourceMappingURL=wikipedia-paraphrase.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"wikipedia-paraphrase.js","sourceRoot":"","sources":["../../src/algorithms/wikipedia-paraphrase.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAEzC,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC1D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,wBAAwB,CAAC,CAAC;AAEhF,MAAM,UAAU,GAAG,KAAK,CAAC;AACzB,MAAM,OAAO,GAAG,CAAC,CAAC;AAClB,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,SAAS,GAAG,CAAC,UAAU,EAAE,UAAU,EAAE,UAAU,CAAC,CAAC;AAEvD,SAAS,OAAO,CAAC,GAAW,EAAE,IAAY;IACxC,IAAI,IAAI,GAAG,IAAI,KAAK,CAAC,CAAC;IACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,IAAI,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC1B,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,SAAS,CAAC,KAAK,CAAC,CAAC;IAC1C,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,IAAI,MAAM,GAAsB,IAAI,CAAC;AAErC,8CAA8C;AAC9C,MAAM,UAAU,wBAAwB;IACtC,IAAI,MAAM,KAAK,IAAI;QAAE,OAAO,MAAM,CAAC;IACnC,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;IACpC,MAAM,GAAG,IAAI,UAAU,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,UAAU,CAAC,CAAC;IACpE,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,UAAU,CAAC,IAAgB,EAAE,OAAe;IACnD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,OAAO,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC;QACxD,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;YAAE,OAAO,KAAK,CAAC;IAC9D,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,IAAY;IACnC,MAAM,MAAM,GAAG,IAAI;SAChB,WAAW,EAAE;SACb,OAAO,CAAC,oBAAoB,EAAE,GAAG,CAAC;SAClC,KAAK,CAAC,KAAK,CAAC;SACZ,MAAM,CAAC,OAAO,CAAC,CAAC;IACnB,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5C,QAAQ,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;IAClE,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,uBAAuB,CAAC,IAAY;IAClD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,wBAAwB,EAAE,CAAC;IACxC,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;QACzB,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;YAAE,OAAO,EAAE,CAAC;IACrC,CAAC;IACD,OAAO,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC;AACnC,CAAC;AAED,4DAA4D;AAC5D,MAAM,UAAU,gBAAgB;IAC9B,MAAM,GAAG,IAAI,CAAC;AAChB,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAiEA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAaZ,UAAU,EAIX,MAAM,YAAY,CAAC;AAQpB,OAAO,EAA8D,KAAK,kBAAkB,EAAiB,MAAM,sBAAsB,CAAC;AAyzB1I;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,UAAU,EAAE,EACtB,cAAc,EAAE,kBAAkB,GAAG,SAAS,GAC7C,UAAU,EAAE,CAed;AA+WD,wBAAgB,2BAA2B,CAAC,GAAG,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAgBjG;AA2fD,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CA6/B/F"}
1
+ {"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAmEA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAaZ,UAAU,EAIX,MAAM,YAAY,CAAC;AAQpB,OAAO,EAA8D,KAAK,kBAAkB,EAAiB,MAAM,sBAAsB,CAAC;AAm0B1I;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,UAAU,EAAE,EACtB,cAAc,EAAE,kBAAkB,GAAG,SAAS,GAC7C,UAAU,EAAE,CAed;AA+WD,wBAAgB,2BAA2B,CAAC,GAAG,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAgBjG;AAqhBD,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CA4lC/F"}
package/dist/auditor.js CHANGED
@@ -26,6 +26,8 @@ import { headingStructureRule } from "./rules/content/heading-structure.js";
26
26
  import { imageAltTextRule } from "./rules/content/image-alt-text.js";
27
27
  import { translationNoOpRule } from "./rules/content/translation-no-op.js";
28
28
  import { regurgitatedContentRule } from "./rules/content/regurgitated-content.js";
29
+ import { commonPhraseReuseRule } from "./rules/content/common-phrase-reuse.js";
30
+ import { wikipediaParaphraseRule } from "./rules/content/wikipedia-paraphrase.js";
29
31
  import { valueAddRule } from "./rules/content/value-add.js";
30
32
  import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
31
33
  import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
@@ -69,7 +71,7 @@ import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from
69
71
  import { CORE_RULESET_VERSION } from "./ruleset-version.js";
70
72
  import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
71
73
  import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring } from "./template-detection.js";
72
- import { scoreTemplates } from "./per-template-scoring.js";
74
+ import { scoreTemplates, siteVerdictFromTemplates } from "./per-template-scoring.js";
73
75
  const DEFAULTS = {
74
76
  nearDuplicateThreshold: 0.85,
75
77
  entitySwapThreshold: 0.95,
@@ -397,6 +399,10 @@ const RULE_IMPACTS = {
397
399
  "content/translation-no-op": { baseImpact: 30, perInstance: 10, maxImpact: 60 },
398
400
  // v1 warning-severity heuristic; lower than translation-no-op since it's speculative
399
401
  "content/regurgitated-content": { baseImpact: 15, perInstance: 5, maxImpact: 35 },
402
+ // v0.5.11 warning/low-confidence cliché density detector; lower than regurgitated-content
403
+ "content/common-phrase-reuse": { baseImpact: 12, perInstance: 4, maxImpact: 30 },
404
+ // v0.5.14 speculative/warning Wikipedia trigram overlap; lower than common-phrase-reuse
405
+ "content/wikipedia-paraphrase": { baseImpact: 10, perInstance: 3, maxImpact: 25 },
400
406
  // v0.5.8 composite per-page quality synthesis
401
407
  "content/value-add": { baseImpact: 25, perInstance: 8, maxImpact: 50 },
402
408
  // Tech — softened in v0.4.3-rc2 after dogfood showed nextjs.org regressing
@@ -639,6 +645,12 @@ sampled = false) {
639
645
  if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
640
646
  findings.push(...tag(regurgitatedContentRule(pages)));
641
647
  }
648
+ if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
649
+ findings.push(...tag(commonPhraseReuseRule(pages)));
650
+ }
651
+ if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
652
+ findings.push(...tag(wikipediaParaphraseRule(pages)));
653
+ }
642
654
  // Link rules — use the global link graph
643
655
  if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
644
656
  findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
@@ -1529,6 +1541,31 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1529
1541
  return { pages: [{ url: resolved, html: await readFile(resolved, "utf-8") }] };
1530
1542
  }
1531
1543
  if (sourceStat.isDirectory()) {
1544
+ // v0.5.15: if the directory contains _manifest.json, restore original URLs
1545
+ // from the fixture manifest. Without a manifest (or unparseable JSON), the
1546
+ // engine falls back to file-path URLs (existing behaviour for arbitrary
1547
+ // HTML directories). Missing files listed in a valid manifest are
1548
+ // propagated as errors — a stale manifest is a programmer error, not
1549
+ // a soft condition.
1550
+ const manifestPath = join(resolved, "_manifest.json");
1551
+ let hasManifest = false;
1552
+ let manifest = null;
1553
+ try {
1554
+ const raw = await readFile(manifestPath, "utf-8");
1555
+ manifest = JSON.parse(raw);
1556
+ hasManifest = true;
1557
+ }
1558
+ catch {
1559
+ // No manifest file or invalid JSON — fall through to path-based loading
1560
+ }
1561
+ if (hasManifest && manifest !== null) {
1562
+ // Propagate missing-file errors (fail-loud: stale manifests must be noticed)
1563
+ const pages = await Promise.all(Object.entries(manifest).map(async ([originalUrl, relPath]) => ({
1564
+ url: originalUrl,
1565
+ html: await readFile(join(resolved, relPath), "utf-8"),
1566
+ })));
1567
+ return { pages };
1568
+ }
1532
1569
  const htmlFiles = await collectHtmlFiles(resolved);
1533
1570
  const pages = await Promise.all(htmlFiles.map(async (filePath) => ({
1534
1571
  url: filePath,
@@ -1556,7 +1593,11 @@ export async function auditSource(source, options) {
1556
1593
  const skipBoilerplate = options?.skipBoilerplate ?? false;
1557
1594
  const skipSearchPages = options?.skipSearchPages ?? false;
1558
1595
  const skipEmptyBody = options?.skipEmptyBody ?? false;
1559
- const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
1596
+ // v0.5.12: when pinnedUrls is non-empty, sampleSize is irrelevant — the
1597
+ // pinned list IS the sample. Force to 0 so the post-fetch sampling step
1598
+ // is a no-op and all pinned pages pass through untruncated.
1599
+ const hasPinnedUrlsEarly = Array.isArray(options?.pinnedUrls) && options.pinnedUrls.length > 0;
1600
+ const sampleSize = hasPinnedUrlsEarly ? 0 : (options?.sampleSize ?? preset.sampleSize ?? 0);
1560
1601
  const externalSignal = options?.signal;
1561
1602
  const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
1562
1603
  const respectRobotsTxt = options?.respectRobotsTxt ?? preset.respectRobotsTxt ?? true;
@@ -1714,7 +1755,84 @@ export async function auditSource(source, options) {
1714
1755
  if (!priorState && options?.state?.since) {
1715
1756
  console.error("no prior state found — performing full baseline audit");
1716
1757
  }
1717
- const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, sitemapLastmodByUrl, discoveredUrlCount, scrapePlan } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
1758
+ // v0.5.12 pinnedUrls fast path: bypass sitemap discovery + random sampling
1759
+ // entirely. Only fetch the caller-specified URLs. Validated same-origin for
1760
+ // HTTP sources. Filesystem sources treat pinnedUrls as absolute paths.
1761
+ let loadedPagesRaw;
1762
+ let sitemapUrlSet;
1763
+ let sitemapLastmodByUrl;
1764
+ let discoveredUrlCount;
1765
+ let scrapePlan;
1766
+ if (hasPinnedUrlsEarly) {
1767
+ const pinned = options.pinnedUrls;
1768
+ // Validate same-origin for HTTP sources
1769
+ if (/^https?:\/\//i.test(source)) {
1770
+ let sourceOriginStr;
1771
+ try {
1772
+ sourceOriginStr = new URL(source).origin;
1773
+ }
1774
+ catch {
1775
+ throw new Error(`pinnedUrls: source URL is not a valid URL: ${source}`);
1776
+ }
1777
+ for (const u of pinned) {
1778
+ let pinnedOrigin;
1779
+ try {
1780
+ pinnedOrigin = new URL(u).origin;
1781
+ }
1782
+ catch {
1783
+ throw new Error(`pinnedUrls: "${u}" is not a valid absolute URL`);
1784
+ }
1785
+ if (pinnedOrigin !== sourceOriginStr) {
1786
+ throw new Error(`pinnedUrls: cross-origin URL rejected. Source origin is "${sourceOriginStr}" but pinned URL "${u}" has origin "${pinnedOrigin}". All pinnedUrls must be same-origin as the source.`);
1787
+ }
1788
+ }
1789
+ }
1790
+ // Fetch pinned URLs directly — no sitemap fetch, no sampling
1791
+ const ssrfCache = new Map();
1792
+ const validateHopPinned = guardSsrf
1793
+ ? async (u) => {
1794
+ let host;
1795
+ try {
1796
+ host = new URL(u).hostname;
1797
+ }
1798
+ catch {
1799
+ throw new Error(`Refusing to fetch invalid URL: ${u}`);
1800
+ }
1801
+ let pending = ssrfCache.get(host);
1802
+ if (!pending) {
1803
+ pending = validateTargetHost(host).catch((err) => {
1804
+ if (err instanceof SSRFError)
1805
+ throw new Error(`Refusing to fetch ${u}: ${err.reason}`);
1806
+ throw err;
1807
+ });
1808
+ ssrfCache.set(host, pending);
1809
+ }
1810
+ await pending;
1811
+ }
1812
+ : undefined;
1813
+ const pinnedPages = [];
1814
+ await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
1815
+ const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
1816
+ if (result) {
1817
+ fetchByteBudget.used += result.html.length;
1818
+ pinnedPages.push(result);
1819
+ }
1820
+ });
1821
+ loadedPagesRaw = pinnedPages;
1822
+ // No sitemap context in pinned mode
1823
+ sitemapUrlSet = undefined;
1824
+ sitemapLastmodByUrl = undefined;
1825
+ discoveredUrlCount = undefined;
1826
+ scrapePlan = undefined;
1827
+ }
1828
+ else {
1829
+ const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
1830
+ loadedPagesRaw = loaded.pages;
1831
+ sitemapUrlSet = loaded.sitemapUrls;
1832
+ sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
1833
+ discoveredUrlCount = loaded.discoveredUrlCount;
1834
+ scrapePlan = loaded.scrapePlan;
1835
+ }
1718
1836
  // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
1719
1837
  // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
1720
1838
  // (kept for back-compat with --since consumers); T7 will carry their prior
@@ -1963,7 +2081,15 @@ export async function auditSource(source, options) {
1963
2081
  continue;
1964
2082
  const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
1965
2083
  const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
1966
- const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full", isSampledAudit);
2084
+ const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full",
2085
+ // 2026-05-06 calibration fix: pinnedUrls mode fetches a hand-picked subset
2086
+ // of the full site — the link graph across those pages is structurally
2087
+ // incomplete, just like a random-sampled crawl. Pass `true` so
2088
+ // links/unreachable-from-root skips its check rather than emitting
2089
+ // sampling-artifact false positives (22/25 Wise pages flagged "unreachable"
2090
+ // because the nav paths between locale-specific currency-converter URLs
2091
+ // were not in the pinned set).
2092
+ isSampledAudit || hasPinnedUrlsEarly);
1967
2093
  allFindings.push(...findings);
1968
2094
  groupPageCounts[groupName] = groupPages.length;
1969
2095
  // v0.4.3: per-group scoring uses the same site-classification profile so
@@ -2061,7 +2187,14 @@ export async function auditSource(source, options) {
2061
2187
  const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings, siteClassification, parsedPages.length);
2062
2188
  const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
2063
2189
  const issues = bucketIssues(enriched.findings);
2064
- const verdict = shiftVerdictForAuthority(verdictForRisk(risk), options?.authorityScore);
2190
+ // v0.6.0 — spec §15.1: site verdict comes from siteVerdictFromTemplates when
2191
+ // ≥1 template has ≥5% coverage. Falls back to the legacy risk-ladder verdict
2192
+ // when no template meets the threshold (single-template sites, `unclear`/
2193
+ // `small-marketing` classifications, or the long-tail-only case).
2194
+ // The `risk` score is intentionally unchanged — §15.1 governs verdict only.
2195
+ const legacyVerdict = shiftVerdictForAuthority(verdictForRisk(risk), options?.authorityScore);
2196
+ const templateVerdict = siteVerdictFromTemplates(siteTemplates);
2197
+ const verdict = templateVerdict !== null ? templateVerdict : legacyVerdict;
2065
2198
  const headline = buildHeadline(bucketCounts);
2066
2199
  // audit/* findings are diagnostic-only and never appear in summary.issues.
2067
2200
  // Surface them under diagnostics so consumers (telemetry, debug UIs) can
@@ -2094,6 +2227,10 @@ export async function auditSource(source, options) {
2094
2227
  pageCount: auditedPageCount || parsedPages.length,
2095
2228
  templateDetected: enriched.templateDetected,
2096
2229
  rawFindingCount: enriched.rawFindingCount,
2230
+ // v0.5.12 — sorted list of audited page URLs for --repin capture
2231
+ auditedUrls: parsedPages.length > 0
2232
+ ? [...parsedPages.map((p) => p.url)].sort()
2233
+ : undefined,
2097
2234
  };
2098
2235
  if (cacheConfig) {
2099
2236
  summary.cacheStats = cacheStats;