@pseolint/core 0.5.15 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/algorithms/wikipedia-paraphrase.d.ts +23 -0
- package/dist/algorithms/wikipedia-paraphrase.d.ts.map +1 -0
- package/dist/algorithms/wikipedia-paraphrase.js +82 -0
- package/dist/algorithms/wikipedia-paraphrase.js.map +1 -0
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +147 -5
- package/dist/auditor.js.map +1 -1
- package/dist/formatters/console.d.ts +16 -0
- package/dist/formatters/console.d.ts.map +1 -1
- package/dist/formatters/console.js +34 -7
- package/dist/formatters/console.js.map +1 -1
- package/dist/formatters/html.d.ts +7 -1
- package/dist/formatters/html.d.ts.map +1 -1
- package/dist/formatters/html.js +28 -2
- package/dist/formatters/html.js.map +1 -1
- package/dist/formatters/markdown.d.ts +10 -1
- package/dist/formatters/markdown.d.ts.map +1 -1
- package/dist/formatters/markdown.js +33 -6
- package/dist/formatters/markdown.js.map +1 -1
- package/dist/formatters/template-cards.d.ts +44 -0
- package/dist/formatters/template-cards.d.ts.map +1 -0
- package/dist/formatters/template-cards.js +191 -0
- package/dist/formatters/template-cards.js.map +1 -0
- package/dist/rules/content/common-phrase-reuse.d.ts +10 -0
- package/dist/rules/content/common-phrase-reuse.d.ts.map +1 -0
- package/dist/rules/content/common-phrase-reuse.js +132 -0
- package/dist/rules/content/common-phrase-reuse.js.map +1 -0
- package/dist/rules/content/value-add.d.ts +5 -2
- package/dist/rules/content/value-add.d.ts.map +1 -1
- package/dist/rules/content/value-add.js +29 -5
- package/dist/rules/content/value-add.js.map +1 -1
- package/dist/rules/content/wikipedia-paraphrase.d.ts +15 -0
- package/dist/rules/content/wikipedia-paraphrase.d.ts.map +1 -0
- package/dist/rules/content/wikipedia-paraphrase.js +39 -0
- package/dist/rules/content/wikipedia-paraphrase.js.map +1 -0
- package/dist/types.d.ts +39 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +92 -92
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* wikipedia-paraphrase.ts
|
|
3
|
+
*
|
|
4
|
+
* Loads the prebuilt Wikipedia trigram bloom filter and provides a
|
|
5
|
+
* paraphrase-rate estimator. Used by content/wikipedia-paraphrase rule.
|
|
6
|
+
*
|
|
7
|
+
* Bloom filter layout (data/wikipedia-trigrams.bin):
|
|
8
|
+
* m = 65536 bits (8192 bytes)
|
|
9
|
+
* k = 3 FNV-1a-32 hash functions with distinct seeds
|
|
10
|
+
*
|
|
11
|
+
* FP rate ~5% for the curated corpus of ~10 k unique trigrams.
|
|
12
|
+
*/
|
|
13
|
+
/** Load and cache the bloom filter binary. */
|
|
14
|
+
export declare function loadWikipediaBloomFilter(): Uint8Array;
|
|
15
|
+
/**
|
|
16
|
+
* Compute the fraction of the text's trigrams that match the Wikipedia
|
|
17
|
+
* bloom filter. Returns a 0-1 ratio. Returns 0 for text with fewer than
|
|
18
|
+
* 3 words (no trigrams extractable).
|
|
19
|
+
*/
|
|
20
|
+
export declare function wikipediaParaphraseRate(text: string): number;
|
|
21
|
+
/** Reset the in-memory cache. For testing purposes only. */
|
|
22
|
+
export declare function _resetBloomCache(): void;
|
|
23
|
+
//# sourceMappingURL=wikipedia-paraphrase.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wikipedia-paraphrase.d.ts","sourceRoot":"","sources":["../../src/algorithms/wikipedia-paraphrase.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAyBH,8CAA8C;AAC9C,wBAAgB,wBAAwB,IAAI,UAAU,CAKrD;AAuBD;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAU5D;AAED,4DAA4D;AAC5D,wBAAgB,gBAAgB,IAAI,IAAI,CAEvC"}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* wikipedia-paraphrase.ts
|
|
3
|
+
*
|
|
4
|
+
* Loads the prebuilt Wikipedia trigram bloom filter and provides a
|
|
5
|
+
* paraphrase-rate estimator. Used by content/wikipedia-paraphrase rule.
|
|
6
|
+
*
|
|
7
|
+
* Bloom filter layout (data/wikipedia-trigrams.bin):
|
|
8
|
+
* m = 65536 bits (8192 bytes)
|
|
9
|
+
* k = 3 FNV-1a-32 hash functions with distinct seeds
|
|
10
|
+
*
|
|
11
|
+
* FP rate ~5% for the curated corpus of ~10 k unique trigrams.
|
|
12
|
+
*/
|
|
13
|
+
import { readFileSync } from "node:fs";
|
|
14
|
+
import { join, dirname } from "node:path";
|
|
15
|
+
import { fileURLToPath } from "node:url";
|
|
16
|
+
// ES modules have no built-in __dirname, so derive this module's directory
// from its own file URL.
const modulePath = fileURLToPath(import.meta.url);
const __dirname = dirname(modulePath);
// Path of the prebuilt trigram filter: two directories above this module,
// inside `data/`.
const DATA_PATH = join(__dirname, "..", "..", "data", "wikipedia-trigrams.bin");
|
|
18
|
+
/** Bloom filter size in bits (m): 2^16 = 65536 bits, i.e. 8192 bytes. */
const BLOOM_BITS = 1 << 16;
/** Number of hash probes performed per membership query (k). */
const BLOOM_K = 3;
|
|
20
|
+
/** 32-bit FNV prime used as the per-step multiplier. */
const FNV_PRIME = 0x01000193;
/**
 * Initial hash states, one per bloom probe. The first entry is the standard
 * FNV-1a 32-bit offset basis; the others give independent hash functions.
 */
const FNV_SEEDS = [0x811c9dc5, 0x6b43a9b5, 0x29f7b4e3];
/**
 * Seeded 32-bit FNV-1a hash.
 *
 * Each UTF-16 code unit of `str` is XOR-ed into the running state, which is
 * then multiplied by FNV_PRIME modulo 2^32.
 *
 * @param str  Text to hash.
 * @param seed Initial hash state; coerced to an unsigned 32-bit integer.
 * @returns The hash as an unsigned 32-bit integer (`seed >>> 0` for "").
 */
function fnv1a32(str, seed) {
    let state = seed >>> 0;
    let idx = 0;
    while (idx < str.length) {
        // Math.imul performs the 32-bit wrapping multiply; `>>> 0` re-interprets
        // the signed result as unsigned.
        state = Math.imul(state ^ str.charCodeAt(idx), FNV_PRIME) >>> 0;
        idx += 1;
    }
    return state;
}
|
|
30
|
+
/** Memoised filter bytes; `null` until the first successful load. */
let _cache = null;
/**
 * Load the bloom filter binary from DATA_PATH and cache it.
 *
 * The file is read synchronously on the first call; subsequent calls return
 * the same Uint8Array until the cache is cleared.
 *
 * @returns The filter's bytes.
 * @throws Whatever `readFileSync` throws when the file cannot be read, or an
 *   Error when the file holds fewer than BLOOM_BITS / 8 bytes.
 */
export function loadWikipediaBloomFilter() {
    if (_cache !== null)
        return _cache;
    const buf = readFileSync(DATA_PATH);
    // Probe positions range over [0, BLOOM_BITS). A shorter file would make
    // out-of-range probes read `undefined`, which the query treats as
    // "bit not set" and silently under-reports matches. Fail loudly instead.
    const requiredBytes = Math.ceil(BLOOM_BITS / 8);
    if (buf.byteLength < requiredBytes) {
        throw new Error(`Wikipedia bloom filter at ${DATA_PATH} is truncated: expected at least ${requiredBytes} bytes, got ${buf.byteLength}`);
    }
    // View over the same memory as the Buffer, typed as a plain Uint8Array.
    _cache = new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength);
    return _cache;
}
|
|
39
|
+
/**
 * Bloom-filter membership test for one trigram.
 *
 * @param bits    Filter bytes; bit `i` lives in byte `i >> 3` at mask `1 << (i & 7)`.
 * @param trigram Space-joined three-word string to look up.
 * @returns `false` as soon as any of the BLOOM_K probed bits is clear
 *   (definitely absent); `true` when every probed bit is set (possibly present).
 */
function bloomQuery(bits, trigram) {
    let probe = 0;
    while (probe < BLOOM_K) {
        const bitIndex = fnv1a32(trigram, FNV_SEEDS[probe]) % BLOOM_BITS;
        const byteValue = bits[bitIndex >> 3];
        const bitMask = 1 << (bitIndex & 7);
        if ((byteValue & bitMask) === 0) {
            return false;
        }
        probe += 1;
    }
    return true;
}
|
|
47
|
+
/**
 * Split text into overlapping word trigrams.
 *
 * The text is lower-cased, every run of characters that is not a Unicode
 * letter, Unicode number or whitespace is replaced by a single space, and the
 * result is split on whitespace.
 *
 * @param text Raw input text.
 * @returns One space-joined string per window of three consecutive words, in
 *   order of appearance; empty when there are fewer than three words.
 */
function extractTrigrams(text) {
    const normalized = text.toLowerCase().replace(/[^\p{L}\p{N}\s]+/gu, " ");
    const words = normalized.split(/\s+/).filter((word) => word.length > 0);
    const windowCount = Math.max(0, words.length - 2);
    return Array.from({ length: windowCount }, (_, start) => `${words[start]} ${words[start + 1]} ${words[start + 2]}`);
}
|
|
59
|
+
/**
 * Estimate how much of a text overlaps with the Wikipedia trigram corpus.
 *
 * Every word trigram of `text` is looked up in the bloom filter. The result
 * is the matched share of all trigrams, with repeated trigrams counted each
 * time they occur. Because a bloom filter can report false positives, the
 * ratio is an estimate rather than an exact overlap.
 *
 * @param text Text to analyse.
 * @returns A ratio in [0, 1]; 0 for empty or whitespace-only text and for
 *   text with fewer than 3 words (no trigrams extractable).
 */
export function wikipediaParaphraseRate(text) {
    const isBlank = !text || text.trim().length === 0;
    if (isBlank) {
        return 0;
    }
    const trigrams = extractTrigrams(text);
    const total = trigrams.length;
    if (total === 0) {
        return 0;
    }
    const filter = loadWikipediaBloomFilter();
    const hits = trigrams.reduce((count, trigram) => (bloomQuery(filter, trigram) ? count + 1 : count), 0);
    return hits / total;
}
|
|
78
|
+
/**
 * Drop the memoised bloom filter so the next load re-reads the data file.
 * Intended for tests only.
 */
export function _resetBloomCache() {
    _cache = null;
}
|
|
82
|
+
//# sourceMappingURL=wikipedia-paraphrase.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wikipedia-paraphrase.js","sourceRoot":"","sources":["../../src/algorithms/wikipedia-paraphrase.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAEzC,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC1D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,wBAAwB,CAAC,CAAC;AAEhF,MAAM,UAAU,GAAG,KAAK,CAAC;AACzB,MAAM,OAAO,GAAG,CAAC,CAAC;AAClB,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,SAAS,GAAG,CAAC,UAAU,EAAE,UAAU,EAAE,UAAU,CAAC,CAAC;AAEvD,SAAS,OAAO,CAAC,GAAW,EAAE,IAAY;IACxC,IAAI,IAAI,GAAG,IAAI,KAAK,CAAC,CAAC;IACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,IAAI,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC1B,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,SAAS,CAAC,KAAK,CAAC,CAAC;IAC1C,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,IAAI,MAAM,GAAsB,IAAI,CAAC;AAErC,8CAA8C;AAC9C,MAAM,UAAU,wBAAwB;IACtC,IAAI,MAAM,KAAK,IAAI;QAAE,OAAO,MAAM,CAAC;IACnC,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;IACpC,MAAM,GAAG,IAAI,UAAU,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,UAAU,CAAC,CAAC;IACpE,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,UAAU,CAAC,IAAgB,EAAE,OAAe;IACnD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,OAAO,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC;QACxD,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;YAAE,OAAO,KAAK,CAAC;IAC9D,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,IAAY;IACnC,MAAM,MAAM,GAAG,IAAI;SAChB,WAAW,EAAE;SACb,OAAO,CAAC,oBAAoB,EAAE,GAAG,CAAC;SAClC,KAAK,CAAC,KAAK,CAAC;SACZ,MAAM,CAAC,OAAO,CAAC,CAAC;IACnB,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5C,QAAQ,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,
CAAC,EAAE,CAAC,CAAC;IAClE,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,uBAAuB,CAAC,IAAY;IAClD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,wBAAwB,EAAE,CAAC;IACxC,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;QACzB,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;YAAE,OAAO,EAAE,CAAC;IACrC,CAAC;IACD,OAAO,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC;AACnC,CAAC;AAED,4DAA4D;AAC5D,MAAM,UAAU,gBAAgB;IAC9B,MAAM,GAAG,IAAI,CAAC;AAChB,CAAC"}
|
package/dist/auditor.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAmEA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAaZ,UAAU,EAIX,MAAM,YAAY,CAAC;AAQpB,OAAO,EAA8D,KAAK,kBAAkB,EAAiB,MAAM,sBAAsB,CAAC;AAm0B1I;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,UAAU,EAAE,EACtB,cAAc,EAAE,kBAAkB,GAAG,SAAS,GAC7C,UAAU,EAAE,CAed;AA+WD,wBAAgB,2BAA2B,CAAC,GAAG,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAgBjG;AAqhBD,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAimC/F"}
|
package/dist/auditor.js
CHANGED
|
@@ -26,6 +26,8 @@ import { headingStructureRule } from "./rules/content/heading-structure.js";
|
|
|
26
26
|
import { imageAltTextRule } from "./rules/content/image-alt-text.js";
|
|
27
27
|
import { translationNoOpRule } from "./rules/content/translation-no-op.js";
|
|
28
28
|
import { regurgitatedContentRule } from "./rules/content/regurgitated-content.js";
|
|
29
|
+
import { commonPhraseReuseRule } from "./rules/content/common-phrase-reuse.js";
|
|
30
|
+
import { wikipediaParaphraseRule } from "./rules/content/wikipedia-paraphrase.js";
|
|
29
31
|
import { valueAddRule } from "./rules/content/value-add.js";
|
|
30
32
|
import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
|
|
31
33
|
import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
|
|
@@ -69,7 +71,7 @@ import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from
|
|
|
69
71
|
import { CORE_RULESET_VERSION } from "./ruleset-version.js";
|
|
70
72
|
import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
|
|
71
73
|
import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring } from "./template-detection.js";
|
|
72
|
-
import { scoreTemplates } from "./per-template-scoring.js";
|
|
74
|
+
import { scoreTemplates, siteVerdictFromTemplates } from "./per-template-scoring.js";
|
|
73
75
|
const DEFAULTS = {
|
|
74
76
|
nearDuplicateThreshold: 0.85,
|
|
75
77
|
entitySwapThreshold: 0.95,
|
|
@@ -397,6 +399,10 @@ const RULE_IMPACTS = {
|
|
|
397
399
|
"content/translation-no-op": { baseImpact: 30, perInstance: 10, maxImpact: 60 },
|
|
398
400
|
// v1 warning-severity heuristic; lower than translation-no-op since it's speculative
|
|
399
401
|
"content/regurgitated-content": { baseImpact: 15, perInstance: 5, maxImpact: 35 },
|
|
402
|
+
// v0.5.11 warning/low-confidence cliché density detector; lower than regurgitated-content
|
|
403
|
+
"content/common-phrase-reuse": { baseImpact: 12, perInstance: 4, maxImpact: 30 },
|
|
404
|
+
// v0.5.14 speculative/warning Wikipedia trigram overlap; lower than common-phrase-reuse
|
|
405
|
+
"content/wikipedia-paraphrase": { baseImpact: 10, perInstance: 3, maxImpact: 25 },
|
|
400
406
|
// v0.5.8 composite per-page quality synthesis
|
|
401
407
|
"content/value-add": { baseImpact: 25, perInstance: 8, maxImpact: 50 },
|
|
402
408
|
// Tech — softened in v0.4.3-rc2 after dogfood showed nextjs.org regressing
|
|
@@ -639,6 +645,12 @@ sampled = false) {
|
|
|
639
645
|
if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
|
|
640
646
|
findings.push(...tag(regurgitatedContentRule(pages)));
|
|
641
647
|
}
|
|
648
|
+
if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
|
|
649
|
+
findings.push(...tag(commonPhraseReuseRule(pages)));
|
|
650
|
+
}
|
|
651
|
+
if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
|
|
652
|
+
findings.push(...tag(wikipediaParaphraseRule(pages)));
|
|
653
|
+
}
|
|
642
654
|
// Link rules — use the global link graph
|
|
643
655
|
if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
|
|
644
656
|
findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
|
|
@@ -1529,6 +1541,31 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1529
1541
|
return { pages: [{ url: resolved, html: await readFile(resolved, "utf-8") }] };
|
|
1530
1542
|
}
|
|
1531
1543
|
if (sourceStat.isDirectory()) {
|
|
1544
|
+
// v0.5.15: if the directory contains _manifest.json, restore original URLs
|
|
1545
|
+
// from the fixture manifest. Without a manifest (or unparseable JSON), the
|
|
1546
|
+
// engine falls back to file-path URLs (existing behaviour for arbitrary
|
|
1547
|
+
// HTML directories). Missing files listed in a valid manifest are
|
|
1548
|
+
// propagated as errors — a stale manifest is a programmer error, not
|
|
1549
|
+
// a soft condition.
|
|
1550
|
+
const manifestPath = join(resolved, "_manifest.json");
|
|
1551
|
+
let hasManifest = false;
|
|
1552
|
+
let manifest = null;
|
|
1553
|
+
try {
|
|
1554
|
+
const raw = await readFile(manifestPath, "utf-8");
|
|
1555
|
+
manifest = JSON.parse(raw);
|
|
1556
|
+
hasManifest = true;
|
|
1557
|
+
}
|
|
1558
|
+
catch {
|
|
1559
|
+
// No manifest file or invalid JSON — fall through to path-based loading
|
|
1560
|
+
}
|
|
1561
|
+
if (hasManifest && manifest !== null) {
|
|
1562
|
+
// Propagate missing-file errors (fail-loud: stale manifests must be noticed)
|
|
1563
|
+
const pages = await Promise.all(Object.entries(manifest).map(async ([originalUrl, relPath]) => ({
|
|
1564
|
+
url: originalUrl,
|
|
1565
|
+
html: await readFile(join(resolved, relPath), "utf-8"),
|
|
1566
|
+
})));
|
|
1567
|
+
return { pages };
|
|
1568
|
+
}
|
|
1532
1569
|
const htmlFiles = await collectHtmlFiles(resolved);
|
|
1533
1570
|
const pages = await Promise.all(htmlFiles.map(async (filePath) => ({
|
|
1534
1571
|
url: filePath,
|
|
@@ -1556,7 +1593,11 @@ export async function auditSource(source, options) {
|
|
|
1556
1593
|
const skipBoilerplate = options?.skipBoilerplate ?? false;
|
|
1557
1594
|
const skipSearchPages = options?.skipSearchPages ?? false;
|
|
1558
1595
|
const skipEmptyBody = options?.skipEmptyBody ?? false;
|
|
1559
|
-
|
|
1596
|
+
// v0.5.12: when pinnedUrls is non-empty, sampleSize is irrelevant — the
|
|
1597
|
+
// pinned list IS the sample. Force to 0 so the post-fetch sampling step
|
|
1598
|
+
// is a no-op and all pinned pages pass through untruncated.
|
|
1599
|
+
const hasPinnedUrlsEarly = Array.isArray(options?.pinnedUrls) && options.pinnedUrls.length > 0;
|
|
1600
|
+
const sampleSize = hasPinnedUrlsEarly ? 0 : (options?.sampleSize ?? preset.sampleSize ?? 0);
|
|
1560
1601
|
const externalSignal = options?.signal;
|
|
1561
1602
|
const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
|
|
1562
1603
|
const respectRobotsTxt = options?.respectRobotsTxt ?? preset.respectRobotsTxt ?? true;
|
|
@@ -1714,7 +1755,84 @@ export async function auditSource(source, options) {
|
|
|
1714
1755
|
if (!priorState && options?.state?.since) {
|
|
1715
1756
|
console.error("no prior state found — performing full baseline audit");
|
|
1716
1757
|
}
|
|
1717
|
-
|
|
1758
|
+
// v0.5.12 — pinnedUrls fast path: bypass sitemap discovery + random sampling
|
|
1759
|
+
// entirely. Only fetch the caller-specified URLs. Validated same-origin for
|
|
1760
|
+
// HTTP sources. Filesystem sources treat pinnedUrls as absolute paths.
|
|
1761
|
+
let loadedPagesRaw;
|
|
1762
|
+
let sitemapUrlSet;
|
|
1763
|
+
let sitemapLastmodByUrl;
|
|
1764
|
+
let discoveredUrlCount;
|
|
1765
|
+
let scrapePlan;
|
|
1766
|
+
if (hasPinnedUrlsEarly) {
|
|
1767
|
+
const pinned = options.pinnedUrls;
|
|
1768
|
+
// Validate same-origin for HTTP sources
|
|
1769
|
+
if (/^https?:\/\//i.test(source)) {
|
|
1770
|
+
let sourceOriginStr;
|
|
1771
|
+
try {
|
|
1772
|
+
sourceOriginStr = new URL(source).origin;
|
|
1773
|
+
}
|
|
1774
|
+
catch {
|
|
1775
|
+
throw new Error(`pinnedUrls: source URL is not a valid URL: ${source}`);
|
|
1776
|
+
}
|
|
1777
|
+
for (const u of pinned) {
|
|
1778
|
+
let pinnedOrigin;
|
|
1779
|
+
try {
|
|
1780
|
+
pinnedOrigin = new URL(u).origin;
|
|
1781
|
+
}
|
|
1782
|
+
catch {
|
|
1783
|
+
throw new Error(`pinnedUrls: "${u}" is not a valid absolute URL`);
|
|
1784
|
+
}
|
|
1785
|
+
if (pinnedOrigin !== sourceOriginStr) {
|
|
1786
|
+
throw new Error(`pinnedUrls: cross-origin URL rejected. Source origin is "${sourceOriginStr}" but pinned URL "${u}" has origin "${pinnedOrigin}". All pinnedUrls must be same-origin as the source.`);
|
|
1787
|
+
}
|
|
1788
|
+
}
|
|
1789
|
+
}
|
|
1790
|
+
// Fetch pinned URLs directly — no sitemap fetch, no sampling
|
|
1791
|
+
const ssrfCache = new Map();
|
|
1792
|
+
const validateHopPinned = guardSsrf
|
|
1793
|
+
? async (u) => {
|
|
1794
|
+
let host;
|
|
1795
|
+
try {
|
|
1796
|
+
host = new URL(u).hostname;
|
|
1797
|
+
}
|
|
1798
|
+
catch {
|
|
1799
|
+
throw new Error(`Refusing to fetch invalid URL: ${u}`);
|
|
1800
|
+
}
|
|
1801
|
+
let pending = ssrfCache.get(host);
|
|
1802
|
+
if (!pending) {
|
|
1803
|
+
pending = validateTargetHost(host).catch((err) => {
|
|
1804
|
+
if (err instanceof SSRFError)
|
|
1805
|
+
throw new Error(`Refusing to fetch ${u}: ${err.reason}`);
|
|
1806
|
+
throw err;
|
|
1807
|
+
});
|
|
1808
|
+
ssrfCache.set(host, pending);
|
|
1809
|
+
}
|
|
1810
|
+
await pending;
|
|
1811
|
+
}
|
|
1812
|
+
: undefined;
|
|
1813
|
+
const pinnedPages = [];
|
|
1814
|
+
await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
|
|
1815
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
|
|
1816
|
+
if (result) {
|
|
1817
|
+
fetchByteBudget.used += result.html.length;
|
|
1818
|
+
pinnedPages.push(result);
|
|
1819
|
+
}
|
|
1820
|
+
});
|
|
1821
|
+
loadedPagesRaw = pinnedPages;
|
|
1822
|
+
// No sitemap context in pinned mode
|
|
1823
|
+
sitemapUrlSet = undefined;
|
|
1824
|
+
sitemapLastmodByUrl = undefined;
|
|
1825
|
+
discoveredUrlCount = undefined;
|
|
1826
|
+
scrapePlan = undefined;
|
|
1827
|
+
}
|
|
1828
|
+
else {
|
|
1829
|
+
const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
|
|
1830
|
+
loadedPagesRaw = loaded.pages;
|
|
1831
|
+
sitemapUrlSet = loaded.sitemapUrls;
|
|
1832
|
+
sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
|
|
1833
|
+
discoveredUrlCount = loaded.discoveredUrlCount;
|
|
1834
|
+
scrapePlan = loaded.scrapePlan;
|
|
1835
|
+
}
|
|
1718
1836
|
// The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
|
|
1719
1837
|
// mode. Surface them in skippedUrls so they show up under summary.skippedUrls
|
|
1720
1838
|
// (kept for back-compat with --since consumers); T7 will carry their prior
|
|
@@ -1884,6 +2002,11 @@ export async function auditSource(source, options) {
|
|
|
1884
2002
|
// matters: a sampled crawl of a 5000-page directory must still classify
|
|
1885
2003
|
// as `programmatic-directory`, not `unclear`.
|
|
1886
2004
|
const classifierUrls = (() => {
|
|
2005
|
+
// v0.6.1 — explicit caller override takes priority. Lets calibration
|
|
2006
|
+
// fixtures audit a small sample but classify against full sitemap.
|
|
2007
|
+
if (options?.classifierUrls && options.classifierUrls.length > 0) {
|
|
2008
|
+
return [...options.classifierUrls];
|
|
2009
|
+
}
|
|
1887
2010
|
if (sitemapUrlSet && sitemapUrlSet.size > 0) {
|
|
1888
2011
|
return Array.from(sitemapUrlSet);
|
|
1889
2012
|
}
|
|
@@ -1963,7 +2086,15 @@ export async function auditSource(source, options) {
|
|
|
1963
2086
|
continue;
|
|
1964
2087
|
const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
|
|
1965
2088
|
const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
|
|
1966
|
-
const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full",
|
|
2089
|
+
const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full",
|
|
2090
|
+
// 2026-05-06 calibration fix: pinnedUrls mode fetches a hand-picked subset
|
|
2091
|
+
// of the full site — the link graph across those pages is structurally
|
|
2092
|
+
// incomplete, just like a random-sampled crawl. Pass `true` so
|
|
2093
|
+
// links/unreachable-from-root skips its check rather than emitting
|
|
2094
|
+
// sampling-artifact false positives (22/25 Wise pages flagged "unreachable"
|
|
2095
|
+
// because the nav paths between locale-specific currency-converter URLs
|
|
2096
|
+
// were not in the pinned set).
|
|
2097
|
+
isSampledAudit || hasPinnedUrlsEarly);
|
|
1967
2098
|
allFindings.push(...findings);
|
|
1968
2099
|
groupPageCounts[groupName] = groupPages.length;
|
|
1969
2100
|
// v0.4.3: per-group scoring uses the same site-classification profile so
|
|
@@ -2061,7 +2192,14 @@ export async function auditSource(source, options) {
|
|
|
2061
2192
|
const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings, siteClassification, parsedPages.length);
|
|
2062
2193
|
const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
|
|
2063
2194
|
const issues = bucketIssues(enriched.findings);
|
|
2064
|
-
|
|
2195
|
+
// v0.6.0 — spec §15.1: site verdict comes from siteVerdictFromTemplates when
|
|
2196
|
+
// ≥1 template has ≥5% coverage. Falls back to the legacy risk-ladder verdict
|
|
2197
|
+
// when no template meets the threshold (single-template sites, `unclear`/
|
|
2198
|
+
// `small-marketing` classifications, or the long-tail-only case).
|
|
2199
|
+
// The `risk` score is intentionally unchanged — §15.1 governs verdict only.
|
|
2200
|
+
const legacyVerdict = shiftVerdictForAuthority(verdictForRisk(risk), options?.authorityScore);
|
|
2201
|
+
const templateVerdict = siteVerdictFromTemplates(siteTemplates);
|
|
2202
|
+
const verdict = templateVerdict !== null ? templateVerdict : legacyVerdict;
|
|
2065
2203
|
const headline = buildHeadline(bucketCounts);
|
|
2066
2204
|
// audit/* findings are diagnostic-only and never appear in summary.issues.
|
|
2067
2205
|
// Surface them under diagnostics so consumers (telemetry, debug UIs) can
|
|
@@ -2094,6 +2232,10 @@ export async function auditSource(source, options) {
|
|
|
2094
2232
|
pageCount: auditedPageCount || parsedPages.length,
|
|
2095
2233
|
templateDetected: enriched.templateDetected,
|
|
2096
2234
|
rawFindingCount: enriched.rawFindingCount,
|
|
2235
|
+
// v0.5.12 — sorted list of audited page URLs for --repin capture
|
|
2236
|
+
auditedUrls: parsedPages.length > 0
|
|
2237
|
+
? [...parsedPages.map((p) => p.url)].sort()
|
|
2238
|
+
: undefined,
|
|
2097
2239
|
};
|
|
2098
2240
|
if (cacheConfig) {
|
|
2099
2241
|
summary.cacheStats = cacheStats;
|