@pseolint/core 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -1
- package/dist/ai/triage.d.ts.map +1 -1
- package/dist/ai/triage.js +8 -1
- package/dist/ai/triage.js.map +1 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +495 -130
- package/dist/auditor.js.map +1 -1
- package/dist/backpressure.d.ts +68 -0
- package/dist/backpressure.d.ts.map +1 -0
- package/dist/backpressure.js +81 -0
- package/dist/backpressure.js.map +1 -0
- package/dist/cache.d.ts +73 -0
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +258 -19
- package/dist/cache.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +1 -14
- package/dist/enrich-findings.js.map +1 -1
- package/dist/fetch-observer.d.ts +97 -0
- package/dist/fetch-observer.d.ts.map +1 -0
- package/dist/fetch-observer.js +124 -0
- package/dist/fetch-observer.js.map +1 -0
- package/dist/formatters/console.d.ts +7 -9
- package/dist/formatters/console.d.ts.map +1 -1
- package/dist/formatters/console.js +218 -254
- package/dist/formatters/console.js.map +1 -1
- package/dist/formatters/html.d.ts +5 -1
- package/dist/formatters/html.d.ts.map +1 -1
- package/dist/formatters/html.js +352 -570
- package/dist/formatters/html.js.map +1 -1
- package/dist/formatters/index.d.ts +4 -1
- package/dist/formatters/index.d.ts.map +1 -1
- package/dist/formatters/index.js +1 -1
- package/dist/formatters/index.js.map +1 -1
- package/dist/formatters/json.d.ts +11 -1
- package/dist/formatters/json.d.ts.map +1 -1
- package/dist/formatters/json.js +5 -1
- package/dist/formatters/json.js.map +1 -1
- package/dist/formatters/markdown.d.ts +7 -1
- package/dist/formatters/markdown.d.ts.map +1 -1
- package/dist/formatters/markdown.js +77 -70
- package/dist/formatters/markdown.js.map +1 -1
- package/dist/index.d.ts +13 -8
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -7
- package/dist/index.js.map +1 -1
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +0 -6
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/content/unique-value.d.ts.map +1 -1
- package/dist/rules/content/unique-value.js +1 -0
- package/dist/rules/content/unique-value.js.map +1 -1
- package/dist/rules/scope.d.ts.map +1 -1
- package/dist/rules/scope.js +6 -14
- package/dist/rules/scope.js.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.d.ts +9 -1
- package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.js +14 -5
- package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
- package/dist/safe-mode-preset.d.ts +27 -0
- package/dist/safe-mode-preset.d.ts.map +1 -0
- package/dist/safe-mode-preset.js +54 -0
- package/dist/safe-mode-preset.js.map +1 -0
- package/dist/site-classifier.d.ts +83 -0
- package/dist/site-classifier.d.ts.map +1 -0
- package/dist/site-classifier.js +205 -0
- package/dist/site-classifier.js.map +1 -0
- package/dist/ssrf-guard.d.ts +96 -0
- package/dist/ssrf-guard.d.ts.map +1 -0
- package/dist/ssrf-guard.js +268 -0
- package/dist/ssrf-guard.js.map +1 -0
- package/dist/types.d.ts +171 -19
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -1
- package/dist/types.js.map +1 -1
- package/package.json +2 -2
package/dist/auditor.js
CHANGED
|
@@ -4,7 +4,6 @@ import { extname, join, resolve } from "node:path";
|
|
|
4
4
|
import { parseHtmlPage } from "./parser.js";
|
|
5
5
|
import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
|
|
6
6
|
import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
|
|
7
|
-
import { headingUniquenessRule } from "./rules/content/heading-uniqueness.js";
|
|
8
7
|
import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
|
|
9
8
|
import { missingAuthorRule } from "./rules/content/missing-author.js";
|
|
10
9
|
import { uniqueValueRule } from "./rules/content/unique-value.js";
|
|
@@ -18,12 +17,10 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
|
|
|
18
17
|
import { deadEndsRule } from "./rules/links/dead-ends.js";
|
|
19
18
|
import { linkDepthRule } from "./rules/links/link-depth.js";
|
|
20
19
|
import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
|
|
21
|
-
import { hubPagesRule } from "./rules/links/hub-pages.js";
|
|
22
20
|
import { orphanPagesRule } from "./rules/links/orphan-pages.js";
|
|
23
21
|
import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
|
|
24
22
|
import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
|
|
25
23
|
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
26
|
-
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
27
24
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
28
25
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
29
26
|
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
|
|
@@ -33,7 +30,6 @@ import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
|
|
|
33
30
|
import { faqCoverageRule } from "./rules/aeo/faq-coverage.js";
|
|
34
31
|
import { answerFirstRule } from "./rules/aeo/answer-first.js";
|
|
35
32
|
import { citableFactsRule } from "./rules/aeo/citable-facts.js";
|
|
36
|
-
import { nonReplicableValueRule } from "./rules/aeo/non-replicable-value.js";
|
|
37
33
|
import { contentModularityRule } from "./rules/aeo/content-modularity.js";
|
|
38
34
|
import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
|
|
39
35
|
import { redirectChainRule } from "./rules/tech/redirect-chain.js";
|
|
@@ -41,8 +37,6 @@ import { soft404Rule } from "./rules/tech/soft-404.js";
|
|
|
41
37
|
import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
|
|
42
38
|
import { requiredFieldsRule } from "./rules/schema/required-fields.js";
|
|
43
39
|
import { schemaConsistencyRule } from "./rules/schema/consistency.js";
|
|
44
|
-
import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
|
|
45
|
-
import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
|
|
46
40
|
import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
|
|
47
41
|
import { templateCoverageRule } from "./rules/spam/template-coverage.js";
|
|
48
42
|
import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
|
|
@@ -54,8 +48,14 @@ import { triageFindings } from "./ai/triage.js";
|
|
|
54
48
|
import { createLanguageModel } from "./ai/adapters/index.js";
|
|
55
49
|
import { promptTriageFeedback } from "./ai/feedback-prompt.js";
|
|
56
50
|
import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
|
|
57
|
-
import {
|
|
51
|
+
import { SCHEMA_VERSION } from "./types.js";
|
|
52
|
+
import { cachedFetch, pruneCache } from "./cache.js";
|
|
53
|
+
import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
|
|
54
|
+
import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
|
|
55
|
+
import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
|
|
56
|
+
import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
|
|
58
57
|
import { stratifiedSample } from "./stratified-sample.js";
|
|
58
|
+
import { classifySite } from "./site-classifier.js";
|
|
59
59
|
import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
|
|
60
60
|
const DEFAULTS = {
|
|
61
61
|
nearDuplicateThreshold: 0.85,
|
|
@@ -67,10 +67,6 @@ const DEFAULTS = {
|
|
|
67
67
|
uniqueValueMinWords: 100,
|
|
68
68
|
metaUniquenessMinJaccard: 0.9,
|
|
69
69
|
linkDepthMaxClicks: 3,
|
|
70
|
-
hubPagesMinSiblings: 4,
|
|
71
|
-
hubPagesMaxSiblings: 50,
|
|
72
|
-
titleOverlapThreshold: 0.8,
|
|
73
|
-
keywordCollisionMinShared: 6,
|
|
74
70
|
templateCoverageMinPages: 5,
|
|
75
71
|
answerFirstMaxWords: 100,
|
|
76
72
|
citableFactsMin: 3,
|
|
@@ -80,18 +76,82 @@ const DEFAULTS = {
|
|
|
80
76
|
modularityMinSelfContainedRatio: 0.7,
|
|
81
77
|
faqMinQuestionHeadings: 2
|
|
82
78
|
};
|
|
79
|
+
/**
|
|
80
|
+
* v0.4 four-category weights. Audit is diagnostic-only (weight 0).
|
|
81
|
+
* See 2026-04-29 v0.4 redesign spec §4.2.
|
|
82
|
+
*/
|
|
83
83
|
const CATEGORY_WEIGHTS = {
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
data: 0.06,
|
|
90
|
-
schema: 0.05,
|
|
91
|
-
cannibal: 0.05,
|
|
92
|
-
/** Dedup / crawl hygiene; does not affect composite score. */
|
|
93
|
-
audit: 0
|
|
84
|
+
integrity: 0.50, // spam + content + cannibal
|
|
85
|
+
discoverability: 0.20, // links + tech
|
|
86
|
+
citation: 0.25, // aeo + schema
|
|
87
|
+
data: 0.05, // data
|
|
88
|
+
audit: 0, // diagnostics, never weighted
|
|
94
89
|
};
|
|
90
|
+
/**
|
|
91
|
+
* Maps the v0.3 ruleId namespace prefix to the v0.4 four-bucket category.
|
|
92
|
+
* Used by `scoreFromFindings` to bucket findings without changing rule IDs.
|
|
93
|
+
*/
|
|
94
|
+
const CATEGORY_MAP = {
|
|
95
|
+
spam: "integrity",
|
|
96
|
+
content: "integrity",
|
|
97
|
+
cannibal: "integrity",
|
|
98
|
+
links: "discoverability",
|
|
99
|
+
tech: "discoverability",
|
|
100
|
+
aeo: "citation",
|
|
101
|
+
schema: "citation",
|
|
102
|
+
data: "data",
|
|
103
|
+
audit: "audit",
|
|
104
|
+
};
|
|
105
|
+
/** Slug map for `RuleResult.docsUrl`. Defaults to the rule-id segment after the `/`. */
|
|
106
|
+
const RULE_DOCS_SLUG = {
|
|
107
|
+
// intentionally empty for v0.4 — slug = ruleId.split("/").pop() works for every shipped rule
|
|
108
|
+
};
|
|
109
|
+
/**
 * Build the public documentation URL for a rule.
 *
 * Resolution order for the slug:
 *   1. an explicit override registered in `RULE_DOCS_SLUG` (own keys only),
 *   2. the last `/`-separated segment of the rule id,
 *   3. the whole rule id, when that last segment is empty (e.g. `"tech/"`).
 *
 * @param {string} ruleId - Rule identifier such as `"tech/soft-404"`.
 * @returns {string} Absolute URL under `https://pseolint.dev/rules/`.
 */
function docsUrlFor(ruleId) {
    // Own-property check: a bare `RULE_DOCS_SLUG[ruleId]` would return inherited
    // members (`constructor`, `toString`, ...) for rule ids that collide with
    // Object.prototype names and interpolate a function into the URL.
    const override = Object.prototype.hasOwnProperty.call(RULE_DOCS_SLUG, ruleId)
        ? RULE_DOCS_SLUG[ruleId]
        : undefined;
    // `split().pop()` never yields null/undefined, so use `||` (not `??`) to let
    // an empty trailing segment fall back to the full rule id.
    const slug = override || ruleId.split("/").pop() || ruleId;
    return `https://pseolint.dev/rules/${slug}`;
}
|
|
113
|
+
/**
 * Verdict ladder thresholds — see spec §4.4.
 *
 * Walks the ladder from the lowest ceiling upward and returns the label of
 * the first rung whose ceiling is not exceeded; anything above the last rung
 * (or a non-comparable value such as NaN) is "critical".
 *
 * @param {number} risk - Composite risk score.
 * @returns {"ready"|"caution"|"concerning"|"critical"} Verdict label.
 */
function verdictForRisk(risk) {
    const ladder = [
        [20, "ready"],
        [40, "caution"],
        [60, "concerning"],
    ];
    for (const [ceiling, verdict] of ladder) {
        if (risk <= ceiling) {
            return verdict;
        }
    }
    return "critical";
}
|
|
123
|
+
/**
 * Convert a bucket penalty into a letter grade.
 *
 * Grades step down every 20 points: up to 20 is "A", up to 40 "B", up to 60
 * "C", up to 80 "D"; everything else (including NaN) is "F".
 *
 * @param {number} penalty - Raw penalty for one category bucket.
 * @returns {"A"|"B"|"C"|"D"|"F"} Letter grade.
 */
function gradeForPenalty(penalty) {
    const scale = [
        [20, "A"],
        [40, "B"],
        [60, "C"],
        [80, "D"],
    ];
    const rung = scale.find(([ceiling]) => penalty <= ceiling);
    return rung ? rung[1] : "F";
}
|
|
134
|
+
/**
 * True for `text/html` and `application/xhtml+xml` only (treat as
 * audit-eligible content). The comparison is case-insensitive and matches on
 * substring, so parameters such as `; charset=utf-8` do not matter.
 *
 * @param {string|undefined|null} contentType - Raw Content-Type value, if any.
 * @returns {boolean} Whether the content should be audited as HTML.
 */
function isHtmlContentType(contentType) {
    // Local files / unknown — assume HTML.
    if (!contentType) {
        return true;
    }
    const normalized = contentType.toLowerCase();
    const htmlMimeTypes = ["text/html", "application/xhtml+xml"];
    return htmlMimeTypes.some((mime) => normalized.includes(mime));
}
|
|
141
|
+
/**
 * Glob match against a URL pathname only (not the full URL). v0.4 spec §4.5.
 *
 * @param {string} pattern - Glob pattern; a leading "/" is optional.
 * @param {string} urlOrPath - Absolute URL, or a bare path when it does not
 *   parse as a URL.
 * @returns {boolean} Result of `matchGlob` for the slash-normalised pattern,
 *   falling back to the pattern exactly as written.
 */
function globMatchPathname(pattern, urlOrPath) {
    const toPathname = (value) => {
        try {
            return new URL(value).pathname;
        }
        catch {
            // Not a URL — treat as already-a-path and force a leading slash.
            return value.startsWith("/") ? value : `/${value}`;
        }
    };
    const pathname = toPathname(urlOrPath);
    // Patterns that start with "/" or "*" are used as-is; anything else gets a
    // leading "/" so it lines up with the always-rooted pathname.
    const isRootedOrWildcard = pattern.startsWith("/") || pattern.startsWith("*");
    const normPattern = isRootedOrWildcard ? pattern : `/${pattern}`;
    return matchGlob(normPattern, pathname) || matchGlob(pattern, pathname);
}
|
|
95
155
|
const DEFAULT_ENTITY_PATTERNS = [
|
|
96
156
|
{
|
|
97
157
|
placeholder: "[STATE]",
|
|
@@ -156,9 +216,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
156
216
|
if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
|
|
157
217
|
findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
|
|
158
218
|
}
|
|
159
|
-
if (isEnabled("content/heading-uniqueness") && modeOk("content/heading-uniqueness")) {
|
|
160
|
-
findings.push(...tag(headingUniquenessRule(pages, entityPatterns)));
|
|
161
|
-
}
|
|
162
219
|
if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
|
|
163
220
|
findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
|
|
164
221
|
}
|
|
@@ -183,9 +240,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
183
240
|
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
184
241
|
findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
|
|
185
242
|
}
|
|
186
|
-
if (isEnabled("links/hub-pages") && modeOk("links/hub-pages")) {
|
|
187
|
-
findings.push(...tag(hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings)));
|
|
188
|
-
}
|
|
189
243
|
// Tech rules
|
|
190
244
|
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
191
245
|
findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
@@ -202,9 +256,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
202
256
|
if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
|
|
203
257
|
findings.push(...tag(soft404Rule(pages)));
|
|
204
258
|
}
|
|
205
|
-
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
206
|
-
findings.push(...tag(ogCompletenessRule(pages)));
|
|
207
|
-
}
|
|
208
259
|
if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
|
|
209
260
|
findings.push(...tag(hreflangConsistencyRule(pages, normalizeUrlOptions)));
|
|
210
261
|
}
|
|
@@ -240,9 +291,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
240
291
|
targetFactsPerPage: resolvedRules.citableFactsTarget,
|
|
241
292
|
})));
|
|
242
293
|
}
|
|
243
|
-
if (isEnabled("aeo/non-replicable-value")) {
|
|
244
|
-
findings.push(...tag(nonReplicableValueRule(pages)));
|
|
245
|
-
}
|
|
246
294
|
if (isEnabled("aeo/content-modularity")) {
|
|
247
295
|
findings.push(...tag(contentModularityRule(pages, {
|
|
248
296
|
maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
|
|
@@ -252,13 +300,9 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
252
300
|
if (isEnabled("aeo/summary-bait")) {
|
|
253
301
|
findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
|
|
254
302
|
}
|
|
255
|
-
// Cannibal rules
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
}
|
|
259
|
-
if (isEnabled("cannibal/keyword-collision") && modeOk("cannibal/keyword-collision")) {
|
|
260
|
-
findings.push(...tag(keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared)));
|
|
261
|
-
}
|
|
303
|
+
// Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
|
|
304
|
+
// keyword-collision dropped due to high false-positive rates; see
|
|
305
|
+
// 2026-04-29 v0.4 redesign spec §4.3).
|
|
262
306
|
if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
|
|
263
307
|
findings.push(...tag(urlPatternRule(pages)));
|
|
264
308
|
}
|
|
@@ -267,54 +311,110 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
267
311
|
/**
 * Compute the SHA-256 digest of an HTML document.
 *
 * @param {string} html - Document text; hashed as UTF-8.
 * @returns {string} Lowercase hexadecimal digest.
 */
function hashHtml(html) {
    const hasher = createHash("sha256");
    hasher.update(html, "utf8");
    return hasher.digest("hex");
}
|
|
314
|
+
const SEVERITY_WEIGHTS = {
|
|
315
|
+
critical: 40,
|
|
316
|
+
error: 25,
|
|
317
|
+
warning: 12,
|
|
318
|
+
info: 5,
|
|
319
|
+
};
|
|
270
320
|
/**
 * Aggregate findings into the v0.4 score.
 *
 * Each finding is bucketed by the namespace prefix of its `ruleId` (via
 * `CATEGORY_MAP`); findings whose namespace is not mapped are ignored. Every
 * bucket accumulates the severity weight of its findings, capped at 100. The
 * composite `risk` is the weighted sum of the integrity, discoverability,
 * citation and data buckets, capped at 100 and rounded; the audit bucket
 * never contributes.
 *
 * @param {Array<{ruleId: string, severity: string}>} findings
 * @returns {{
 *   risk: number,
 *   categories: Record<string, {grade: string, issues: number}>,
 *   bucketCounts: {blockers: number, shouldFix: number, informational: number}
 * }} `categories.audit` is always reported as grade "A" with 0 issues;
 *   `bucketCounts` excludes audit findings.
 */
function scoreFromFindings(findings) {
    const emptyTally = () => ({
        integrity: 0,
        discoverability: 0,
        citation: 0,
        data: 0,
        audit: 0,
    });
    // v0.4 four-bucket raw penalties (plus the unweighted audit bucket).
    const bucketRaw = emptyTally();
    // Named `issueCounts` so it does not shadow the sibling `bucketIssues` function.
    const issueCounts = emptyTally();
    let blockers = 0;
    let shouldFix = 0;
    let informational = 0;
    for (const finding of findings) {
        const namespace = finding.ruleId.split("/")[0];
        const bucket = CATEGORY_MAP[namespace];
        // Type check (not truthiness) so namespaces that collide with inherited
        // Object.prototype keys ("constructor", "__proto__", ...) are skipped.
        if (typeof bucket !== "string")
            continue;
        // An unrecognised severity contributes no penalty. Without this guard the
        // lookup is undefined, the bucket turns into NaN and the whole risk score
        // (and the bucket's grade) is poisoned.
        const severityWeight = SEVERITY_WEIGHTS[finding.severity];
        const weight = typeof severityWeight === "number" ? severityWeight : 0;
        bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weight);
        // Issue counts: audit/* findings are diagnostic-only and excluded.
        if (bucket === "audit")
            continue;
        issueCounts[bucket] += 1;
        if (finding.severity === "critical" || finding.severity === "error")
            blockers += 1;
        else if (finding.severity === "warning")
            shouldFix += 1;
        else
            informational += 1;
    }
    const weighted = bucketRaw.integrity * CATEGORY_WEIGHTS.integrity +
        bucketRaw.discoverability * CATEGORY_WEIGHTS.discoverability +
        bucketRaw.citation * CATEGORY_WEIGHTS.citation +
        bucketRaw.data * CATEGORY_WEIGHTS.data;
    const risk = Math.round(Math.min(100, weighted));
    const categories = {
        integrity: { grade: gradeForPenalty(bucketRaw.integrity), issues: issueCounts.integrity },
        discoverability: { grade: gradeForPenalty(bucketRaw.discoverability), issues: issueCounts.discoverability },
        citation: { grade: gradeForPenalty(bucketRaw.citation), issues: issueCounts.citation },
        data: { grade: gradeForPenalty(bucketRaw.data), issues: issueCounts.data },
        audit: { grade: "A", issues: 0 },
    };
    return {
        risk,
        categories,
        bucketCounts: { blockers, shouldFix, informational },
    };
}
|
|
378
|
+
/**
 * Split findings into the three issue buckets by severity.
 *
 * - "critical" and "error" go to `blockers`
 * - "warning" goes to `shouldFix`
 * - every other severity goes to `informational`
 *
 * Findings whose rule id starts with `audit/` are diagnostics and are left
 * out of all three lists. Input order is preserved within each list.
 *
 * @param {Iterable<{ruleId: string, severity: string}>} findings
 * @returns {{blockers: Array, shouldFix: Array, informational: Array}}
 */
function bucketIssues(findings) {
    const blockers = [];
    const shouldFix = [];
    const informational = [];
    for (const finding of findings) {
        if (finding.ruleId.startsWith("audit/")) {
            continue;
        }
        switch (finding.severity) {
            case "critical":
            case "error":
                blockers.push(finding);
                break;
            case "warning":
                shouldFix.push(finding);
                break;
            default:
                informational.push(finding);
        }
    }
    return { blockers, shouldFix, informational };
}
|
|
395
|
+
/**
 * Build the one-line summary shown for an audit.
 *
 * Ship-blockers and should-fix counts are listed when non-zero. The
 * informational count is only appended while fewer than two segments have
 * been produced, so the headline never has more than two parts.
 *
 * @param {{blockers: number, shouldFix: number, informational: number}} counts
 * @returns {string} Comma-separated headline, or "No issues detected.".
 */
function buildHeadline(counts) {
    const segments = [];
    const blockerCount = counts.blockers;
    if (blockerCount > 0) {
        const plural = blockerCount === 1 ? "" : "s";
        segments.push(`${blockerCount} ship-blocker${plural}`);
    }
    if (counts.shouldFix > 0) {
        segments.push(`${counts.shouldFix} should-fix`);
    }
    const hasRoomForInfo = segments.length < 2;
    if (counts.informational > 0 && hasRoomForInfo) {
        segments.push(`${counts.informational} informational`);
    }
    return segments.length === 0 ? "No issues detected." : segments.join(", ");
}
|
|
410
|
+
/**
 * Populate `docsUrl` on every finding that doesn't already have one.
 *
 * Findings are updated in place; a finding with a truthy `docsUrl` is left
 * untouched.
 *
 * @param {Iterable<{ruleId: string, docsUrl?: string}>} findings
 * @returns The same `findings` object that was passed in.
 */
function withDocsUrls(findings) {
    for (const finding of findings) {
        if (finding.docsUrl) {
            continue;
        }
        finding.docsUrl = docsUrlFor(finding.ruleId);
    }
    return findings;
}
|
|
318
418
|
async function collectHtmlFiles(directory) {
|
|
319
419
|
const entries = await readdir(directory, { withFileTypes: true });
|
|
320
420
|
const files = await Promise.all(entries.map(async (entry) => {
|
|
@@ -330,10 +430,30 @@ async function collectHtmlFiles(directory) {
|
|
|
330
430
|
}));
|
|
331
431
|
return files.flat();
|
|
332
432
|
}
|
|
333
|
-
|
|
433
|
+
/**
 * Combine any number of AbortSignals into one. The returned signal aborts as
 * soon as any input aborts, carrying that input's `reason`.
 *
 * Hand-rolled instead of relying on `AbortSignal.any`, so it also works on
 * runtimes that predate that API.
 *
 * Listener hygiene: exactly one listener is attached per input, none are
 * attached when an input is already aborted, and all of them are detached as
 * soon as the composed signal fires — long-lived input signals do not
 * accumulate stale listeners across calls.
 *
 * @param {...(AbortSignal|undefined|null)} signals - Inputs; falsy entries
 *   are ignored.
 * @returns {AbortSignal} A fresh signal. With no usable inputs it is a signal
 *   that this helper never aborts.
 */
function composeSignals(...signals) {
    const inputs = signals.filter((s) => Boolean(s));
    const controller = new AbortController();
    if (inputs.length === 0)
        return controller.signal;
    // Check for an already-aborted input BEFORE subscribing to anything, so the
    // early return leaves no listeners behind on the inputs that come before it.
    const alreadyAborted = inputs.find((s) => s.aborted);
    if (alreadyAborted) {
        controller.abort(alreadyAborted.reason);
        return controller.signal;
    }
    const subscriptions = [];
    const detachAll = () => {
        for (const [input, listener] of subscriptions) {
            input.removeEventListener("abort", listener);
        }
        subscriptions.length = 0;
    };
    for (const input of inputs) {
        const onAbort = () => {
            // First abort wins; drop the listeners still parked on the other inputs.
            detachAll();
            controller.abort(input.reason);
        };
        subscriptions.push([input, onAbort]);
        input.addEventListener("abort", onAbort, { once: true });
    }
    return controller.signal;
}
|
|
453
|
+
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
|
|
334
454
|
try {
|
|
335
455
|
stats.total += 1;
|
|
336
|
-
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
456
|
+
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
|
|
337
457
|
if (r.fromCache) {
|
|
338
458
|
stats.hits += 1;
|
|
339
459
|
stats.bytesSavedEstimate += r.body.length;
|
|
@@ -342,14 +462,16 @@ async function fetchWithRetry(url, timeoutMs, cache, stats) {
|
|
|
342
462
|
return null;
|
|
343
463
|
return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
|
|
344
464
|
}
|
|
345
|
-
catch {
|
|
465
|
+
catch (err) {
|
|
466
|
+
if (signal?.aborted)
|
|
467
|
+
throw err; // propagate cancellation
|
|
346
468
|
return null;
|
|
347
469
|
}
|
|
348
470
|
}
|
|
349
|
-
async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
|
|
471
|
+
async function fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects = true) {
|
|
350
472
|
try {
|
|
351
473
|
stats.total += 1;
|
|
352
|
-
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
474
|
+
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, followRedirects, onObservation: stats.onObservation });
|
|
353
475
|
if (r.fromCache) {
|
|
354
476
|
stats.hits += 1;
|
|
355
477
|
stats.bytesSavedEstimate += r.body.length;
|
|
@@ -366,13 +488,15 @@ async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
|
|
|
366
488
|
},
|
|
367
489
|
};
|
|
368
490
|
}
|
|
369
|
-
catch {
|
|
491
|
+
catch (err) {
|
|
492
|
+
if (signal?.aborted)
|
|
493
|
+
throw err;
|
|
370
494
|
return null;
|
|
371
495
|
}
|
|
372
496
|
}
|
|
373
|
-
async function fetchTextStrict(url, timeoutMs, cache, stats) {
|
|
497
|
+
async function fetchTextStrict(url, timeoutMs, cache, stats, signal, validateHop) {
|
|
374
498
|
stats.total += 1;
|
|
375
|
-
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
499
|
+
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
|
|
376
500
|
if (r.fromCache) {
|
|
377
501
|
stats.hits += 1;
|
|
378
502
|
stats.bytesSavedEstimate += r.body.length;
|
|
@@ -455,8 +579,13 @@ function matchGlob(pattern, value) {
|
|
|
455
579
|
function shouldIgnore(url, patterns) {
|
|
456
580
|
if (patterns.length === 0)
|
|
457
581
|
return false;
|
|
582
|
+
// v0.4 §4.5: globs match against the URL pathname only, NOT the full URL.
|
|
583
|
+
// Operator intuition: `ignore: ["dashboard/**"]` should match
|
|
584
|
+
// `https://example.com/dashboard/...` even though the full URL contains the
|
|
585
|
+
// host. Previously globs matched the full URL and silently failed for users
|
|
586
|
+
// who didn't think to write `**/dashboard/**`.
|
|
458
587
|
for (const pattern of patterns) {
|
|
459
|
-
if (
|
|
588
|
+
if (globMatchPathname(pattern, url))
|
|
460
589
|
return true;
|
|
461
590
|
}
|
|
462
591
|
return false;
|
|
@@ -469,7 +598,7 @@ function fisherYatesSample(items, n) {
|
|
|
469
598
|
}
|
|
470
599
|
return arr.slice(arr.length - n);
|
|
471
600
|
}
|
|
472
|
-
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats) {
|
|
601
|
+
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
|
|
473
602
|
visited.add(sitemapUrl);
|
|
474
603
|
const locs = parseSitemapUrls(sitemapText);
|
|
475
604
|
if (!isSitemapIndex(sitemapText)) {
|
|
@@ -477,27 +606,32 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
477
606
|
}
|
|
478
607
|
const allUrls = [];
|
|
479
608
|
for (const childUrl of locs) {
|
|
609
|
+
if (signal?.aborted)
|
|
610
|
+
throw signal.reason ?? new Error("aborted");
|
|
480
611
|
if (visited.has(childUrl))
|
|
481
612
|
continue;
|
|
482
|
-
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats);
|
|
613
|
+
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
483
614
|
if (!child)
|
|
484
615
|
continue;
|
|
485
616
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
486
617
|
if (!childLike)
|
|
487
618
|
continue;
|
|
488
|
-
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats);
|
|
619
|
+
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
489
620
|
allUrls.push(...childUrls);
|
|
490
621
|
}
|
|
491
622
|
return allUrls;
|
|
492
623
|
}
|
|
493
|
-
async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
|
|
624
|
+
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
494
625
|
if (!origin)
|
|
495
626
|
return { disallow: [], crawlDelaySec: 0 };
|
|
496
627
|
try {
|
|
497
628
|
const robotsUrl = `${origin}/robots.txt`;
|
|
498
|
-
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
|
|
629
|
+
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
630
|
+
// Honor both the wildcard block AND any block specifically targeting us.
|
|
631
|
+
// A malicious target can't bypass our crawler by adding a targeted
|
|
632
|
+
// `User-agent: pseolint / Disallow: /` without a wildcard.
|
|
499
633
|
return {
|
|
500
|
-
disallow: parseDisallowPatterns(fetched.text),
|
|
634
|
+
disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
|
|
501
635
|
crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
|
|
502
636
|
};
|
|
503
637
|
}
|
|
@@ -518,13 +652,42 @@ function isDisallowedByRobots(urlPath, patterns) {
|
|
|
518
652
|
/**
 * Report whether a usage budget has been used up.
 *
 * A budget only applies when its `cap` is greater than zero; otherwise this
 * always returns false.
 *
 * @param {{used: number, cap: number}} b - Budget tracker.
 * @returns {boolean} True when a positive cap exists and `used` has reached it.
 */
function budgetExceeded(b) {
    const { cap, used } = b;
    const hasCap = cap > 0;
    if (!hasCap) {
        return false;
    }
    return used >= cap;
}
|
|
521
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
|
|
655
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
|
|
656
|
+
// Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
|
|
657
|
+
// audit (source, sitemap entries, redirects, discovered links) goes through
|
|
658
|
+
// this. DNS is hit once per unique hostname per audit — a 4k-page audit on
|
|
659
|
+
// one origin does 1 DNS lookup, not 4k.
|
|
660
|
+
const ssrfCache = new Map();
|
|
661
|
+
const validateHop = guardSsrf
|
|
662
|
+
? async (u) => {
|
|
663
|
+
let host;
|
|
664
|
+
try {
|
|
665
|
+
host = new URL(u).hostname;
|
|
666
|
+
}
|
|
667
|
+
catch {
|
|
668
|
+
throw new Error(`Refusing to fetch invalid URL: ${u}`);
|
|
669
|
+
}
|
|
670
|
+
let pending = ssrfCache.get(host);
|
|
671
|
+
if (!pending) {
|
|
672
|
+
pending = validateTargetHost(host).catch((err) => {
|
|
673
|
+
if (err instanceof SSRFError) {
|
|
674
|
+
throw new Error(`Refusing to fetch ${u}: ${err.reason}`);
|
|
675
|
+
}
|
|
676
|
+
throw err;
|
|
677
|
+
});
|
|
678
|
+
ssrfCache.set(host, pending);
|
|
679
|
+
}
|
|
680
|
+
await pending;
|
|
681
|
+
}
|
|
682
|
+
: undefined;
|
|
522
683
|
if (/^https?:\/\//i.test(source)) {
|
|
684
|
+
if (validateHop)
|
|
685
|
+
await validateHop(source);
|
|
523
686
|
let text;
|
|
524
687
|
let contentType;
|
|
525
688
|
let sourceStatus = 200;
|
|
526
689
|
try {
|
|
527
|
-
const fetched = await fetchTextStrict(source, timeoutMs, cache, stats);
|
|
690
|
+
const fetched = await fetchTextStrict(source, timeoutMs, cache, stats, signal, validateHop);
|
|
528
691
|
text = fetched.text;
|
|
529
692
|
contentType = fetched.contentType;
|
|
530
693
|
}
|
|
@@ -533,7 +696,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
533
696
|
if (source.includes("sitemap")) {
|
|
534
697
|
try {
|
|
535
698
|
const origin = new URL(source).origin;
|
|
536
|
-
const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats);
|
|
699
|
+
const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats, signal, validateHop);
|
|
537
700
|
text = fallback.text;
|
|
538
701
|
contentType = fallback.contentType;
|
|
539
702
|
sourceStatus = -1; // flag that we fell back
|
|
@@ -549,7 +712,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
549
712
|
const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
|
|
550
713
|
if (isXml) {
|
|
551
714
|
const visited = new Set();
|
|
552
|
-
const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats);
|
|
715
|
+
const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
553
716
|
// If we have a budget, sample from sitemap URLs before fetching
|
|
554
717
|
const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
|
|
555
718
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
@@ -562,13 +725,29 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
562
725
|
catch {
|
|
563
726
|
return "";
|
|
564
727
|
} })();
|
|
565
|
-
const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
|
|
728
|
+
const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
|
|
566
729
|
const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
|
|
567
730
|
const delayMs = robots.crawlDelaySec * 1000;
|
|
568
731
|
await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
|
|
569
732
|
if (budgetExceeded(byteBudget))
|
|
570
733
|
return;
|
|
571
|
-
|
|
734
|
+
// Honor robots.txt for our own crawl when respectRobotsTxt is on (default).
|
|
735
|
+
// The existing robotsComplianceRule flags sitemap-vs-robots conflicts as
|
|
736
|
+
// findings; this actually refuses to fetch the disallowed URL. Keeps us
|
|
737
|
+
// legally defensible (we are a bot, our UA `pseolint` is public, and we
|
|
738
|
+
// respect Disallow directives) and removes the "crawler-for-hire" abuse
|
|
739
|
+
// vector when the library is invoked from a hosted service.
|
|
740
|
+
if (respectRobotsTxt) {
|
|
741
|
+
try {
|
|
742
|
+
const p = new URL(url).pathname;
|
|
743
|
+
if (isDisallowedByRobots(p, robots.disallow)) {
|
|
744
|
+
skippedByRobots.push(url);
|
|
745
|
+
return;
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
catch { /* URL parse failed — fall through, fetch will fail naturally */ }
|
|
749
|
+
}
|
|
750
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
572
751
|
if (result) {
|
|
573
752
|
byteBudget.used += result.html.length;
|
|
574
753
|
pages.push(result);
|
|
@@ -587,9 +766,16 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
587
766
|
const discoveredUrls = new Set();
|
|
588
767
|
// robots already fetched above; reuse its Disallow patterns here.
|
|
589
768
|
const disallowPatterns = robots.disallow;
|
|
590
|
-
|
|
769
|
+
let discoveryCeilingReached = false;
|
|
770
|
+
outer: for (const page of pages) {
|
|
591
771
|
const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
|
|
592
772
|
for (const match of linkMatches) {
|
|
773
|
+
if (discoveredUrls.size >= maxCrawlDiscovered) {
|
|
774
|
+
// Hard ceiling — don't let a malicious site with many self-links
|
|
775
|
+
// extend crawl discovery up to the byte budget.
|
|
776
|
+
discoveryCeilingReached = true;
|
|
777
|
+
break outer;
|
|
778
|
+
}
|
|
593
779
|
const href = match[1];
|
|
594
780
|
if (!href || href.startsWith("#") || /^mailto:|^tel:|^javascript:|^data:/i.test(href))
|
|
595
781
|
continue;
|
|
@@ -614,6 +800,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
614
800
|
}
|
|
615
801
|
}
|
|
616
802
|
}
|
|
803
|
+
if (discoveryCeilingReached) {
|
|
804
|
+
// eslint-disable-next-line no-console
|
|
805
|
+
console.error(`pseolint: crawl discovery hit maxCrawlDiscovered=${maxCrawlDiscovered} ceiling; sampling from the first ${discoveredUrls.size} URLs.`);
|
|
806
|
+
}
|
|
617
807
|
if (discoveredUrls.size > 0) {
|
|
618
808
|
const candidates = Array.from(discoveredUrls);
|
|
619
809
|
// Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
|
|
@@ -623,7 +813,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
623
813
|
await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
|
|
624
814
|
if (budgetExceeded(byteBudget))
|
|
625
815
|
return;
|
|
626
|
-
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
816
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
627
817
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
628
818
|
byteBudget.used += result.html.length;
|
|
629
819
|
pages.push(result);
|
|
@@ -700,7 +890,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
700
890
|
}
|
|
701
891
|
const newPages = [];
|
|
702
892
|
await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
|
|
703
|
-
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
893
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
704
894
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
705
895
|
newPages.push(result);
|
|
706
896
|
knownCrawled.add(url);
|
|
@@ -744,10 +934,63 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
744
934
|
export async function auditSource(source, options) {
|
|
745
935
|
const runId = generateRunId();
|
|
746
936
|
const runStartedAt = Date.now();
|
|
747
|
-
|
|
937
|
+
// Apply safeMode preset first, then let explicit options override it. Using
|
|
938
|
+
// `??` preserves the "not set" vs "explicitly false" distinction — a user
|
|
939
|
+
// who picks safeMode="saas" but passes `guardSsrf: false` gets the explicit
|
|
940
|
+
// override. Localhost sources auto-promote to the `dev` preset unless the
|
|
941
|
+
// caller explicitly set `safeMode` or passed `autoDevPreset: false`.
|
|
942
|
+
const presetKey = resolveSafeModeKey(source, options);
|
|
943
|
+
const preset = SAFE_MODE_PRESETS[presetKey];
|
|
944
|
+
const concurrency = options?.concurrency ?? preset.concurrency ?? 5;
|
|
748
945
|
const timeoutMs = options?.timeout ?? 30000;
|
|
749
946
|
const ignorePatterns = options?.ignore ?? [];
|
|
750
|
-
const sampleSize = options?.sampleSize ?? 0;
|
|
947
|
+
const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
|
|
948
|
+
const externalSignal = options?.signal;
|
|
949
|
+
const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
|
|
950
|
+
const respectRobotsTxt = options?.respectRobotsTxt ?? preset.respectRobotsTxt ?? true;
|
|
951
|
+
const followRedirects = options?.followRedirects ?? preset.followRedirects ?? true;
|
|
952
|
+
const maxCrawlDiscovered = options?.maxCrawlDiscovered ?? preset.maxCrawlDiscovered ?? 5000;
|
|
953
|
+
const skippedByRobots = [];
|
|
954
|
+
// Backpressure: watch TTFB + 5xx rate during the crawl and abort if the
|
|
955
|
+
// origin looks degraded. The audit signal is a composite of the caller's
|
|
956
|
+
// signal (ctrl-C, parent timeout) and the monitor's abort controller.
|
|
957
|
+
const backpressureEnabled = options?.backpressure !== false;
|
|
958
|
+
const backpressureAbort = new AbortController();
|
|
959
|
+
let backpressureError = null;
|
|
960
|
+
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
961
|
+
const observer = new FetchObserver();
|
|
962
|
+
const monitor = backpressureEnabled
|
|
963
|
+
? new BackpressureMonitor({
|
|
964
|
+
warmupSize: 10,
|
|
965
|
+
absoluteP95Ms: 3000,
|
|
966
|
+
baselineMultiplier: 2,
|
|
967
|
+
errorRatioThreshold: 0.1,
|
|
968
|
+
})
|
|
969
|
+
: null;
|
|
970
|
+
// v0.4: framework gets set on the first observation that carries headers
|
|
971
|
+
// (the source URL fetch). Backpressure thresholds and computeReadiness use
|
|
972
|
+
// it to soften limits when auditing a dev server.
|
|
973
|
+
let detectedFramework = null;
|
|
974
|
+
const onObservation = (obs) => {
|
|
975
|
+
if (detectedFramework === null && obs.headers) {
|
|
976
|
+
detectedFramework = detectDevServer(obs.headers);
|
|
977
|
+
}
|
|
978
|
+
observer.record(obs);
|
|
979
|
+
if (!monitor)
|
|
980
|
+
return;
|
|
981
|
+
const decision = monitor.record(obs);
|
|
982
|
+
if (decision.shouldAbort && !backpressureError && decision.snapshot) {
|
|
983
|
+
backpressureError = new OriginDegradedError(decision.reason ?? "", decision.snapshot);
|
|
984
|
+
backpressureAbort.abort(backpressureError);
|
|
985
|
+
}
|
|
986
|
+
};
|
|
987
|
+
function throwIfAborted() {
|
|
988
|
+
if (backpressureError)
|
|
989
|
+
throw backpressureError;
|
|
990
|
+
if (externalSignal?.aborted) {
|
|
991
|
+
throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
|
|
992
|
+
}
|
|
993
|
+
}
|
|
751
994
|
const resolvedRules = {
|
|
752
995
|
nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
|
|
753
996
|
entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
|
|
@@ -758,10 +1001,6 @@ export async function auditSource(source, options) {
|
|
|
758
1001
|
uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
|
|
759
1002
|
metaUniquenessMinJaccard: options?.rules?.metaUniquenessMinJaccard ?? DEFAULTS.metaUniquenessMinJaccard,
|
|
760
1003
|
linkDepthMaxClicks: options?.rules?.linkDepthMaxClicks ?? DEFAULTS.linkDepthMaxClicks,
|
|
761
|
-
hubPagesMinSiblings: options?.rules?.hubPagesMinSiblings ?? DEFAULTS.hubPagesMinSiblings,
|
|
762
|
-
hubPagesMaxSiblings: options?.rules?.hubPagesMaxSiblings ?? DEFAULTS.hubPagesMaxSiblings,
|
|
763
|
-
titleOverlapThreshold: options?.rules?.titleOverlapThreshold ?? DEFAULTS.titleOverlapThreshold,
|
|
764
|
-
keywordCollisionMinShared: options?.rules?.keywordCollisionMinShared ?? DEFAULTS.keywordCollisionMinShared,
|
|
765
1004
|
templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
|
|
766
1005
|
answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
|
|
767
1006
|
citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
|
|
@@ -783,18 +1022,47 @@ export async function auditSource(source, options) {
|
|
|
783
1022
|
const discoveryBudget = options?.sampleSize && options.sampleSize > 0
|
|
784
1023
|
? Math.max(50, options.sampleSize * 2)
|
|
785
1024
|
: 0;
|
|
786
|
-
const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0 };
|
|
1025
|
+
const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0, onObservation };
|
|
787
1026
|
const cacheConfig = options?.cache
|
|
788
1027
|
? {
|
|
789
1028
|
dir: options.cache.dir ?? ".pseolint/cache",
|
|
790
1029
|
ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
|
|
791
1030
|
}
|
|
792
1031
|
: null;
|
|
1032
|
+
// Size cap (post-audit eviction). Default 200 MB keeps pSEO-scale sites in check;
|
|
1033
|
+
// a single full crawl of a 5k-page site averages ~250 KB per body = ~1.25 GB uncapped.
|
|
1034
|
+
const cacheMaxBytes = options?.cache?.maxBytes ?? 209_715_200;
|
|
793
1035
|
const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
|
|
794
|
-
const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
|
|
1036
|
+
const maxFetchBytes = options?.maxFetchBytes ?? preset.maxFetchBytes ?? 52_428_800;
|
|
795
1037
|
const fetchByteBudget = { used: 0, cap: maxFetchBytes };
|
|
796
|
-
|
|
1038
|
+
// v0.4 §4.7: detectedFramework is set in onObservation above, side-effect
|
|
1039
|
+
// of the normal source URL fetch. No separate probe needed.
|
|
1040
|
+
const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered);
|
|
1041
|
+
throwIfAborted();
|
|
797
1042
|
const loadedPages = [...loadedPagesRaw];
|
|
1043
|
+
// v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
|
|
1044
|
+
// response Content-Type is not HTML (text/html or application/xhtml+xml).
|
|
1045
|
+
// Binary routes like /apple-icon, /opengraph-image, /icon get pushed to
|
|
1046
|
+
// crawlStats.skipped instead of being parsed as thin-content pages.
|
|
1047
|
+
const skippedByContentType = [];
|
|
1048
|
+
const htmlOnlyPages = [];
|
|
1049
|
+
for (const p of loadedPages) {
|
|
1050
|
+
// httpMeta is set on URL fetches; locally-loaded files have no httpMeta
|
|
1051
|
+
// and are always HTML by definition (collectHtmlFiles only picks .html).
|
|
1052
|
+
// We don't have content-type on the LoadedPage object. Heuristic: if html
|
|
1053
|
+
// body doesn't contain any HTML markers, treat as non-HTML.
|
|
1054
|
+
if (!p.httpMeta) {
|
|
1055
|
+
htmlOnlyPages.push(p);
|
|
1056
|
+
continue;
|
|
1057
|
+
}
|
|
1058
|
+
if (looksLikeHtml(p.html)) {
|
|
1059
|
+
htmlOnlyPages.push(p);
|
|
1060
|
+
}
|
|
1061
|
+
else {
|
|
1062
|
+
skippedByContentType.push(p.url);
|
|
1063
|
+
}
|
|
1064
|
+
}
|
|
1065
|
+
loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
|
|
798
1066
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
799
1067
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
800
1068
|
}
|
|
@@ -830,7 +1098,7 @@ export async function auditSource(source, options) {
|
|
|
830
1098
|
if (/^https?:\/\//i.test(source)) {
|
|
831
1099
|
try {
|
|
832
1100
|
const origin = new URL(source).origin;
|
|
833
|
-
const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats);
|
|
1101
|
+
const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats, signal);
|
|
834
1102
|
if (result)
|
|
835
1103
|
robotsTxtContent = result.text;
|
|
836
1104
|
}
|
|
@@ -918,11 +1186,50 @@ export async function auditSource(source, options) {
|
|
|
918
1186
|
}),
|
|
919
1187
|
]
|
|
920
1188
|
: DEFAULT_ENTITY_PATTERNS;
|
|
1189
|
+
// v0.4 §4.11 — pre-flight site classification. We compute this BEFORE the
|
|
1190
|
+
// rule pipeline so the dispatcher can skip pSEO-only rules on small
|
|
1191
|
+
// marketing sites / blogs. Classification is computed off the FULL
|
|
1192
|
+
// discovered URL set (sitemap when available, else loaded URLs). This
|
|
1193
|
+
// matters: a sampled crawl of a 5000-page directory must still classify
|
|
1194
|
+
// as `programmatic-directory`, not `unclear`.
|
|
1195
|
+
const classifierUrls = (() => {
|
|
1196
|
+
if (sitemapUrlSet && sitemapUrlSet.size > 0) {
|
|
1197
|
+
return Array.from(sitemapUrlSet);
|
|
1198
|
+
}
|
|
1199
|
+
return loadedPagesRaw.map((p) => p.url);
|
|
1200
|
+
})();
|
|
1201
|
+
const classifierFramework = detectedFramework ?? "unknown";
|
|
1202
|
+
const computedClassification = classifySite({
|
|
1203
|
+
urls: classifierUrls,
|
|
1204
|
+
framework: classifierFramework,
|
|
1205
|
+
});
|
|
1206
|
+
// `--strict` (or AuditOptions.strict) keeps the classification but forces
|
|
1207
|
+
// every rule to run regardless of detected site type.
|
|
1208
|
+
const siteClassification = options?.strict
|
|
1209
|
+
? { ...computedClassification, suppressedRules: [] }
|
|
1210
|
+
: computedClassification;
|
|
1211
|
+
const suppressedRuleSet = new Set(siteClassification.suppressedRules);
|
|
921
1212
|
// Classify pages into groups and run only enabled rules per group
|
|
922
1213
|
const classified = classifyPages(parsedPages, options?.pageGroups);
|
|
923
1214
|
const allFindings = [...duplicateUrlFindings];
|
|
924
1215
|
const groupScores = {};
|
|
925
1216
|
const groupPageCounts = {};
|
|
1217
|
+
// Surface robots-skipped URLs so users don't silently get a smaller audit
|
|
1218
|
+
// than expected. One rollup finding (not per-URL) to avoid flooding the
|
|
1219
|
+
// output on large sites. Also included on summary.skippedUrls below.
|
|
1220
|
+
if (skippedByRobots.length > 0) {
|
|
1221
|
+
allFindings.push({
|
|
1222
|
+
ruleId: "audit/skipped-by-robots",
|
|
1223
|
+
severity: "info",
|
|
1224
|
+
message: `Skipped ${skippedByRobots.length} sitemap URL${skippedByRobots.length === 1 ? "" : "s"} because the target's robots.txt Disallow'd them: ${skippedByRobots.slice(0, 5).join(", ")}${skippedByRobots.length > 5 ? ", …" : ""}.`,
|
|
1225
|
+
fix: "If you own this site and want to audit these URLs anyway, pass `respectRobotsTxt: false` (or remove the Disallow directive).",
|
|
1226
|
+
relatedUrls: skippedByRobots,
|
|
1227
|
+
});
|
|
1228
|
+
}
|
|
1229
|
+
// v0.4 §4.4: origin readiness is now diagnostic-only. The previous
|
|
1230
|
+
// `audit/origin-readiness` finding emission was retired — the structured
|
|
1231
|
+
// ReadinessReport in `summary.diagnostics.originReadiness` is the canonical
|
|
1232
|
+
// signal now (no double-counting in the issue buckets).
|
|
926
1233
|
const auditMode = options?.mode ?? "full";
|
|
927
1234
|
// Site-wide rules (run once, outside group loop)
|
|
928
1235
|
if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
|
|
@@ -959,39 +1266,83 @@ export async function auditSource(source, options) {
|
|
|
959
1266
|
if (groupConfig?.rules !== undefined && groupConfig.rules.length === 0)
|
|
960
1267
|
continue;
|
|
961
1268
|
const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
|
|
962
|
-
const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
|
|
1269
|
+
const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
|
|
963
1270
|
const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
|
|
964
1271
|
allFindings.push(...findings);
|
|
965
1272
|
groupPageCounts[groupName] = groupPages.length;
|
|
966
|
-
const {
|
|
967
|
-
groupScores[groupName] =
|
|
1273
|
+
const { risk: groupRisk } = scoreFromFindings(findings);
|
|
1274
|
+
groupScores[groupName] = groupRisk;
|
|
968
1275
|
}
|
|
1276
|
+
throwIfAborted();
|
|
969
1277
|
// Enrich findings: cluster pairwise, detect templates, assign effort
|
|
970
1278
|
const enriched = enrichFindings(allFindings, parsedPages, {
|
|
971
1279
|
templateGenerated: options?.templateGenerated,
|
|
972
1280
|
});
|
|
973
|
-
|
|
1281
|
+
// Populate docsUrl on every finding before they leave the engine.
|
|
1282
|
+
withDocsUrls(enriched.findings);
|
|
1283
|
+
const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings);
|
|
974
1284
|
const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
|
|
1285
|
+
const issues = bucketIssues(enriched.findings);
|
|
1286
|
+
const verdict = verdictForRisk(risk);
|
|
1287
|
+
const headline = buildHeadline(bucketCounts);
|
|
1288
|
+
// audit/* findings are diagnostic-only and never appear in summary.issues.
|
|
1289
|
+
// Surface them under diagnostics so consumers (telemetry, debug UIs) can
|
|
1290
|
+
// still see what was deduped or skipped.
|
|
1291
|
+
const auditFindings = enriched.findings.filter((f) => f.ruleId.startsWith("audit/"));
|
|
1292
|
+
const readinessReport = computeReadiness(observer.getAll(), { detectedFramework });
|
|
1293
|
+
const crawlStats = {
|
|
1294
|
+
discovered: discoveredUrlCount ?? loadedPagesRaw.length,
|
|
1295
|
+
fetched: parsedPages.length,
|
|
1296
|
+
skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
|
|
1297
|
+
};
|
|
975
1298
|
const summary = {
|
|
976
|
-
|
|
977
|
-
|
|
1299
|
+
schemaVersion: SCHEMA_VERSION,
|
|
1300
|
+
verdict,
|
|
1301
|
+
risk,
|
|
1302
|
+
headline,
|
|
1303
|
+
categories,
|
|
1304
|
+
issues,
|
|
1305
|
+
siteClassification,
|
|
1306
|
+
diagnostics: {
|
|
1307
|
+
originReadiness: readinessReport,
|
|
1308
|
+
crawlStats,
|
|
1309
|
+
auditFindings,
|
|
1310
|
+
},
|
|
978
1311
|
groupScores: options?.pageGroups ? groupScores : undefined,
|
|
979
1312
|
groupPageCounts: options?.pageGroups ? groupPageCounts : undefined,
|
|
980
1313
|
pageCount: auditedPageCount || parsedPages.length,
|
|
981
|
-
findings: enriched.findings,
|
|
982
1314
|
templateDetected: enriched.templateDetected,
|
|
983
1315
|
rawFindingCount: enriched.rawFindingCount,
|
|
984
1316
|
};
|
|
985
1317
|
if (cacheConfig) {
|
|
986
1318
|
summary.cacheStats = cacheStats;
|
|
987
1319
|
}
|
|
988
|
-
|
|
989
|
-
|
|
1320
|
+
// v0.4 §4.5: warn when an `ignore` pattern matched zero discovered URLs.
|
|
1321
|
+
if (ignorePatterns.length > 0) {
|
|
1322
|
+
for (const pattern of ignorePatterns) {
|
|
1323
|
+
const matched = deduped.some((p) => globMatchPathname(pattern, p.url));
|
|
1324
|
+
if (!matched) {
|
|
1325
|
+
// eslint-disable-next-line no-console
|
|
1326
|
+
console.warn(`[pseolint] ignore pattern '${pattern}' matched 0 URLs — likely typo`);
|
|
1327
|
+
}
|
|
1328
|
+
}
|
|
990
1329
|
}
|
|
1330
|
+
// Merge state-skipped (unchanged since last run) and robots-skipped (target
|
|
1331
|
+
// robots.txt Disallow'd) URLs so callers have a single audit-skipped surface.
|
|
1332
|
+
const allSkipped = [...skippedUrls, ...skippedByRobots];
|
|
1333
|
+
if (allSkipped.length > 0) {
|
|
1334
|
+
summary.skippedUrls = allSkipped;
|
|
1335
|
+
}
|
|
1336
|
+
// Local flat view of every finding the engine produced, used internally for
|
|
1337
|
+
// state persistence, regression detection, AI triage input, and telemetry
|
|
1338
|
+
// counts. NOT exposed on the AuditSummary — consumers must use
|
|
1339
|
+
// `summary.issues.{blockers,shouldFix,informational}` and
|
|
1340
|
+
// `summary.diagnostics.auditFindings`.
|
|
1341
|
+
const enrichedFindings = enriched.findings;
|
|
991
1342
|
if (priorState && options?.state?.exitOnRegression) {
|
|
992
1343
|
let hasRegression = false;
|
|
993
1344
|
const currentFindings = new Map();
|
|
994
|
-
for (const f of
|
|
1345
|
+
for (const f of enrichedFindings) {
|
|
995
1346
|
if (!f.pageUrl)
|
|
996
1347
|
continue;
|
|
997
1348
|
const set = currentFindings.get(f.pageUrl) ?? new Set();
|
|
@@ -1019,7 +1370,7 @@ export async function auditSource(source, options) {
|
|
|
1019
1370
|
const renderMode = options.render ? "rendered" : "static";
|
|
1020
1371
|
const urls = {};
|
|
1021
1372
|
const findingsByUrl = new Map();
|
|
1022
|
-
for (const f of
|
|
1373
|
+
for (const f of enrichedFindings) {
|
|
1023
1374
|
if (!f.pageUrl)
|
|
1024
1375
|
continue;
|
|
1025
1376
|
const list = findingsByUrl.get(f.pageUrl) ?? [];
|
|
@@ -1051,9 +1402,10 @@ export async function auditSource(source, options) {
|
|
|
1051
1402
|
renderMode,
|
|
1052
1403
|
urls,
|
|
1053
1404
|
summary: {
|
|
1054
|
-
score: summary.
|
|
1055
|
-
totalFindings:
|
|
1056
|
-
byCategory: Object.fromEntries(Object.entries(summary.
|
|
1405
|
+
score: summary.risk,
|
|
1406
|
+
totalFindings: enrichedFindings.length,
|
|
1407
|
+
byCategory: Object.fromEntries(Object.entries(summary.categories)
|
|
1408
|
+
.map(([k, v]) => [k, v.issues])),
|
|
1057
1409
|
},
|
|
1058
1410
|
};
|
|
1059
1411
|
await writeState(statePath, newState);
|
|
@@ -1089,7 +1441,8 @@ export async function auditSource(source, options) {
|
|
|
1089
1441
|
spentTodayUsd = 0;
|
|
1090
1442
|
}
|
|
1091
1443
|
}
|
|
1092
|
-
|
|
1444
|
+
throwIfAborted();
|
|
1445
|
+
const outcome = await triageFindings(enrichedFindings, summary.pageCount, {
|
|
1093
1446
|
enabled: true,
|
|
1094
1447
|
model: resolved.model,
|
|
1095
1448
|
providerId: resolved.providerId,
|
|
@@ -1124,9 +1477,9 @@ export async function auditSource(source, options) {
|
|
|
1124
1477
|
runId,
|
|
1125
1478
|
timestamp: new Date().toISOString(),
|
|
1126
1479
|
durationMs: Date.now() - runStartedAt,
|
|
1127
|
-
score: summary.
|
|
1480
|
+
score: summary.risk,
|
|
1128
1481
|
pageCount: summary.pageCount,
|
|
1129
|
-
findingCount:
|
|
1482
|
+
findingCount: enrichedFindings.length,
|
|
1130
1483
|
...(summary.rawFindingCount !== undefined && { rawFindingCount: summary.rawFindingCount }),
|
|
1131
1484
|
...(summary.templateDetected !== undefined && { templateDetected: summary.templateDetected }),
|
|
1132
1485
|
...(summary.cacheStats && { cacheStats: summary.cacheStats }),
|
|
@@ -1181,7 +1534,19 @@ export async function auditSource(source, options) {
|
|
|
1181
1534
|
}
|
|
1182
1535
|
const aiHintEnabled = options?.ai?.suggest !== false;
|
|
1183
1536
|
if (aiHintEnabled && !options?.ai?.enabled && process.env.ANTHROPIC_API_KEY) {
|
|
1184
|
-
console.error(`💡 AI triage available — re-run with --ai to prioritize ${
|
|
1537
|
+
console.error(`💡 AI triage available — re-run with --ai to prioritize ${enrichedFindings.length} findings into a fix list.`);
|
|
1538
|
+
}
|
|
1539
|
+
if (cacheConfig && cacheMaxBytes > 0) {
|
|
1540
|
+
try {
|
|
1541
|
+
const pruneResult = await pruneCache(cacheConfig.dir, cacheMaxBytes);
|
|
1542
|
+
if (pruneResult.removedEntries > 0 || pruneResult.removedTmpFiles > 0) {
|
|
1543
|
+
const freedMb = ((pruneResult.before.bytes - pruneResult.after.bytes) / 1024 / 1024).toFixed(1);
|
|
1544
|
+
console.error(`pseolint: cache prune freed ${freedMb} MB (${pruneResult.removedEntries} entries, ${pruneResult.removedTmpFiles} .tmp files); size=${(pruneResult.after.bytes / 1024 / 1024).toFixed(1)}MB / cap=${(cacheMaxBytes / 1024 / 1024).toFixed(0)}MB`);
|
|
1545
|
+
}
|
|
1546
|
+
}
|
|
1547
|
+
catch {
|
|
1548
|
+
// Non-fatal: eviction failure must not break the audit.
|
|
1549
|
+
}
|
|
1185
1550
|
}
|
|
1186
1551
|
return summary;
|
|
1187
1552
|
}
|