@pseolint/core 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -1
- package/dist/ai/triage.d.ts.map +1 -1
- package/dist/ai/triage.js +8 -1
- package/dist/ai/triage.js.map +1 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +566 -136
- package/dist/auditor.js.map +1 -1
- package/dist/backpressure.d.ts +68 -0
- package/dist/backpressure.d.ts.map +1 -0
- package/dist/backpressure.js +81 -0
- package/dist/backpressure.js.map +1 -0
- package/dist/cache.d.ts +73 -0
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +258 -19
- package/dist/cache.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +1 -14
- package/dist/enrich-findings.js.map +1 -1
- package/dist/fetch-observer.d.ts +97 -0
- package/dist/fetch-observer.d.ts.map +1 -0
- package/dist/fetch-observer.js +124 -0
- package/dist/fetch-observer.js.map +1 -0
- package/dist/formatters/console.d.ts +7 -9
- package/dist/formatters/console.d.ts.map +1 -1
- package/dist/formatters/console.js +218 -254
- package/dist/formatters/console.js.map +1 -1
- package/dist/formatters/html.d.ts +5 -1
- package/dist/formatters/html.d.ts.map +1 -1
- package/dist/formatters/html.js +352 -570
- package/dist/formatters/html.js.map +1 -1
- package/dist/formatters/index.d.ts +4 -1
- package/dist/formatters/index.d.ts.map +1 -1
- package/dist/formatters/index.js +1 -1
- package/dist/formatters/index.js.map +1 -1
- package/dist/formatters/json.d.ts +11 -1
- package/dist/formatters/json.d.ts.map +1 -1
- package/dist/formatters/json.js +5 -1
- package/dist/formatters/json.js.map +1 -1
- package/dist/formatters/markdown.d.ts +7 -1
- package/dist/formatters/markdown.d.ts.map +1 -1
- package/dist/formatters/markdown.js +77 -70
- package/dist/formatters/markdown.js.map +1 -1
- package/dist/index.d.ts +13 -8
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -7
- package/dist/index.js.map +1 -1
- package/dist/page-filter.d.ts +50 -0
- package/dist/page-filter.d.ts.map +1 -0
- package/dist/page-filter.js +86 -0
- package/dist/page-filter.js.map +1 -0
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +0 -6
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/content/unique-value.d.ts.map +1 -1
- package/dist/rules/content/unique-value.js +1 -0
- package/dist/rules/content/unique-value.js.map +1 -1
- package/dist/rules/scope.d.ts.map +1 -1
- package/dist/rules/scope.js +6 -14
- package/dist/rules/scope.js.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.d.ts +9 -1
- package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.js +14 -5
- package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
- package/dist/safe-mode-preset.d.ts +27 -0
- package/dist/safe-mode-preset.d.ts.map +1 -0
- package/dist/safe-mode-preset.js +54 -0
- package/dist/safe-mode-preset.js.map +1 -0
- package/dist/site-classifier.d.ts +83 -0
- package/dist/site-classifier.d.ts.map +1 -0
- package/dist/site-classifier.js +205 -0
- package/dist/site-classifier.js.map +1 -0
- package/dist/ssrf-guard.d.ts +96 -0
- package/dist/ssrf-guard.d.ts.map +1 -0
- package/dist/ssrf-guard.js +268 -0
- package/dist/ssrf-guard.js.map +1 -0
- package/dist/types.d.ts +202 -19
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -1
- package/dist/types.js.map +1 -1
- package/package.json +2 -2
package/dist/auditor.js
CHANGED
|
@@ -2,9 +2,9 @@ import { createHash } from "node:crypto";
|
|
|
2
2
|
import { readdir, readFile, stat } from "node:fs/promises";
|
|
3
3
|
import { extname, join, resolve } from "node:path";
|
|
4
4
|
import { parseHtmlPage } from "./parser.js";
|
|
5
|
+
import { pageSkipReason } from "./page-filter.js";
|
|
5
6
|
import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
|
|
6
7
|
import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
|
|
7
|
-
import { headingUniquenessRule } from "./rules/content/heading-uniqueness.js";
|
|
8
8
|
import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
|
|
9
9
|
import { missingAuthorRule } from "./rules/content/missing-author.js";
|
|
10
10
|
import { uniqueValueRule } from "./rules/content/unique-value.js";
|
|
@@ -18,12 +18,10 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
|
|
|
18
18
|
import { deadEndsRule } from "./rules/links/dead-ends.js";
|
|
19
19
|
import { linkDepthRule } from "./rules/links/link-depth.js";
|
|
20
20
|
import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
|
|
21
|
-
import { hubPagesRule } from "./rules/links/hub-pages.js";
|
|
22
21
|
import { orphanPagesRule } from "./rules/links/orphan-pages.js";
|
|
23
22
|
import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
|
|
24
23
|
import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
|
|
25
24
|
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
26
|
-
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
27
25
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
28
26
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
29
27
|
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
|
|
@@ -33,7 +31,6 @@ import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
|
|
|
33
31
|
import { faqCoverageRule } from "./rules/aeo/faq-coverage.js";
|
|
34
32
|
import { answerFirstRule } from "./rules/aeo/answer-first.js";
|
|
35
33
|
import { citableFactsRule } from "./rules/aeo/citable-facts.js";
|
|
36
|
-
import { nonReplicableValueRule } from "./rules/aeo/non-replicable-value.js";
|
|
37
34
|
import { contentModularityRule } from "./rules/aeo/content-modularity.js";
|
|
38
35
|
import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
|
|
39
36
|
import { redirectChainRule } from "./rules/tech/redirect-chain.js";
|
|
@@ -41,8 +38,6 @@ import { soft404Rule } from "./rules/tech/soft-404.js";
|
|
|
41
38
|
import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
|
|
42
39
|
import { requiredFieldsRule } from "./rules/schema/required-fields.js";
|
|
43
40
|
import { schemaConsistencyRule } from "./rules/schema/consistency.js";
|
|
44
|
-
import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
|
|
45
|
-
import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
|
|
46
41
|
import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
|
|
47
42
|
import { templateCoverageRule } from "./rules/spam/template-coverage.js";
|
|
48
43
|
import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
|
|
@@ -54,8 +49,14 @@ import { triageFindings } from "./ai/triage.js";
|
|
|
54
49
|
import { createLanguageModel } from "./ai/adapters/index.js";
|
|
55
50
|
import { promptTriageFeedback } from "./ai/feedback-prompt.js";
|
|
56
51
|
import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
|
|
57
|
-
import {
|
|
52
|
+
import { SCHEMA_VERSION } from "./types.js";
|
|
53
|
+
import { cachedFetch, pruneCache } from "./cache.js";
|
|
54
|
+
import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
|
|
55
|
+
import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
|
|
56
|
+
import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
|
|
57
|
+
import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
|
|
58
58
|
import { stratifiedSample } from "./stratified-sample.js";
|
|
59
|
+
import { classifySite } from "./site-classifier.js";
|
|
59
60
|
import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
|
|
60
61
|
const DEFAULTS = {
|
|
61
62
|
nearDuplicateThreshold: 0.85,
|
|
@@ -67,10 +68,6 @@ const DEFAULTS = {
|
|
|
67
68
|
uniqueValueMinWords: 100,
|
|
68
69
|
metaUniquenessMinJaccard: 0.9,
|
|
69
70
|
linkDepthMaxClicks: 3,
|
|
70
|
-
hubPagesMinSiblings: 4,
|
|
71
|
-
hubPagesMaxSiblings: 50,
|
|
72
|
-
titleOverlapThreshold: 0.8,
|
|
73
|
-
keywordCollisionMinShared: 6,
|
|
74
71
|
templateCoverageMinPages: 5,
|
|
75
72
|
answerFirstMaxWords: 100,
|
|
76
73
|
citableFactsMin: 3,
|
|
@@ -80,18 +77,82 @@ const DEFAULTS = {
|
|
|
80
77
|
modularityMinSelfContainedRatio: 0.7,
|
|
81
78
|
faqMinQuestionHeadings: 2
|
|
82
79
|
};
|
|
80
|
+
/**
|
|
81
|
+
* v0.4 four-category weights. Audit is diagnostic-only (weight 0).
|
|
82
|
+
* See 2026-04-29 v0.4 redesign spec §4.2.
|
|
83
|
+
*/
|
|
83
84
|
const CATEGORY_WEIGHTS = {
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
data: 0.06,
|
|
90
|
-
schema: 0.05,
|
|
91
|
-
cannibal: 0.05,
|
|
92
|
-
/** Dedup / crawl hygiene; does not affect composite score. */
|
|
93
|
-
audit: 0
|
|
85
|
+
integrity: 0.50, // spam + content + cannibal
|
|
86
|
+
discoverability: 0.20, // links + tech
|
|
87
|
+
citation: 0.25, // aeo + schema
|
|
88
|
+
data: 0.05, // data
|
|
89
|
+
audit: 0, // diagnostics, never weighted
|
|
94
90
|
};
|
|
91
|
+
/**
|
|
92
|
+
* Maps the v0.3 ruleId namespace prefix to the v0.4 four-bucket category.
|
|
93
|
+
* Used by `scoreFromFindings` to bucket findings without changing rule IDs.
|
|
94
|
+
*/
|
|
95
|
+
const CATEGORY_MAP = {
|
|
96
|
+
spam: "integrity",
|
|
97
|
+
content: "integrity",
|
|
98
|
+
cannibal: "integrity",
|
|
99
|
+
links: "discoverability",
|
|
100
|
+
tech: "discoverability",
|
|
101
|
+
aeo: "citation",
|
|
102
|
+
schema: "citation",
|
|
103
|
+
data: "data",
|
|
104
|
+
audit: "audit",
|
|
105
|
+
};
|
|
106
|
+
/** Slug map for `RuleResult.docsUrl`. Defaults to the rule-id segment after the `/`. */
|
|
107
|
+
const RULE_DOCS_SLUG = {
|
|
108
|
+
// intentionally empty for v0.4 — slug = ruleId.split("/").pop() works for every shipped rule
|
|
109
|
+
};
|
|
110
|
+
/**
 * Build the documentation URL for a rule.
 *
 * The slug is taken from `RULE_DOCS_SLUG` when that map has a non-nullish
 * entry for the rule id; otherwise it is the last `/`-separated segment of
 * the rule id itself.
 *
 * @param {string} ruleId Rule identifier, e.g. `tech/soft-404`.
 * @returns {string} URL under `https://pseolint.dev/rules/`.
 */
function docsUrlFor(ruleId) {
    const override = RULE_DOCS_SLUG[ruleId];
    if (override !== undefined && override !== null) {
        return `https://pseolint.dev/rules/${override}`;
    }
    const segments = ruleId.split("/");
    const lastSegment = segments[segments.length - 1];
    return `https://pseolint.dev/rules/${lastSegment ?? ruleId}`;
}
|
|
114
|
+
/**
 * Verdict ladder thresholds — see spec §4.4.
 *
 * Each rung is an inclusive upper bound: risk up to 20 is "ready", up to 40
 * "caution", up to 60 "concerning". Anything else (including values that
 * compare false against every bound) is "critical".
 *
 * @param {number} risk Composite risk score.
 * @returns {"ready" | "caution" | "concerning" | "critical"} Verdict label.
 */
function verdictForRisk(risk) {
    const ladder = [
        [20, "ready"],
        [40, "caution"],
        [60, "concerning"],
    ];
    for (const [ceiling, verdict] of ladder) {
        if (risk <= ceiling) {
            return verdict;
        }
    }
    return "critical";
}
|
|
124
|
+
/**
 * Convert a raw penalty into a letter grade.
 *
 * Cutoffs are inclusive upper bounds: up to 20 is "A", up to 40 "B", up to 60
 * "C", up to 80 "D"; everything else is "F".
 *
 * @param {number} penalty Raw penalty for one category.
 * @returns {"A" | "B" | "C" | "D" | "F"} Letter grade.
 */
function gradeForPenalty(penalty) {
    const cutoffs = [20, 40, 60, 80];
    const grades = ["A", "B", "C", "D"];
    const rung = cutoffs.findIndex((cutoff) => penalty <= cutoff);
    return rung === -1 ? "F" : grades[rung];
}
|
|
135
|
+
/**
 * True for `text/html` and `application/xhtml+xml` only (treat as
 * audit-eligible content).
 *
 * A missing or empty content type is also accepted: local files and unknown
 * sources are assumed to be HTML. Matching is case-insensitive and
 * substring-based, so parameters such as `; charset=utf-8` are tolerated.
 *
 * @param {string | undefined | null} contentType Raw Content-Type value.
 * @returns {boolean} Whether the content should be treated as HTML.
 */
function isHtmlContentType(contentType) {
    if (!contentType) {
        return true;
    }
    const normalized = contentType.toLowerCase();
    const htmlMimeTypes = ["text/html", "application/xhtml+xml"];
    return htmlMimeTypes.some((mime) => normalized.includes(mime));
}
|
|
142
|
+
/**
 * Glob match against a URL pathname only (not the full URL). v0.4 spec §4.5.
 *
 * `urlOrPath` is parsed as a URL to extract its pathname; when parsing fails
 * it is treated as a path and given a leading slash if it lacks one. The
 * pattern is tried first in a slash-anchored form (unless it already starts
 * with `/` or `*`) and then verbatim.
 *
 * @param {string} pattern Glob pattern, with or without a leading `/`.
 * @param {string} urlOrPath Absolute URL or bare path.
 * @returns {*} Result of `matchGlob` for the first form that matches,
 *   otherwise the result for the verbatim pattern.
 */
function globMatchPathname(pattern, urlOrPath) {
    const toPathname = (value) => {
        try {
            return new URL(value).pathname;
        }
        catch {
            // Not a URL — already a path; force a leading slash for consistency.
            return value.startsWith("/") ? value : `/${value}`;
        }
    };
    const pathname = toPathname(urlOrPath);
    const isAnchored = pattern.startsWith("/") || pattern.startsWith("*");
    const normPattern = isAnchored ? pattern : `/${pattern}`;
    const anchoredResult = matchGlob(normPattern, pathname);
    if (anchoredResult) {
        return anchoredResult;
    }
    return matchGlob(pattern, pathname);
}
|
|
95
156
|
const DEFAULT_ENTITY_PATTERNS = [
|
|
96
157
|
{
|
|
97
158
|
placeholder: "[STATE]",
|
|
@@ -112,7 +173,16 @@ function resolveGroupRules(baseRules, overrides) {
|
|
|
112
173
|
}
|
|
113
174
|
return result;
|
|
114
175
|
}
|
|
115
|
-
function runRulesOnPages(pages,
|
|
176
|
+
function runRulesOnPages(pages,
|
|
177
|
+
/**
|
|
178
|
+
* Full set of parsed pages including those filtered out by `respectNoindex`
|
|
179
|
+
* / `skipDetectedAuth`. Defaults to `pages` for backwards compat. The two
|
|
180
|
+
* noindex-conflict rules (`tech/canonical-noindex-conflict`,
|
|
181
|
+
* `tech/robots-noindex-conflict`) read this list specifically — without it,
|
|
182
|
+
* `respectNoindex: true` would hide noindex'd pages from the very rules
|
|
183
|
+
* designed to flag accidental noindex'ing.
|
|
184
|
+
*/
|
|
185
|
+
noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
|
|
116
186
|
const findings = [];
|
|
117
187
|
const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
|
|
118
188
|
const tag = (results) => results.map((r) => {
|
|
@@ -156,9 +226,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
156
226
|
if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
|
|
157
227
|
findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
|
|
158
228
|
}
|
|
159
|
-
if (isEnabled("content/heading-uniqueness") && modeOk("content/heading-uniqueness")) {
|
|
160
|
-
findings.push(...tag(headingUniquenessRule(pages, entityPatterns)));
|
|
161
|
-
}
|
|
162
229
|
if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
|
|
163
230
|
findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
|
|
164
231
|
}
|
|
@@ -183,18 +250,15 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
183
250
|
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
184
251
|
findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
|
|
185
252
|
}
|
|
186
|
-
if (isEnabled("links/hub-pages") && modeOk("links/hub-pages")) {
|
|
187
|
-
findings.push(...tag(hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings)));
|
|
188
|
-
}
|
|
189
253
|
// Tech rules
|
|
190
254
|
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
191
255
|
findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
192
256
|
}
|
|
193
257
|
if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
|
|
194
|
-
findings.push(...tag(canonicalNoindexConflictRule(
|
|
258
|
+
findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
|
|
195
259
|
}
|
|
196
260
|
if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
|
|
197
|
-
findings.push(...tag(robotsNoindexConflictRule(
|
|
261
|
+
findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
|
|
198
262
|
}
|
|
199
263
|
if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
|
|
200
264
|
findings.push(...tag(redirectChainRule(pages)));
|
|
@@ -202,11 +266,10 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
202
266
|
if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
|
|
203
267
|
findings.push(...tag(soft404Rule(pages)));
|
|
204
268
|
}
|
|
205
|
-
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
206
|
-
findings.push(...tag(ogCompletenessRule(pages)));
|
|
207
|
-
}
|
|
208
269
|
if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
|
|
209
|
-
|
|
270
|
+
// hreflang declarations on noindex'd pages are still bugs when they're
|
|
271
|
+
// inconsistent — see auditor.test.ts "emits technical SEO findings".
|
|
272
|
+
findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
|
|
210
273
|
}
|
|
211
274
|
// Schema rules
|
|
212
275
|
if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
|
|
@@ -240,9 +303,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
240
303
|
targetFactsPerPage: resolvedRules.citableFactsTarget,
|
|
241
304
|
})));
|
|
242
305
|
}
|
|
243
|
-
if (isEnabled("aeo/non-replicable-value")) {
|
|
244
|
-
findings.push(...tag(nonReplicableValueRule(pages)));
|
|
245
|
-
}
|
|
246
306
|
if (isEnabled("aeo/content-modularity")) {
|
|
247
307
|
findings.push(...tag(contentModularityRule(pages, {
|
|
248
308
|
maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
|
|
@@ -252,13 +312,9 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
252
312
|
if (isEnabled("aeo/summary-bait")) {
|
|
253
313
|
findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
|
|
254
314
|
}
|
|
255
|
-
// Cannibal rules
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
}
|
|
259
|
-
if (isEnabled("cannibal/keyword-collision") && modeOk("cannibal/keyword-collision")) {
|
|
260
|
-
findings.push(...tag(keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared)));
|
|
261
|
-
}
|
|
315
|
+
// Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
|
|
316
|
+
// keyword-collision dropped due to high false-positive rates; see
|
|
317
|
+
// 2026-04-29 v0.4 redesign spec §4.3).
|
|
262
318
|
if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
|
|
263
319
|
findings.push(...tag(urlPatternRule(pages)));
|
|
264
320
|
}
|
|
@@ -267,54 +323,110 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
267
323
|
/**
 * Compute the SHA-256 digest of an HTML string.
 *
 * @param {string} html Markup to hash; encoded as UTF-8 before hashing.
 * @returns {string} Lowercase hexadecimal digest.
 */
function hashHtml(html) {
    const hasher = createHash("sha256");
    hasher.update(html, "utf8");
    return hasher.digest("hex");
}
|
|
326
|
+
const SEVERITY_WEIGHTS = {
|
|
327
|
+
critical: 40,
|
|
328
|
+
error: 25,
|
|
329
|
+
warning: 12,
|
|
330
|
+
info: 5,
|
|
331
|
+
};
|
|
270
332
|
/**
 * Aggregate findings into the v0.4 score summary.
 *
 * Each finding is bucketed by the namespace prefix of its `ruleId` (the text
 * before the first `/`) via `CATEGORY_MAP`. Findings whose namespace is not
 * an own key of that map are ignored. `audit` findings are diagnostic-only:
 * they contribute neither penalty nor issue counts.
 *
 * For every other finding the severity weight from `SEVERITY_WEIGHTS` is
 * added to its bucket's raw penalty (capped at 100 per bucket). The composite
 * `risk` is the `CATEGORY_WEIGHTS`-weighted sum of the four bucket penalties,
 * capped at 100 and rounded.
 *
 * @param {Iterable<{ruleId: string, severity: string}>} findings Rule results.
 * @returns {{
 *   risk: number,
 *   categories: Record<string, {grade: string, issues: number}>,
 *   bucketCounts: {blockers: number, shouldFix: number, informational: number}
 * }} Composite risk, per-category grade and issue count, and severity tallies.
 */
function scoreFromFindings(findings) {
    // Own-property lookups only: a rule id such as "constructor/x" must not
    // resolve to an inherited Object.prototype member.
    const hasOwn = (table, key) => Object.prototype.hasOwnProperty.call(table, key);
    const penalties = { integrity: 0, discoverability: 0, citation: 0, data: 0 };
    const issueCounts = { integrity: 0, discoverability: 0, citation: 0, data: 0 };
    let blockers = 0;
    let shouldFix = 0;
    let informational = 0;
    for (const finding of findings) {
        const namespace = finding.ruleId.split("/")[0];
        if (!hasOwn(CATEGORY_MAP, namespace)) {
            continue;
        }
        const bucket = CATEGORY_MAP[namespace];
        // audit/* findings are diagnostics and never affect score or counts.
        if (!bucket || bucket === "audit") {
            continue;
        }
        // An unrecognised severity is tallied as informational below, so it
        // is weighted as `info` here rather than poisoning the sum with NaN.
        const weight = hasOwn(SEVERITY_WEIGHTS, finding.severity)
            ? SEVERITY_WEIGHTS[finding.severity]
            : SEVERITY_WEIGHTS.info;
        penalties[bucket] = Math.min(100, penalties[bucket] + weight);
        issueCounts[bucket] += 1;
        if (finding.severity === "critical" || finding.severity === "error") {
            blockers += 1;
        }
        else if (finding.severity === "warning") {
            shouldFix += 1;
        }
        else {
            informational += 1;
        }
    }
    const weighted = penalties.integrity * CATEGORY_WEIGHTS.integrity +
        penalties.discoverability * CATEGORY_WEIGHTS.discoverability +
        penalties.citation * CATEGORY_WEIGHTS.citation +
        penalties.data * CATEGORY_WEIGHTS.data;
    const risk = Math.round(Math.min(100, weighted));
    const summarize = (name) => ({
        grade: gradeForPenalty(penalties[name]),
        issues: issueCounts[name],
    });
    const categories = {
        integrity: summarize("integrity"),
        discoverability: summarize("discoverability"),
        citation: summarize("citation"),
        data: summarize("data"),
        // Diagnostics are never graded down.
        audit: { grade: "A", issues: 0 },
    };
    return {
        risk,
        categories,
        bucketCounts: { blockers, shouldFix, informational },
    };
}
|
|
390
|
+
/**
 * Split findings into the three issue buckets by severity.
 *
 * `critical` and `error` findings are blockers, `warning` findings are
 * should-fix, and every other severity is informational. Findings whose
 * `ruleId` starts with `audit/` are diagnostics and are left out entirely.
 * Input order is preserved within each bucket.
 *
 * @param {Iterable<{ruleId: string, severity: string}>} findings Rule results.
 * @returns {{blockers: object[], shouldFix: object[], informational: object[]}}
 *   The same finding objects, grouped.
 */
function bucketIssues(findings) {
    const grouped = { blockers: [], shouldFix: [], informational: [] };
    const bucketNameFor = (severity) => {
        switch (severity) {
            case "critical":
            case "error":
                return "blockers";
            case "warning":
                return "shouldFix";
            default:
                return "informational";
        }
    };
    for (const finding of findings) {
        if (finding.ruleId.startsWith("audit/")) {
            continue;
        }
        grouped[bucketNameFor(finding.severity)].push(finding);
    }
    return grouped;
}
|
|
407
|
+
/**
 * Produce the one-line summary for an audit from its issue counts.
 *
 * Blockers and should-fix counts are listed when non-zero. The informational
 * count is appended only when fewer than two segments have been emitted, so
 * the headline never has more than two parts.
 *
 * @param {{blockers: number, shouldFix: number, informational: number}} counts
 *   Issue tallies.
 * @returns {string} Comma-separated summary, or "No issues detected.".
 */
function buildHeadline(counts) {
    const { blockers, shouldFix, informational } = counts;
    const segments = [];
    if (blockers > 0) {
        const plural = blockers === 1 ? "" : "s";
        segments.push(`${blockers} ship-blocker${plural}`);
    }
    if (shouldFix > 0) {
        segments.push(`${shouldFix} should-fix`);
    }
    const hasRoom = segments.length < 2;
    if (hasRoom && informational > 0) {
        segments.push(`${informational} informational`);
    }
    return segments.length > 0 ? segments.join(", ") : "No issues detected.";
}
|
|
422
|
+
/**
 * Populate `docsUrl` on every finding that doesn't already have one.
 *
 * Findings are mutated in place; a finding with a truthy `docsUrl` is left
 * untouched, any other gets the URL from `docsUrlFor(ruleId)`.
 *
 * @param {Iterable<{ruleId: string, docsUrl?: string}>} findings Rule results.
 * @returns {*} The same `findings` value that was passed in.
 */
function withDocsUrls(findings) {
    for (const finding of findings) {
        if (finding.docsUrl) {
            continue;
        }
        finding.docsUrl = docsUrlFor(finding.ruleId);
    }
    return findings;
}
|
|
318
430
|
async function collectHtmlFiles(directory) {
|
|
319
431
|
const entries = await readdir(directory, { withFileTypes: true });
|
|
320
432
|
const files = await Promise.all(entries.map(async (entry) => {
|
|
@@ -330,10 +442,30 @@ async function collectHtmlFiles(directory) {
|
|
|
330
442
|
}));
|
|
331
443
|
return files.flat();
|
|
332
444
|
}
|
|
333
|
-
|
|
445
|
+
/**
 * Combine any number of AbortSignals into one. The returned signal aborts as
 * soon as any input aborts, carrying that input's `reason`.
 *
 * Does not rely on `AbortSignal.any`, for compatibility with runtimes that
 * lack it. Nullish/falsy entries are ignored; with no usable inputs the
 * result is a fresh signal that never aborts.
 *
 * Listener hygiene: at most one listener is attached per input, none are
 * attached when an input is already aborted, and once any input fires the
 * listeners on all the others are removed. Listeners on inputs that never
 * abort remain for the lifetime of those inputs.
 *
 * @param {...(AbortSignal | null | undefined)} signals Signals to combine.
 * @returns {AbortSignal} Signal that aborts when the first input aborts.
 */
function composeSignals(...signals) {
    const live = signals.filter(Boolean);
    const controller = new AbortController();
    if (live.length === 0) {
        return controller.signal;
    }
    // Check for an already-aborted input BEFORE attaching anything, so the
    // early return cannot leave listeners dangling on earlier inputs.
    const alreadyAborted = live.find((candidate) => candidate.aborted);
    if (alreadyAborted) {
        controller.abort(alreadyAborted.reason);
        return controller.signal;
    }
    const detachers = [];
    const detachAll = () => {
        for (const detach of detachers) {
            detach();
        }
        detachers.length = 0;
    };
    for (const source of live) {
        const onAbort = () => {
            // The first abort wins; the sibling listeners are no longer needed.
            detachAll();
            controller.abort(source.reason);
        };
        source.addEventListener("abort", onAbort, { once: true });
        detachers.push(() => source.removeEventListener("abort", onAbort));
    }
    return controller.signal;
}
|
|
465
|
+
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
|
|
334
466
|
try {
|
|
335
467
|
stats.total += 1;
|
|
336
|
-
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
468
|
+
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
|
|
337
469
|
if (r.fromCache) {
|
|
338
470
|
stats.hits += 1;
|
|
339
471
|
stats.bytesSavedEstimate += r.body.length;
|
|
@@ -342,14 +474,16 @@ async function fetchWithRetry(url, timeoutMs, cache, stats) {
|
|
|
342
474
|
return null;
|
|
343
475
|
return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
|
|
344
476
|
}
|
|
345
|
-
catch {
|
|
477
|
+
catch (err) {
|
|
478
|
+
if (signal?.aborted)
|
|
479
|
+
throw err; // propagate cancellation
|
|
346
480
|
return null;
|
|
347
481
|
}
|
|
348
482
|
}
|
|
349
|
-
async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
|
|
483
|
+
async function fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects = true) {
|
|
350
484
|
try {
|
|
351
485
|
stats.total += 1;
|
|
352
|
-
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
486
|
+
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, followRedirects, onObservation: stats.onObservation });
|
|
353
487
|
if (r.fromCache) {
|
|
354
488
|
stats.hits += 1;
|
|
355
489
|
stats.bytesSavedEstimate += r.body.length;
|
|
@@ -366,13 +500,15 @@ async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
|
|
|
366
500
|
},
|
|
367
501
|
};
|
|
368
502
|
}
|
|
369
|
-
catch {
|
|
503
|
+
catch (err) {
|
|
504
|
+
if (signal?.aborted)
|
|
505
|
+
throw err;
|
|
370
506
|
return null;
|
|
371
507
|
}
|
|
372
508
|
}
|
|
373
|
-
async function fetchTextStrict(url, timeoutMs, cache, stats) {
|
|
509
|
+
async function fetchTextStrict(url, timeoutMs, cache, stats, signal, validateHop) {
|
|
374
510
|
stats.total += 1;
|
|
375
|
-
const r = await cachedFetch(url, { timeoutMs, cache });
|
|
511
|
+
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
|
|
376
512
|
if (r.fromCache) {
|
|
377
513
|
stats.hits += 1;
|
|
378
514
|
stats.bytesSavedEstimate += r.body.length;
|
|
@@ -455,8 +591,13 @@ function matchGlob(pattern, value) {
|
|
|
455
591
|
function shouldIgnore(url, patterns) {
|
|
456
592
|
if (patterns.length === 0)
|
|
457
593
|
return false;
|
|
594
|
+
// v0.4 §4.5: globs match against the URL pathname only, NOT the full URL.
|
|
595
|
+
// Operator intuition: `ignore: ["dashboard/**"]` should match
|
|
596
|
+
// `https://example.com/dashboard/...` even though the full URL contains the
|
|
597
|
+
// host. Previously globs matched the full URL and silently failed for users
|
|
598
|
+
// who didn't think to write `**/dashboard/**`.
|
|
458
599
|
for (const pattern of patterns) {
|
|
459
|
-
if (
|
|
600
|
+
if (globMatchPathname(pattern, url))
|
|
460
601
|
return true;
|
|
461
602
|
}
|
|
462
603
|
return false;
|
|
@@ -469,7 +610,7 @@ function fisherYatesSample(items, n) {
|
|
|
469
610
|
}
|
|
470
611
|
return arr.slice(arr.length - n);
|
|
471
612
|
}
|
|
472
|
-
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats) {
|
|
613
|
+
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
|
|
473
614
|
visited.add(sitemapUrl);
|
|
474
615
|
const locs = parseSitemapUrls(sitemapText);
|
|
475
616
|
if (!isSitemapIndex(sitemapText)) {
|
|
@@ -477,27 +618,32 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
477
618
|
}
|
|
478
619
|
const allUrls = [];
|
|
479
620
|
for (const childUrl of locs) {
|
|
621
|
+
if (signal?.aborted)
|
|
622
|
+
throw signal.reason ?? new Error("aborted");
|
|
480
623
|
if (visited.has(childUrl))
|
|
481
624
|
continue;
|
|
482
|
-
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats);
|
|
625
|
+
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
483
626
|
if (!child)
|
|
484
627
|
continue;
|
|
485
628
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
486
629
|
if (!childLike)
|
|
487
630
|
continue;
|
|
488
|
-
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats);
|
|
631
|
+
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
489
632
|
allUrls.push(...childUrls);
|
|
490
633
|
}
|
|
491
634
|
return allUrls;
|
|
492
635
|
}
|
|
493
|
-
async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
|
|
636
|
+
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
494
637
|
if (!origin)
|
|
495
638
|
return { disallow: [], crawlDelaySec: 0 };
|
|
496
639
|
try {
|
|
497
640
|
const robotsUrl = `${origin}/robots.txt`;
|
|
498
|
-
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
|
|
641
|
+
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
642
|
+
// Honor both the wildcard block AND any block specifically targeting us.
|
|
643
|
+
// A malicious target can't bypass our crawler by adding a targeted
|
|
644
|
+
// `User-agent: pseolint / Disallow: /` without a wildcard.
|
|
499
645
|
return {
|
|
500
|
-
disallow: parseDisallowPatterns(fetched.text),
|
|
646
|
+
disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
|
|
501
647
|
crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
|
|
502
648
|
};
|
|
503
649
|
}
|
|
@@ -518,13 +664,42 @@ function isDisallowedByRobots(urlPath, patterns) {
|
|
|
518
664
|
/**
 * Report whether a budget has been used up.
 *
 * A cap that is not greater than zero means the budget is unlimited, so it
 * is never exceeded.
 *
 * @param {{used: number, cap: number}} b Budget counter.
 * @returns {boolean} True when a positive cap exists and `used` has reached it.
 */
function budgetExceeded(b) {
    const isCapped = b.cap > 0;
    if (!isCapped) {
        return false;
    }
    return b.used >= b.cap;
}
|
|
521
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
|
|
667
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
|
|
668
|
+
// Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
|
|
669
|
+
// audit (source, sitemap entries, redirects, discovered links) goes through
|
|
670
|
+
// this. DNS is hit once per unique hostname per audit — a 4k-page audit on
|
|
671
|
+
// one origin does 1 DNS lookup, not 4k.
|
|
672
|
+
const ssrfCache = new Map();
|
|
673
|
+
const validateHop = guardSsrf
|
|
674
|
+
? async (u) => {
|
|
675
|
+
let host;
|
|
676
|
+
try {
|
|
677
|
+
host = new URL(u).hostname;
|
|
678
|
+
}
|
|
679
|
+
catch {
|
|
680
|
+
throw new Error(`Refusing to fetch invalid URL: ${u}`);
|
|
681
|
+
}
|
|
682
|
+
let pending = ssrfCache.get(host);
|
|
683
|
+
if (!pending) {
|
|
684
|
+
pending = validateTargetHost(host).catch((err) => {
|
|
685
|
+
if (err instanceof SSRFError) {
|
|
686
|
+
throw new Error(`Refusing to fetch ${u}: ${err.reason}`);
|
|
687
|
+
}
|
|
688
|
+
throw err;
|
|
689
|
+
});
|
|
690
|
+
ssrfCache.set(host, pending);
|
|
691
|
+
}
|
|
692
|
+
await pending;
|
|
693
|
+
}
|
|
694
|
+
: undefined;
|
|
522
695
|
if (/^https?:\/\//i.test(source)) {
|
|
696
|
+
if (validateHop)
|
|
697
|
+
await validateHop(source);
|
|
523
698
|
let text;
|
|
524
699
|
let contentType;
|
|
525
700
|
let sourceStatus = 200;
|
|
526
701
|
try {
|
|
527
|
-
const fetched = await fetchTextStrict(source, timeoutMs, cache, stats);
|
|
702
|
+
const fetched = await fetchTextStrict(source, timeoutMs, cache, stats, signal, validateHop);
|
|
528
703
|
text = fetched.text;
|
|
529
704
|
contentType = fetched.contentType;
|
|
530
705
|
}
|
|
@@ -533,7 +708,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
533
708
|
if (source.includes("sitemap")) {
|
|
534
709
|
try {
|
|
535
710
|
const origin = new URL(source).origin;
|
|
536
|
-
const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats);
|
|
711
|
+
const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats, signal, validateHop);
|
|
537
712
|
text = fallback.text;
|
|
538
713
|
contentType = fallback.contentType;
|
|
539
714
|
sourceStatus = -1; // flag that we fell back
|
|
@@ -549,7 +724,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
549
724
|
const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
|
|
550
725
|
if (isXml) {
|
|
551
726
|
const visited = new Set();
|
|
552
|
-
const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats);
|
|
727
|
+
const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
553
728
|
// If we have a budget, sample from sitemap URLs before fetching
|
|
554
729
|
const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
|
|
555
730
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
@@ -562,13 +737,29 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
562
737
|
catch {
|
|
563
738
|
return "";
|
|
564
739
|
} })();
|
|
565
|
-
const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
|
|
740
|
+
const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
|
|
566
741
|
const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
|
|
567
742
|
const delayMs = robots.crawlDelaySec * 1000;
|
|
568
743
|
await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
|
|
569
744
|
if (budgetExceeded(byteBudget))
|
|
570
745
|
return;
|
|
571
|
-
|
|
746
|
+
// Honor robots.txt for our own crawl when respectRobotsTxt is on (default).
|
|
747
|
+
// The existing robotsComplianceRule flags sitemap-vs-robots conflicts as
|
|
748
|
+
// findings; this actually refuses to fetch the disallowed URL. Keeps us
|
|
749
|
+
// legally defensible (we are a bot, our UA `pseolint` is public, and we
|
|
750
|
+
// respect Disallow directives) and removes the "crawler-for-hire" abuse
|
|
751
|
+
// vector when the library is invoked from a hosted service.
|
|
752
|
+
if (respectRobotsTxt) {
|
|
753
|
+
try {
|
|
754
|
+
const p = new URL(url).pathname;
|
|
755
|
+
if (isDisallowedByRobots(p, robots.disallow)) {
|
|
756
|
+
skippedByRobots.push(url);
|
|
757
|
+
return;
|
|
758
|
+
}
|
|
759
|
+
}
|
|
760
|
+
catch { /* URL parse failed — fall through, fetch will fail naturally */ }
|
|
761
|
+
}
|
|
762
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
572
763
|
if (result) {
|
|
573
764
|
byteBudget.used += result.html.length;
|
|
574
765
|
pages.push(result);
|
|
@@ -587,9 +778,16 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
587
778
|
const discoveredUrls = new Set();
|
|
588
779
|
// robots already fetched above; reuse its Disallow patterns here.
|
|
589
780
|
const disallowPatterns = robots.disallow;
|
|
590
|
-
|
|
781
|
+
let discoveryCeilingReached = false;
|
|
782
|
+
outer: for (const page of pages) {
|
|
591
783
|
const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
|
|
592
784
|
for (const match of linkMatches) {
|
|
785
|
+
if (discoveredUrls.size >= maxCrawlDiscovered) {
|
|
786
|
+
// Hard ceiling — don't let a malicious site with many self-links
|
|
787
|
+
// extend crawl discovery up to the byte budget.
|
|
788
|
+
discoveryCeilingReached = true;
|
|
789
|
+
break outer;
|
|
790
|
+
}
|
|
593
791
|
const href = match[1];
|
|
594
792
|
if (!href || href.startsWith("#") || /^mailto:|^tel:|^javascript:|^data:/i.test(href))
|
|
595
793
|
continue;
|
|
@@ -614,6 +812,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
614
812
|
}
|
|
615
813
|
}
|
|
616
814
|
}
|
|
815
|
+
if (discoveryCeilingReached) {
|
|
816
|
+
// eslint-disable-next-line no-console
|
|
817
|
+
console.error(`pseolint: crawl discovery hit maxCrawlDiscovered=${maxCrawlDiscovered} ceiling; sampling from the first ${discoveredUrls.size} URLs.`);
|
|
818
|
+
}
|
|
617
819
|
if (discoveredUrls.size > 0) {
|
|
618
820
|
const candidates = Array.from(discoveredUrls);
|
|
619
821
|
// Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
|
|
@@ -623,7 +825,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
623
825
|
await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
|
|
624
826
|
if (budgetExceeded(byteBudget))
|
|
625
827
|
return;
|
|
626
|
-
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
828
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
627
829
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
628
830
|
byteBudget.used += result.html.length;
|
|
629
831
|
pages.push(result);
|
|
@@ -700,7 +902,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
700
902
|
}
|
|
701
903
|
const newPages = [];
|
|
702
904
|
await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
|
|
703
|
-
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
905
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
704
906
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
705
907
|
newPages.push(result);
|
|
706
908
|
knownCrawled.add(url);
|
|
@@ -744,10 +946,65 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
744
946
|
export async function auditSource(source, options) {
|
|
745
947
|
const runId = generateRunId();
|
|
746
948
|
const runStartedAt = Date.now();
|
|
747
|
-
|
|
949
|
+
// Apply safeMode preset first, then let explicit options override it. Using
|
|
950
|
+
// `??` preserves the "not set" vs "explicitly false" distinction — a user
|
|
951
|
+
// who picks safeMode="saas" but passes `guardSsrf: false` gets the explicit
|
|
952
|
+
// override. Localhost sources auto-promote to the `dev` preset unless the
|
|
953
|
+
// caller explicitly set `safeMode` or passed `autoDevPreset: false`.
|
|
954
|
+
const presetKey = resolveSafeModeKey(source, options);
|
|
955
|
+
const preset = SAFE_MODE_PRESETS[presetKey];
|
|
956
|
+
const concurrency = options?.concurrency ?? preset.concurrency ?? 5;
|
|
748
957
|
const timeoutMs = options?.timeout ?? 30000;
|
|
749
958
|
const ignorePatterns = options?.ignore ?? [];
|
|
750
|
-
const
|
|
959
|
+
const respectNoindex = options?.respectNoindex ?? true;
|
|
960
|
+
const skipDetectedAuth = options?.skipDetectedAuth ?? false;
|
|
961
|
+
const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
|
|
962
|
+
const externalSignal = options?.signal;
|
|
963
|
+
const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
|
|
964
|
+
const respectRobotsTxt = options?.respectRobotsTxt ?? preset.respectRobotsTxt ?? true;
|
|
965
|
+
const followRedirects = options?.followRedirects ?? preset.followRedirects ?? true;
|
|
966
|
+
const maxCrawlDiscovered = options?.maxCrawlDiscovered ?? preset.maxCrawlDiscovered ?? 5000;
|
|
967
|
+
const skippedByRobots = [];
|
|
968
|
+
// Backpressure: watch TTFB + 5xx rate during the crawl and abort if the
|
|
969
|
+
// origin looks degraded. The audit signal is a composite of the caller's
|
|
970
|
+
// signal (ctrl-C, parent timeout) and the monitor's abort controller.
|
|
971
|
+
const backpressureEnabled = options?.backpressure !== false;
|
|
972
|
+
const backpressureAbort = new AbortController();
|
|
973
|
+
let backpressureError = null;
|
|
974
|
+
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
975
|
+
const observer = new FetchObserver();
|
|
976
|
+
const monitor = backpressureEnabled
|
|
977
|
+
? new BackpressureMonitor({
|
|
978
|
+
warmupSize: 10,
|
|
979
|
+
absoluteP95Ms: 3000,
|
|
980
|
+
baselineMultiplier: 2,
|
|
981
|
+
errorRatioThreshold: 0.1,
|
|
982
|
+
})
|
|
983
|
+
: null;
|
|
984
|
+
// v0.4: framework gets set on the first observation that carries headers
|
|
985
|
+
// (the source URL fetch). Backpressure thresholds and computeReadiness use
|
|
986
|
+
// it to soften limits when auditing a dev server.
|
|
987
|
+
let detectedFramework = null;
|
|
988
|
+
const onObservation = (obs) => {
|
|
989
|
+
if (detectedFramework === null && obs.headers) {
|
|
990
|
+
detectedFramework = detectDevServer(obs.headers);
|
|
991
|
+
}
|
|
992
|
+
observer.record(obs);
|
|
993
|
+
if (!monitor)
|
|
994
|
+
return;
|
|
995
|
+
const decision = monitor.record(obs);
|
|
996
|
+
if (decision.shouldAbort && !backpressureError && decision.snapshot) {
|
|
997
|
+
backpressureError = new OriginDegradedError(decision.reason ?? "", decision.snapshot);
|
|
998
|
+
backpressureAbort.abort(backpressureError);
|
|
999
|
+
}
|
|
1000
|
+
};
|
|
1001
|
+
function throwIfAborted() {
|
|
1002
|
+
if (backpressureError)
|
|
1003
|
+
throw backpressureError;
|
|
1004
|
+
if (externalSignal?.aborted) {
|
|
1005
|
+
throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
751
1008
|
const resolvedRules = {
|
|
752
1009
|
nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
|
|
753
1010
|
entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
|
|
@@ -758,10 +1015,6 @@ export async function auditSource(source, options) {
|
|
|
758
1015
|
uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
|
|
759
1016
|
metaUniquenessMinJaccard: options?.rules?.metaUniquenessMinJaccard ?? DEFAULTS.metaUniquenessMinJaccard,
|
|
760
1017
|
linkDepthMaxClicks: options?.rules?.linkDepthMaxClicks ?? DEFAULTS.linkDepthMaxClicks,
|
|
761
|
-
hubPagesMinSiblings: options?.rules?.hubPagesMinSiblings ?? DEFAULTS.hubPagesMinSiblings,
|
|
762
|
-
hubPagesMaxSiblings: options?.rules?.hubPagesMaxSiblings ?? DEFAULTS.hubPagesMaxSiblings,
|
|
763
|
-
titleOverlapThreshold: options?.rules?.titleOverlapThreshold ?? DEFAULTS.titleOverlapThreshold,
|
|
764
|
-
keywordCollisionMinShared: options?.rules?.keywordCollisionMinShared ?? DEFAULTS.keywordCollisionMinShared,
|
|
765
1018
|
templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
|
|
766
1019
|
answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
|
|
767
1020
|
citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
|
|
@@ -783,18 +1036,47 @@ export async function auditSource(source, options) {
|
|
|
783
1036
|
const discoveryBudget = options?.sampleSize && options.sampleSize > 0
|
|
784
1037
|
? Math.max(50, options.sampleSize * 2)
|
|
785
1038
|
: 0;
|
|
786
|
-
const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0 };
|
|
1039
|
+
const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0, onObservation };
|
|
787
1040
|
const cacheConfig = options?.cache
|
|
788
1041
|
? {
|
|
789
1042
|
dir: options.cache.dir ?? ".pseolint/cache",
|
|
790
1043
|
ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
|
|
791
1044
|
}
|
|
792
1045
|
: null;
|
|
1046
|
+
// Size cap (post-audit eviction). Default 200 MB keeps pSEO-scale sites in check;
|
|
1047
|
+
// a single full crawl of a 5k-page site averages ~250 KB per body = ~1.25 GB uncapped.
|
|
1048
|
+
const cacheMaxBytes = options?.cache?.maxBytes ?? 209_715_200;
|
|
793
1049
|
const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
|
|
794
|
-
const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
|
|
1050
|
+
const maxFetchBytes = options?.maxFetchBytes ?? preset.maxFetchBytes ?? 52_428_800;
|
|
795
1051
|
const fetchByteBudget = { used: 0, cap: maxFetchBytes };
|
|
796
|
-
|
|
1052
|
+
// v0.4 §4.7: detectedFramework is set in onObservation above, side-effect
|
|
1053
|
+
// of the normal source URL fetch. No separate probe needed.
|
|
1054
|
+
const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered);
|
|
1055
|
+
throwIfAborted();
|
|
797
1056
|
const loadedPages = [...loadedPagesRaw];
|
|
1057
|
+
// v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
|
|
1058
|
+
// response Content-Type is not HTML (text/html or application/xhtml+xml).
|
|
1059
|
+
// Binary routes like /apple-icon, /opengraph-image, /icon get pushed to
|
|
1060
|
+
// crawlStats.skipped instead of being parsed as thin-content pages.
|
|
1061
|
+
const skippedByContentType = [];
|
|
1062
|
+
const htmlOnlyPages = [];
|
|
1063
|
+
for (const p of loadedPages) {
|
|
1064
|
+
// httpMeta is set on URL fetches; locally-loaded files have no httpMeta
|
|
1065
|
+
// and are always HTML by definition (collectHtmlFiles only picks .html).
|
|
1066
|
+
// We don't have content-type on the LoadedPage object. Heuristic: if html
|
|
1067
|
+
// body doesn't contain any HTML markers, treat as non-HTML.
|
|
1068
|
+
if (!p.httpMeta) {
|
|
1069
|
+
htmlOnlyPages.push(p);
|
|
1070
|
+
continue;
|
|
1071
|
+
}
|
|
1072
|
+
if (looksLikeHtml(p.html)) {
|
|
1073
|
+
htmlOnlyPages.push(p);
|
|
1074
|
+
}
|
|
1075
|
+
else {
|
|
1076
|
+
skippedByContentType.push(p.url);
|
|
1077
|
+
}
|
|
1078
|
+
}
|
|
1079
|
+
loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
|
|
798
1080
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
799
1081
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
800
1082
|
}
|
|
@@ -830,7 +1112,7 @@ export async function auditSource(source, options) {
|
|
|
830
1112
|
if (/^https?:\/\//i.test(source)) {
|
|
831
1113
|
try {
|
|
832
1114
|
const origin = new URL(source).origin;
|
|
833
|
-
const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats);
|
|
1115
|
+
const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats, signal);
|
|
834
1116
|
if (result)
|
|
835
1117
|
robotsTxtContent = result.text;
|
|
836
1118
|
}
|
|
@@ -872,13 +1154,27 @@ export async function auditSource(source, options) {
|
|
|
872
1154
|
})()
|
|
873
1155
|
: fisherYatesSample(filtered, sampleSize))
|
|
874
1156
|
: filtered;
|
|
875
|
-
const
|
|
1157
|
+
const parsedPagesAll = sampled.map((page) => {
|
|
876
1158
|
const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
|
|
877
1159
|
if (page.httpMeta) {
|
|
878
1160
|
parsed.httpMeta = page.httpMeta;
|
|
879
1161
|
}
|
|
880
1162
|
return parsed;
|
|
881
1163
|
});
|
|
1164
|
+
// v0.4.1 §page-filter: drop noindex'd pages and (when enabled) heuristically
|
|
1165
|
+
// detected auth pages BEFORE rule evaluation. The site owner's noindex is a
|
|
1166
|
+
// hard signal — they already opted out of SEO indexing, so auditing those
|
|
1167
|
+
// URLs produces only noise. Auth detection is opt-in via skipDetectedAuth
|
|
1168
|
+
// (off for the CLI by default; on for the hosted web form).
|
|
1169
|
+
const skippedByPolicy = [];
|
|
1170
|
+
const parsedPages = parsedPagesAll.filter((p) => {
|
|
1171
|
+
const reason = pageSkipReason(p, { respectNoindex, skipDetectedAuth });
|
|
1172
|
+
if (reason) {
|
|
1173
|
+
skippedByPolicy.push({ url: p.url, reason });
|
|
1174
|
+
return false;
|
|
1175
|
+
}
|
|
1176
|
+
return true;
|
|
1177
|
+
});
|
|
882
1178
|
const knownUrls = new Set(parsedPages.map((p) => p.url));
|
|
883
1179
|
const rootUrl = parsedPages.find((p) => /(^|[\\/])index\.html?$/i.test(p.url))?.url ?? parsedPages[0]?.url ?? "";
|
|
884
1180
|
const adjacency = new Map();
|
|
@@ -918,11 +1214,50 @@ export async function auditSource(source, options) {
|
|
|
918
1214
|
}),
|
|
919
1215
|
]
|
|
920
1216
|
: DEFAULT_ENTITY_PATTERNS;
|
|
1217
|
+
// v0.4 §4.11 — pre-flight site classification. We compute this BEFORE the
|
|
1218
|
+
// rule pipeline so the dispatcher can skip pSEO-only rules on small
|
|
1219
|
+
// marketing sites / blogs. Classification is computed off the FULL
|
|
1220
|
+
// discovered URL set (sitemap when available, else loaded URLs). This
|
|
1221
|
+
// matters: a sampled crawl of a 5000-page directory must still classify
|
|
1222
|
+
// as `programmatic-directory`, not `unclear`.
|
|
1223
|
+
const classifierUrls = (() => {
|
|
1224
|
+
if (sitemapUrlSet && sitemapUrlSet.size > 0) {
|
|
1225
|
+
return Array.from(sitemapUrlSet);
|
|
1226
|
+
}
|
|
1227
|
+
return loadedPagesRaw.map((p) => p.url);
|
|
1228
|
+
})();
|
|
1229
|
+
const classifierFramework = detectedFramework ?? "unknown";
|
|
1230
|
+
const computedClassification = classifySite({
|
|
1231
|
+
urls: classifierUrls,
|
|
1232
|
+
framework: classifierFramework,
|
|
1233
|
+
});
|
|
1234
|
+
// `--strict` (or AuditOptions.strict) keeps the classification but forces
|
|
1235
|
+
// every rule to run regardless of detected site type.
|
|
1236
|
+
const siteClassification = options?.strict
|
|
1237
|
+
? { ...computedClassification, suppressedRules: [] }
|
|
1238
|
+
: computedClassification;
|
|
1239
|
+
const suppressedRuleSet = new Set(siteClassification.suppressedRules);
|
|
921
1240
|
// Classify pages into groups and run only enabled rules per group
|
|
922
1241
|
const classified = classifyPages(parsedPages, options?.pageGroups);
|
|
923
1242
|
const allFindings = [...duplicateUrlFindings];
|
|
924
1243
|
const groupScores = {};
|
|
925
1244
|
const groupPageCounts = {};
|
|
1245
|
+
// Surface robots-skipped URLs so users don't silently get a smaller audit
|
|
1246
|
+
// than expected. One rollup finding (not per-URL) to avoid flooding the
|
|
1247
|
+
// output on large sites. Also included on summary.skippedUrls below.
|
|
1248
|
+
if (skippedByRobots.length > 0) {
|
|
1249
|
+
allFindings.push({
|
|
1250
|
+
ruleId: "audit/skipped-by-robots",
|
|
1251
|
+
severity: "info",
|
|
1252
|
+
message: `Skipped ${skippedByRobots.length} sitemap URL${skippedByRobots.length === 1 ? "" : "s"} because the target's robots.txt Disallow'd them: ${skippedByRobots.slice(0, 5).join(", ")}${skippedByRobots.length > 5 ? ", …" : ""}.`,
|
|
1253
|
+
fix: "If you own this site and want to audit these URLs anyway, pass `respectRobotsTxt: false` (or remove the Disallow directive).",
|
|
1254
|
+
relatedUrls: skippedByRobots,
|
|
1255
|
+
});
|
|
1256
|
+
}
|
|
1257
|
+
// v0.4 §4.4: origin readiness is now diagnostic-only. The previous
|
|
1258
|
+
// `audit/origin-readiness` finding emission was retired — the structured
|
|
1259
|
+
// ReadinessReport in `summary.diagnostics.originReadiness` is the canonical
|
|
1260
|
+
// signal now (no double-counting in the issue buckets).
|
|
926
1261
|
const auditMode = options?.mode ?? "full";
|
|
927
1262
|
// Site-wide rules (run once, outside group loop)
|
|
928
1263
|
if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
|
|
@@ -959,39 +1294,120 @@ export async function auditSource(source, options) {
|
|
|
959
1294
|
if (groupConfig?.rules !== undefined && groupConfig.rules.length === 0)
|
|
960
1295
|
continue;
|
|
961
1296
|
const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
|
|
962
|
-
const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
|
|
963
|
-
const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
|
|
1297
|
+
const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
|
|
1298
|
+
const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
|
|
964
1299
|
allFindings.push(...findings);
|
|
965
1300
|
groupPageCounts[groupName] = groupPages.length;
|
|
966
|
-
const {
|
|
967
|
-
groupScores[groupName] =
|
|
1301
|
+
const { risk: groupRisk } = scoreFromFindings(findings);
|
|
1302
|
+
groupScores[groupName] = groupRisk;
|
|
968
1303
|
}
|
|
1304
|
+
throwIfAborted();
|
|
969
1305
|
// Enrich findings: cluster pairwise, detect templates, assign effort
|
|
970
1306
|
const enriched = enrichFindings(allFindings, parsedPages, {
|
|
971
1307
|
templateGenerated: options?.templateGenerated,
|
|
972
1308
|
});
|
|
973
|
-
|
|
1309
|
+
// Populate docsUrl on every finding before they leave the engine.
|
|
1310
|
+
withDocsUrls(enriched.findings);
|
|
1311
|
+
const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings);
|
|
974
1312
|
const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
|
|
1313
|
+
const issues = bucketIssues(enriched.findings);
|
|
1314
|
+
const verdict = verdictForRisk(risk);
|
|
1315
|
+
const headline = buildHeadline(bucketCounts);
|
|
1316
|
+
// audit/* findings are diagnostic-only and never appear in summary.issues.
|
|
1317
|
+
// Surface them under diagnostics so consumers (telemetry, debug UIs) can
|
|
1318
|
+
// still see what was deduped or skipped.
|
|
1319
|
+
const auditFindings = enriched.findings.filter((f) => f.ruleId.startsWith("audit/"));
|
|
1320
|
+
const readinessReport = computeReadiness(observer.getAll(), { detectedFramework });
|
|
1321
|
+
const crawlStats = {
|
|
1322
|
+
discovered: discoveredUrlCount ?? loadedPagesRaw.length,
|
|
1323
|
+
fetched: parsedPages.length,
|
|
1324
|
+
skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
|
|
1325
|
+
};
|
|
975
1326
|
const summary = {
|
|
976
|
-
|
|
977
|
-
|
|
1327
|
+
schemaVersion: SCHEMA_VERSION,
|
|
1328
|
+
verdict,
|
|
1329
|
+
risk,
|
|
1330
|
+
headline,
|
|
1331
|
+
categories,
|
|
1332
|
+
issues,
|
|
1333
|
+
siteClassification,
|
|
1334
|
+
diagnostics: {
|
|
1335
|
+
originReadiness: readinessReport,
|
|
1336
|
+
crawlStats,
|
|
1337
|
+
auditFindings,
|
|
1338
|
+
},
|
|
978
1339
|
groupScores: options?.pageGroups ? groupScores : undefined,
|
|
979
1340
|
groupPageCounts: options?.pageGroups ? groupPageCounts : undefined,
|
|
980
1341
|
pageCount: auditedPageCount || parsedPages.length,
|
|
981
|
-
findings: enriched.findings,
|
|
982
1342
|
templateDetected: enriched.templateDetected,
|
|
983
1343
|
rawFindingCount: enriched.rawFindingCount,
|
|
984
1344
|
};
|
|
985
1345
|
if (cacheConfig) {
|
|
986
1346
|
summary.cacheStats = cacheStats;
|
|
987
1347
|
}
|
|
988
|
-
|
|
989
|
-
|
|
1348
|
+
// v0.4 §4.5 / v0.4.1: warn when ignore patterns matched zero discovered URLs.
|
|
1349
|
+
// - Per-pattern warning fires only when `warnUnmatchedIgnore` is true
|
|
1350
|
+
// (set by the CLI when `--ignore` was passed explicitly). Quiet by
|
|
1351
|
+
// default for config-loaded patterns where broad safety lists like
|
|
1352
|
+
// `**/dashboard/**` legitimately don't match small marketing sites.
|
|
1353
|
+
// - When ALL patterns matched zero (strongest typo signal, e.g. user
|
|
1354
|
+
// wrote `*.json` instead of `**/*.json`), emit a single consolidated
|
|
1355
|
+
// warning regardless of source.
|
|
1356
|
+
if (ignorePatterns.length > 0) {
|
|
1357
|
+
const unmatched = ignorePatterns.filter((pattern) => !deduped.some((p) => globMatchPathname(pattern, p.url)));
|
|
1358
|
+
if (unmatched.length === ignorePatterns.length) {
|
|
1359
|
+
// eslint-disable-next-line no-console
|
|
1360
|
+
console.warn(`[pseolint] none of the ${ignorePatterns.length} ignore pattern${ignorePatterns.length === 1 ? "" : "s"} matched any URLs — check config or --ignore for typos`);
|
|
1361
|
+
}
|
|
1362
|
+
else if (options?.warnUnmatchedIgnore === true) {
|
|
1363
|
+
for (const pattern of unmatched) {
|
|
1364
|
+
// eslint-disable-next-line no-console
|
|
1365
|
+
console.warn(`[pseolint] ignore pattern '${pattern}' matched 0 URLs — likely typo`);
|
|
1366
|
+
}
|
|
1367
|
+
}
|
|
990
1368
|
}
|
|
1369
|
+
// Merge state-skipped (unchanged since last run), robots-skipped (target
|
|
1370
|
+
// robots.txt Disallow'd), and policy-skipped (noindex / detected-auth) URLs
|
|
1371
|
+
// so callers have a single audit-skipped surface.
|
|
1372
|
+
const allSkipped = [
|
|
1373
|
+
...skippedUrls,
|
|
1374
|
+
...skippedByRobots,
|
|
1375
|
+
...skippedByPolicy.map((s) => s.url),
|
|
1376
|
+
];
|
|
1377
|
+
if (allSkipped.length > 0) {
|
|
1378
|
+
summary.skippedUrls = allSkipped;
|
|
1379
|
+
}
|
|
1380
|
+
// v0.4.1: surface noindex / auth skips as a discoverable diagnostic so the
|
|
1381
|
+
// user sees what the engine excluded. Catches the accidental-noindex bug:
|
|
1382
|
+
// pages silently dropped from indexing show up as a visible skip line
|
|
1383
|
+
// instead of being absent without explanation.
|
|
1384
|
+
if (skippedByPolicy.length > 0) {
|
|
1385
|
+
const noindexCount = skippedByPolicy.filter((s) => s.reason === "noindex").length;
|
|
1386
|
+
const authCount = skippedByPolicy.filter((s) => s.reason === "auth-detected").length;
|
|
1387
|
+
const sample = skippedByPolicy.slice(0, 5).map((s) => `${s.url} (${s.reason})`).join(", ");
|
|
1388
|
+
const more = skippedByPolicy.length > 5 ? `, +${skippedByPolicy.length - 5} more` : "";
|
|
1389
|
+
const parts = [];
|
|
1390
|
+
if (noindexCount > 0)
|
|
1391
|
+
parts.push(`${noindexCount} marked noindex`);
|
|
1392
|
+
if (authCount > 0)
|
|
1393
|
+
parts.push(`${authCount} detected as auth (login/register/etc)`);
|
|
1394
|
+
auditFindings.push({
|
|
1395
|
+
ruleId: "audit/skipped-by-policy",
|
|
1396
|
+
severity: "info",
|
|
1397
|
+
message: `Skipped ${skippedByPolicy.length} page${skippedByPolicy.length === 1 ? "" : "s"} from rule evaluation — ${parts.join(", ")}. First few: ${sample}${more}.`,
|
|
1398
|
+
relatedUrls: skippedByPolicy.map((s) => s.url),
|
|
1399
|
+
});
|
|
1400
|
+
}
|
|
1401
|
+
// Local flat view of every finding the engine produced, used internally for
|
|
1402
|
+
// state persistence, regression detection, AI triage input, and telemetry
|
|
1403
|
+
// counts. NOT exposed on the AuditSummary — consumers must use
|
|
1404
|
+
// `summary.issues.{blockers,shouldFix,informational}` and
|
|
1405
|
+
// `summary.diagnostics.auditFindings`.
|
|
1406
|
+
const enrichedFindings = enriched.findings;
|
|
991
1407
|
if (priorState && options?.state?.exitOnRegression) {
|
|
992
1408
|
let hasRegression = false;
|
|
993
1409
|
const currentFindings = new Map();
|
|
994
|
-
for (const f of
|
|
1410
|
+
for (const f of enrichedFindings) {
|
|
995
1411
|
if (!f.pageUrl)
|
|
996
1412
|
continue;
|
|
997
1413
|
const set = currentFindings.get(f.pageUrl) ?? new Set();
|
|
@@ -1019,7 +1435,7 @@ export async function auditSource(source, options) {
|
|
|
1019
1435
|
const renderMode = options.render ? "rendered" : "static";
|
|
1020
1436
|
const urls = {};
|
|
1021
1437
|
const findingsByUrl = new Map();
|
|
1022
|
-
for (const f of
|
|
1438
|
+
for (const f of enrichedFindings) {
|
|
1023
1439
|
if (!f.pageUrl)
|
|
1024
1440
|
continue;
|
|
1025
1441
|
const list = findingsByUrl.get(f.pageUrl) ?? [];
|
|
@@ -1051,9 +1467,10 @@ export async function auditSource(source, options) {
|
|
|
1051
1467
|
renderMode,
|
|
1052
1468
|
urls,
|
|
1053
1469
|
summary: {
|
|
1054
|
-
score: summary.
|
|
1055
|
-
totalFindings:
|
|
1056
|
-
byCategory: Object.fromEntries(Object.entries(summary.
|
|
1470
|
+
score: summary.risk,
|
|
1471
|
+
totalFindings: enrichedFindings.length,
|
|
1472
|
+
byCategory: Object.fromEntries(Object.entries(summary.categories)
|
|
1473
|
+
.map(([k, v]) => [k, v.issues])),
|
|
1057
1474
|
},
|
|
1058
1475
|
};
|
|
1059
1476
|
await writeState(statePath, newState);
|
|
@@ -1089,7 +1506,8 @@ export async function auditSource(source, options) {
|
|
|
1089
1506
|
spentTodayUsd = 0;
|
|
1090
1507
|
}
|
|
1091
1508
|
}
|
|
1092
|
-
|
|
1509
|
+
throwIfAborted();
|
|
1510
|
+
const outcome = await triageFindings(enrichedFindings, summary.pageCount, {
|
|
1093
1511
|
enabled: true,
|
|
1094
1512
|
model: resolved.model,
|
|
1095
1513
|
providerId: resolved.providerId,
|
|
@@ -1124,9 +1542,9 @@ export async function auditSource(source, options) {
|
|
|
1124
1542
|
runId,
|
|
1125
1543
|
timestamp: new Date().toISOString(),
|
|
1126
1544
|
durationMs: Date.now() - runStartedAt,
|
|
1127
|
-
score: summary.
|
|
1545
|
+
score: summary.risk,
|
|
1128
1546
|
pageCount: summary.pageCount,
|
|
1129
|
-
findingCount:
|
|
1547
|
+
findingCount: enrichedFindings.length,
|
|
1130
1548
|
...(summary.rawFindingCount !== undefined && { rawFindingCount: summary.rawFindingCount }),
|
|
1131
1549
|
...(summary.templateDetected !== undefined && { templateDetected: summary.templateDetected }),
|
|
1132
1550
|
...(summary.cacheStats && { cacheStats: summary.cacheStats }),
|
|
@@ -1181,7 +1599,19 @@ export async function auditSource(source, options) {
|
|
|
1181
1599
|
}
|
|
1182
1600
|
const aiHintEnabled = options?.ai?.suggest !== false;
|
|
1183
1601
|
if (aiHintEnabled && !options?.ai?.enabled && process.env.ANTHROPIC_API_KEY) {
|
|
1184
|
-
console.error(`💡 AI triage available — re-run with --ai to prioritize ${
|
|
1602
|
+
console.error(`💡 AI triage available — re-run with --ai to prioritize ${enrichedFindings.length} findings into a fix list.`);
|
|
1603
|
+
}
|
|
1604
|
+
if (cacheConfig && cacheMaxBytes > 0) {
|
|
1605
|
+
try {
|
|
1606
|
+
const pruneResult = await pruneCache(cacheConfig.dir, cacheMaxBytes);
|
|
1607
|
+
if (pruneResult.removedEntries > 0 || pruneResult.removedTmpFiles > 0) {
|
|
1608
|
+
const freedMb = ((pruneResult.before.bytes - pruneResult.after.bytes) / 1024 / 1024).toFixed(1);
|
|
1609
|
+
console.error(`pseolint: cache prune freed ${freedMb} MB (${pruneResult.removedEntries} entries, ${pruneResult.removedTmpFiles} .tmp files); size=${(pruneResult.after.bytes / 1024 / 1024).toFixed(1)}MB / cap=${(cacheMaxBytes / 1024 / 1024).toFixed(0)}MB`);
|
|
1610
|
+
}
|
|
1611
|
+
}
|
|
1612
|
+
catch {
|
|
1613
|
+
// Non-fatal: eviction failure must not break the audit.
|
|
1614
|
+
}
|
|
1185
1615
|
}
|
|
1186
1616
|
return summary;
|
|
1187
1617
|
}
|