@pseolint/core 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -15
- package/dist/ai/prompt.d.ts +1 -1
- package/dist/ai/prompt.d.ts.map +1 -1
- package/dist/ai/prompt.js +13 -1
- package/dist/ai/prompt.js.map +1 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +197 -63
- package/dist/auditor.js.map +1 -1
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +38 -2
- package/dist/cache.js.map +1 -1
- package/dist/formatters/console.d.ts +9 -0
- package/dist/formatters/console.d.ts.map +1 -1
- package/dist/formatters/console.js +53 -0
- package/dist/formatters/console.js.map +1 -1
- package/dist/formatters/html.d.ts.map +1 -1
- package/dist/formatters/html.js +363 -135
- package/dist/formatters/html.js.map +1 -1
- package/dist/index.d.ts +10 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -1
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +8 -0
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/aeo/answer-first.d.ts +18 -0
- package/dist/rules/aeo/answer-first.d.ts.map +1 -0
- package/dist/rules/aeo/answer-first.js +191 -0
- package/dist/rules/aeo/answer-first.js.map +1 -0
- package/dist/rules/aeo/citable-facts.d.ts +9 -0
- package/dist/rules/aeo/citable-facts.d.ts.map +1 -0
- package/dist/rules/aeo/citable-facts.js +90 -0
- package/dist/rules/aeo/citable-facts.js.map +1 -0
- package/dist/rules/aeo/content-modularity.d.ts +11 -0
- package/dist/rules/aeo/content-modularity.d.ts.map +1 -0
- package/dist/rules/aeo/content-modularity.js +107 -0
- package/dist/rules/aeo/content-modularity.js.map +1 -0
- package/dist/rules/aeo/crawler-access.d.ts +25 -0
- package/dist/rules/aeo/crawler-access.d.ts.map +1 -0
- package/dist/rules/aeo/crawler-access.js +116 -0
- package/dist/rules/aeo/crawler-access.js.map +1 -0
- package/dist/rules/aeo/faq-coverage.d.ts +9 -0
- package/dist/rules/aeo/faq-coverage.d.ts.map +1 -0
- package/dist/rules/aeo/faq-coverage.js +71 -0
- package/dist/rules/aeo/faq-coverage.js.map +1 -0
- package/dist/rules/aeo/freshness-signals.d.ts +9 -0
- package/dist/rules/aeo/freshness-signals.d.ts.map +1 -0
- package/dist/rules/aeo/freshness-signals.js +109 -0
- package/dist/rules/aeo/freshness-signals.js.map +1 -0
- package/dist/rules/aeo/llms-txt.d.ts +24 -0
- package/dist/rules/aeo/llms-txt.d.ts.map +1 -0
- package/dist/rules/aeo/llms-txt.js +93 -0
- package/dist/rules/aeo/llms-txt.js.map +1 -0
- package/dist/rules/aeo/non-replicable-value.d.ts +9 -0
- package/dist/rules/aeo/non-replicable-value.d.ts.map +1 -0
- package/dist/rules/aeo/non-replicable-value.js +95 -0
- package/dist/rules/aeo/non-replicable-value.js.map +1 -0
- package/dist/rules/scope.d.ts +12 -0
- package/dist/rules/scope.d.ts.map +1 -0
- package/dist/rules/scope.js +66 -0
- package/dist/rules/scope.js.map +1 -0
- package/dist/rules/tech/robots-sitemap-presence.d.ts +16 -0
- package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.js +26 -2
- package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
- package/dist/types.d.ts +29 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +91 -66
package/dist/auditor.js
CHANGED
|
@@ -26,7 +26,15 @@ import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
|
26
26
|
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
27
27
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
28
28
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
29
|
-
import { robotsComplianceRule } from "./rules/tech/robots-sitemap-presence.js";
|
|
29
|
+
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
|
|
30
|
+
import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
|
|
31
|
+
import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
|
|
32
|
+
import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
|
|
33
|
+
import { faqCoverageRule } from "./rules/aeo/faq-coverage.js";
|
|
34
|
+
import { answerFirstRule } from "./rules/aeo/answer-first.js";
|
|
35
|
+
import { citableFactsRule } from "./rules/aeo/citable-facts.js";
|
|
36
|
+
import { nonReplicableValueRule } from "./rules/aeo/non-replicable-value.js";
|
|
37
|
+
import { contentModularityRule } from "./rules/aeo/content-modularity.js";
|
|
30
38
|
import { redirectChainRule } from "./rules/tech/redirect-chain.js";
|
|
31
39
|
import { soft404Rule } from "./rules/tech/soft-404.js";
|
|
32
40
|
import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
|
|
@@ -38,6 +46,7 @@ import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
|
|
|
38
46
|
import { templateCoverageRule } from "./rules/spam/template-coverage.js";
|
|
39
47
|
import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
|
|
40
48
|
import { classifyPages, isRuleEnabled } from "./page-classifier.js";
|
|
49
|
+
import { isRuleAllowedInDiff } from "./rules/scope.js";
|
|
41
50
|
import { RULE_REFERENCES } from "./rule-references.js";
|
|
42
51
|
import { enrichFindings } from "./enrich-findings.js";
|
|
43
52
|
import { triageFindings } from "./ai/triage.js";
|
|
@@ -61,13 +70,21 @@ const DEFAULTS = {
|
|
|
61
70
|
hubPagesMaxSiblings: 50,
|
|
62
71
|
titleOverlapThreshold: 0.8,
|
|
63
72
|
keywordCollisionMinShared: 6,
|
|
64
|
-
templateCoverageMinPages: 5
|
|
73
|
+
templateCoverageMinPages: 5,
|
|
74
|
+
answerFirstMaxWords: 100,
|
|
75
|
+
citableFactsMin: 3,
|
|
76
|
+
citableFactsTarget: 8,
|
|
77
|
+
freshnessMaxStaleDays: 180,
|
|
78
|
+
modularityMaxParagraphWords: 200,
|
|
79
|
+
modularityMinSelfContainedRatio: 0.7,
|
|
80
|
+
faqMinQuestionHeadings: 2
|
|
65
81
|
};
|
|
66
82
|
const CATEGORY_WEIGHTS = {
|
|
67
|
-
spam: 0.
|
|
68
|
-
content: 0.
|
|
69
|
-
|
|
70
|
-
|
|
83
|
+
spam: 0.35,
|
|
84
|
+
content: 0.2,
|
|
85
|
+
aeo: 0.15,
|
|
86
|
+
links: 0.12,
|
|
87
|
+
tech: 0.08,
|
|
71
88
|
schema: 0.05,
|
|
72
89
|
cannibal: 0.05,
|
|
73
90
|
/** Dedup / crawl hygiene; does not affect composite score. */
|
|
@@ -93,8 +110,9 @@ function resolveGroupRules(baseRules, overrides) {
|
|
|
93
110
|
}
|
|
94
111
|
return result;
|
|
95
112
|
}
|
|
96
|
-
function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides) {
|
|
113
|
+
function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
|
|
97
114
|
const findings = [];
|
|
115
|
+
const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
|
|
98
116
|
const tag = (results) => results.map((r) => {
|
|
99
117
|
const override = overrides?.[r.ruleId];
|
|
100
118
|
return {
|
|
@@ -106,106 +124,137 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
|
|
|
106
124
|
});
|
|
107
125
|
// Spam rules — always compute cross-page data, only push findings if enabled
|
|
108
126
|
const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
|
|
109
|
-
if (isEnabled("spam/near-duplicate")) {
|
|
127
|
+
if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
|
|
110
128
|
findings.push(...tag(nearDuplicate.findings));
|
|
111
129
|
}
|
|
112
130
|
const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
|
|
113
|
-
if (isEnabled("spam/entity-swap")) {
|
|
131
|
+
if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
|
|
114
132
|
findings.push(...tag(entitySwap.findings));
|
|
115
133
|
}
|
|
116
134
|
const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
|
|
117
|
-
if (isEnabled("spam/thin-content")) {
|
|
135
|
+
if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
|
|
118
136
|
findings.push(...tag(thinContent.findings));
|
|
119
137
|
}
|
|
120
|
-
if (isEnabled("spam/doorway-pattern")) {
|
|
138
|
+
if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
|
|
121
139
|
findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
|
|
122
140
|
}
|
|
123
|
-
if (isEnabled("spam/publication-velocity")) {
|
|
141
|
+
if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
|
|
124
142
|
findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay)));
|
|
125
143
|
}
|
|
126
|
-
if (isEnabled("spam/boilerplate-ratio")) {
|
|
144
|
+
if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
|
|
127
145
|
findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
|
|
128
146
|
}
|
|
129
|
-
if (isEnabled("spam/template-diversity")) {
|
|
147
|
+
if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
|
|
130
148
|
findings.push(...tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
|
|
131
149
|
}
|
|
132
|
-
if (isEnabled("spam/template-coverage")) {
|
|
150
|
+
if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
|
|
133
151
|
findings.push(...tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
|
|
134
152
|
}
|
|
135
153
|
// Content rules
|
|
136
|
-
if (isEnabled("content/unique-value")) {
|
|
154
|
+
if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
|
|
137
155
|
findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
|
|
138
156
|
}
|
|
139
|
-
if (isEnabled("content/heading-uniqueness")) {
|
|
157
|
+
if (isEnabled("content/heading-uniqueness") && modeOk("content/heading-uniqueness")) {
|
|
140
158
|
findings.push(...tag(headingUniquenessRule(pages, entityPatterns)));
|
|
141
159
|
}
|
|
142
|
-
if (isEnabled("content/meta-uniqueness")) {
|
|
160
|
+
if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
|
|
143
161
|
findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
|
|
144
162
|
}
|
|
145
|
-
if (isEnabled("content/missing-author")) {
|
|
163
|
+
if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
|
|
146
164
|
findings.push(...tag(missingAuthorRule(pages)));
|
|
147
165
|
}
|
|
148
|
-
if (isEnabled("content/eeat-signals")) {
|
|
166
|
+
if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
|
|
149
167
|
findings.push(...tag(eeatSignalsRule(pages)));
|
|
150
168
|
}
|
|
151
169
|
// Link rules — use the global link graph
|
|
152
|
-
if (isEnabled("links/orphan-pages")) {
|
|
170
|
+
if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
|
|
153
171
|
findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
|
|
154
172
|
}
|
|
155
|
-
if (isEnabled("links/dead-ends")) {
|
|
173
|
+
if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
|
|
156
174
|
findings.push(...tag(deadEndsRule(pages, knownUrls, rootUrl)));
|
|
157
175
|
}
|
|
158
|
-
if (isEnabled("links/link-depth")) {
|
|
176
|
+
if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
|
|
159
177
|
if (rootUrl) {
|
|
160
178
|
findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound)));
|
|
161
179
|
}
|
|
162
180
|
}
|
|
163
|
-
if (isEnabled("links/cluster-connectivity")) {
|
|
181
|
+
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
164
182
|
findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
|
|
165
183
|
}
|
|
166
|
-
if (isEnabled("links/hub-pages")) {
|
|
184
|
+
if (isEnabled("links/hub-pages") && modeOk("links/hub-pages")) {
|
|
167
185
|
findings.push(...tag(hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings)));
|
|
168
186
|
}
|
|
169
187
|
// Tech rules
|
|
170
|
-
if (isEnabled("tech/canonical-consistency")) {
|
|
188
|
+
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
171
189
|
findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
172
190
|
}
|
|
173
|
-
if (isEnabled("tech/canonical-noindex-conflict")) {
|
|
191
|
+
if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
|
|
174
192
|
findings.push(...tag(canonicalNoindexConflictRule(pages, normalizeUrlOptions)));
|
|
175
193
|
}
|
|
176
|
-
if (isEnabled("tech/robots-noindex-conflict")) {
|
|
194
|
+
if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
|
|
177
195
|
findings.push(...tag(robotsNoindexConflictRule(pages, inbound)));
|
|
178
196
|
}
|
|
179
|
-
if (isEnabled("tech/redirect-chain")) {
|
|
197
|
+
if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
|
|
180
198
|
findings.push(...tag(redirectChainRule(pages)));
|
|
181
199
|
}
|
|
182
|
-
if (isEnabled("tech/soft-404")) {
|
|
200
|
+
if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
|
|
183
201
|
findings.push(...tag(soft404Rule(pages)));
|
|
184
202
|
}
|
|
185
|
-
if (isEnabled("tech/og-completeness")) {
|
|
203
|
+
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
186
204
|
findings.push(...tag(ogCompletenessRule(pages)));
|
|
187
205
|
}
|
|
188
|
-
if (isEnabled("tech/hreflang-consistency")) {
|
|
206
|
+
if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
|
|
189
207
|
findings.push(...tag(hreflangConsistencyRule(pages, normalizeUrlOptions)));
|
|
190
208
|
}
|
|
191
209
|
// Schema rules
|
|
192
|
-
if (isEnabled("schema/json-ld-valid")) {
|
|
210
|
+
if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
|
|
193
211
|
findings.push(...tag(jsonLdValidRule(pages)));
|
|
194
212
|
}
|
|
195
|
-
if (isEnabled("schema/required-fields")) {
|
|
213
|
+
if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
|
|
196
214
|
findings.push(...tag(requiredFieldsRule(pages)));
|
|
197
215
|
}
|
|
198
|
-
if (isEnabled("schema/consistency")) {
|
|
216
|
+
if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
|
|
199
217
|
findings.push(...tag(schemaConsistencyRule(pages)));
|
|
200
218
|
}
|
|
219
|
+
// AEO rules
|
|
220
|
+
if (isEnabled("aeo/freshness-signals")) {
|
|
221
|
+
findings.push(...tag(freshnessSignalsRule(pages, {
|
|
222
|
+
maxStaleDays: resolvedRules.freshnessMaxStaleDays,
|
|
223
|
+
})));
|
|
224
|
+
}
|
|
225
|
+
if (isEnabled("aeo/faq-coverage")) {
|
|
226
|
+
findings.push(...tag(faqCoverageRule(pages, {
|
|
227
|
+
minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
|
|
228
|
+
})));
|
|
229
|
+
}
|
|
230
|
+
if (isEnabled("aeo/answer-first")) {
|
|
231
|
+
findings.push(...tag(answerFirstRule(pages, entityPatterns, {
|
|
232
|
+
maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
|
|
233
|
+
})));
|
|
234
|
+
}
|
|
235
|
+
if (isEnabled("aeo/citable-facts")) {
|
|
236
|
+
findings.push(...tag(citableFactsRule(pages, entityPatterns, {
|
|
237
|
+
minFactsPerPage: resolvedRules.citableFactsMin,
|
|
238
|
+
targetFactsPerPage: resolvedRules.citableFactsTarget,
|
|
239
|
+
})));
|
|
240
|
+
}
|
|
241
|
+
if (isEnabled("aeo/non-replicable-value")) {
|
|
242
|
+
findings.push(...tag(nonReplicableValueRule(pages)));
|
|
243
|
+
}
|
|
244
|
+
if (isEnabled("aeo/content-modularity")) {
|
|
245
|
+
findings.push(...tag(contentModularityRule(pages, {
|
|
246
|
+
maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
|
|
247
|
+
minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
|
|
248
|
+
})));
|
|
249
|
+
}
|
|
201
250
|
// Cannibal rules
|
|
202
|
-
if (isEnabled("cannibal/title-overlap")) {
|
|
251
|
+
if (isEnabled("cannibal/title-overlap") && modeOk("cannibal/title-overlap")) {
|
|
203
252
|
findings.push(...tag(titleOverlapRule(pages, entityPatterns, resolvedRules.titleOverlapThreshold)));
|
|
204
253
|
}
|
|
205
|
-
if (isEnabled("cannibal/keyword-collision")) {
|
|
254
|
+
if (isEnabled("cannibal/keyword-collision") && modeOk("cannibal/keyword-collision")) {
|
|
206
255
|
findings.push(...tag(keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared)));
|
|
207
256
|
}
|
|
208
|
-
if (isEnabled("cannibal/url-pattern")) {
|
|
257
|
+
if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
|
|
209
258
|
findings.push(...tag(urlPatternRule(pages)));
|
|
210
259
|
}
|
|
211
260
|
return findings;
|
|
@@ -223,6 +272,7 @@ function scoreFromFindings(findings) {
|
|
|
223
272
|
const raw = {
|
|
224
273
|
spam: 0,
|
|
225
274
|
content: 0,
|
|
275
|
+
aeo: 0,
|
|
226
276
|
links: 0,
|
|
227
277
|
tech: 0,
|
|
228
278
|
schema: 0,
|
|
@@ -238,6 +288,7 @@ function scoreFromFindings(findings) {
|
|
|
238
288
|
}
|
|
239
289
|
const weighted = raw.spam * CATEGORY_WEIGHTS.spam +
|
|
240
290
|
raw.content * CATEGORY_WEIGHTS.content +
|
|
291
|
+
raw.aeo * CATEGORY_WEIGHTS.aeo +
|
|
241
292
|
raw.links * CATEGORY_WEIGHTS.links +
|
|
242
293
|
raw.tech * CATEGORY_WEIGHTS.tech +
|
|
243
294
|
raw.schema * CATEGORY_WEIGHTS.schema +
|
|
@@ -248,6 +299,7 @@ function scoreFromFindings(findings) {
|
|
|
248
299
|
categoryScores: {
|
|
249
300
|
spam: raw.spam,
|
|
250
301
|
content: raw.content,
|
|
302
|
+
aeo: raw.aeo,
|
|
251
303
|
links: raw.links,
|
|
252
304
|
tech: raw.tech,
|
|
253
305
|
schema: raw.schema,
|
|
@@ -430,7 +482,35 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
430
482
|
}
|
|
431
483
|
return allUrls;
|
|
432
484
|
}
|
|
433
|
-
async function
|
|
485
|
+
/**
 * Fetch `<origin>/robots.txt` and extract the crawl metadata from it.
 *
 * This is a best-effort lookup: it never rejects. When `origin` is falsy, or
 * when anything inside the fetch/parse step throws, it resolves to an empty
 * result (`{ disallow: [], crawlDelaySec: 0 }`).
 *
 * @param {string} origin - Origin to query; concatenated directly with
 *   `/robots.txt`, so it is presumably expected without a trailing slash.
 * @param {number} timeoutMs - Forwarded unchanged to `fetchTextStrict`.
 * @param {*} cache - Forwarded unchanged to `fetchTextStrict`.
 * @param {*} stats - Forwarded unchanged to `fetchTextStrict`.
 * @returns {Promise<{disallow: Array, crawlDelaySec: number}>} `disallow` is
 *   whatever `parseDisallowPatterns` returns for the robots.txt text and
 *   `crawlDelaySec` is whatever `parseCrawlDelaySeconds` returns for it.
 */
async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
    // A fresh object per call so callers can never share (and mutate) one fallback.
    const emptyMeta = () => ({ disallow: [], crawlDelaySec: 0 });
    if (!origin) {
        return emptyMeta();
    }
    try {
        const robotsUrl = `${origin}/robots.txt`;
        const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
        return {
            disallow: parseDisallowPatterns(fetched.text),
            crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
        };
    }
    catch {
        // Deliberate best-effort: an unreachable or unparsable robots.txt is
        // treated the same as "no restrictions" rather than failing the caller.
        return emptyMeta();
    }
}
|
|
500
|
+
/**
 * Return a promise that resolves (with no value) once a `setTimeout` of `ms`
 * milliseconds has fired.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Resolves after the timer fires; never rejects.
 */
function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}
|
|
503
|
+
/**
 * Report whether `urlPath` is blocked by at least one of `patterns`.
 *
 * Each pattern is tested with `isBlockedByPattern(urlPath, pattern)`;
 * iteration stops at the first pattern that reports a block.
 *
 * @param {string} urlPath - Path to test (callers presumably pass a URL pathname).
 * @param {Iterable} patterns - Disallow patterns to check; an empty
 *   collection means nothing is blocked.
 * @returns {boolean} `true` if any pattern blocks the path, otherwise `false`.
 */
function isDisallowedByRobots(urlPath, patterns) {
    for (const pat of patterns) {
        if (isBlockedByPattern(urlPath, pat)) {
            return true;
        }
    }
    return false;
}
|
|
510
|
+
/**
 * Report whether a usage budget has been used up.
 *
 * A `cap` that is not greater than zero is never considered exceeded, so
 * `cap: 0` acts as "no limit". Otherwise the budget is exceeded as soon as
 * `used` reaches `cap`.
 *
 * @param {{used: number, cap: number}} b - Budget record; only `used` and
 *   `cap` are read and nothing is mutated.
 * @returns {boolean} `true` when `cap > 0` and `used >= cap`.
 */
function budgetExceeded(b) {
    return b.cap > 0 && b.used >= b.cap;
}
|
|
513
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
|
|
434
514
|
if (/^https?:\/\//i.test(source)) {
|
|
435
515
|
let text;
|
|
436
516
|
let contentType;
|
|
@@ -467,23 +547,38 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
467
547
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
468
548
|
: allSitemapUrls;
|
|
469
549
|
const pages = [];
|
|
470
|
-
|
|
550
|
+
// Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
|
|
551
|
+
const sourceOrigin = (() => { try {
|
|
552
|
+
return new URL(source).origin;
|
|
553
|
+
}
|
|
554
|
+
catch {
|
|
555
|
+
return "";
|
|
556
|
+
} })();
|
|
557
|
+
const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
|
|
558
|
+
const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
|
|
559
|
+
const delayMs = robots.crawlDelaySec * 1000;
|
|
560
|
+
await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
|
|
561
|
+
if (budgetExceeded(byteBudget))
|
|
562
|
+
return;
|
|
471
563
|
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
472
564
|
if (result) {
|
|
565
|
+
byteBudget.used += result.html.length;
|
|
473
566
|
pages.push(result);
|
|
474
567
|
}
|
|
568
|
+
if (delayMs > 0)
|
|
569
|
+
await sleep(delayMs);
|
|
475
570
|
});
|
|
476
|
-
//
|
|
477
|
-
|
|
571
|
+
// Link discovery fills the sample.
|
|
572
|
+
// Legacy behavior: no budget set + crawlDiscovery true → fill from links (unchanged).
|
|
573
|
+
// New behavior: budget set + crawlDiscovery true + opt-in flag → top up to budget.
|
|
574
|
+
const budgetUnderfilled = discoveryBudget > 0 && pages.length < discoveryBudget;
|
|
575
|
+
const legacyBudgetless = discoveryBudget === 0;
|
|
576
|
+
const shouldFill = crawlDiscovery && (legacyBudgetless || (budgetUnderfilled && fillBudgetViaLinkDiscovery));
|
|
577
|
+
if (shouldFill) {
|
|
478
578
|
const sitemapUrlSet = new Set(allSitemapUrls);
|
|
479
579
|
const discoveredUrls = new Set();
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
sourceOrigin = new URL(source).origin;
|
|
483
|
-
}
|
|
484
|
-
catch {
|
|
485
|
-
sourceOrigin = "";
|
|
486
|
-
}
|
|
580
|
+
// robots already fetched above; reuse its Disallow patterns here.
|
|
581
|
+
const disallowPatterns = robots.disallow;
|
|
487
582
|
for (const page of pages) {
|
|
488
583
|
const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
|
|
489
584
|
for (const match of linkMatches) {
|
|
@@ -500,9 +595,11 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
500
595
|
resolvedUrl.search = "";
|
|
501
596
|
resolvedUrl.hash = "";
|
|
502
597
|
const normalized = resolvedUrl.href;
|
|
503
|
-
if (
|
|
504
|
-
|
|
505
|
-
|
|
598
|
+
if (sitemapUrlSet.has(normalized) || discoveredUrls.has(normalized))
|
|
599
|
+
continue;
|
|
600
|
+
if (isDisallowedByRobots(resolvedUrl.pathname, disallowPatterns))
|
|
601
|
+
continue;
|
|
602
|
+
discoveredUrls.add(normalized);
|
|
506
603
|
}
|
|
507
604
|
catch {
|
|
508
605
|
continue;
|
|
@@ -510,11 +607,21 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
510
607
|
}
|
|
511
608
|
}
|
|
512
609
|
if (discoveredUrls.size > 0) {
|
|
513
|
-
|
|
610
|
+
const candidates = Array.from(discoveredUrls);
|
|
611
|
+
// Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
|
|
612
|
+
const shuffled = fisherYatesSample(candidates, candidates.length);
|
|
613
|
+
const remaining = discoveryBudget === 0 ? Infinity : discoveryBudget - pages.length;
|
|
614
|
+
const toFetch = remaining === Infinity ? shuffled : shuffled.slice(0, remaining);
|
|
615
|
+
await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
|
|
616
|
+
if (budgetExceeded(byteBudget))
|
|
617
|
+
return;
|
|
514
618
|
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
515
619
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
620
|
+
byteBudget.used += result.html.length;
|
|
516
621
|
pages.push(result);
|
|
517
622
|
}
|
|
623
|
+
if (delayMs > 0)
|
|
624
|
+
await sleep(delayMs);
|
|
518
625
|
});
|
|
519
626
|
}
|
|
520
627
|
}
|
|
@@ -647,7 +754,14 @@ export async function auditSource(source, options) {
|
|
|
647
754
|
hubPagesMaxSiblings: options?.rules?.hubPagesMaxSiblings ?? DEFAULTS.hubPagesMaxSiblings,
|
|
648
755
|
titleOverlapThreshold: options?.rules?.titleOverlapThreshold ?? DEFAULTS.titleOverlapThreshold,
|
|
649
756
|
keywordCollisionMinShared: options?.rules?.keywordCollisionMinShared ?? DEFAULTS.keywordCollisionMinShared,
|
|
650
|
-
templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages
|
|
757
|
+
templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
|
|
758
|
+
answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
|
|
759
|
+
citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
|
|
760
|
+
citableFactsTarget: options?.rules?.citableFactsTarget ?? DEFAULTS.citableFactsTarget,
|
|
761
|
+
freshnessMaxStaleDays: options?.rules?.freshnessMaxStaleDays ?? DEFAULTS.freshnessMaxStaleDays,
|
|
762
|
+
modularityMaxParagraphWords: options?.rules?.modularityMaxParagraphWords ?? DEFAULTS.modularityMaxParagraphWords,
|
|
763
|
+
modularityMinSelfContainedRatio: options?.rules?.modularityMinSelfContainedRatio ?? DEFAULTS.modularityMinSelfContainedRatio,
|
|
764
|
+
faqMinQuestionHeadings: options?.rules?.faqMinQuestionHeadings ?? DEFAULTS.faqMinQuestionHeadings
|
|
651
765
|
};
|
|
652
766
|
const normalizeUrlOptions = mergeNormalizeUrlOptions({
|
|
653
767
|
stripQuery: options?.rules?.stripUrlQuery ?? true,
|
|
@@ -668,7 +782,10 @@ export async function auditSource(source, options) {
|
|
|
668
782
|
ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
|
|
669
783
|
}
|
|
670
784
|
: null;
|
|
671
|
-
const
|
|
785
|
+
const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
|
|
786
|
+
const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
|
|
787
|
+
const fetchByteBudget = { used: 0, cap: maxFetchBytes };
|
|
788
|
+
const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget);
|
|
672
789
|
const loadedPages = [...loadedPagesRaw];
|
|
673
790
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
674
791
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
@@ -778,9 +895,14 @@ export async function auditSource(source, options) {
|
|
|
778
895
|
throw new Error(`Invalid regex flags "${rawFlags}" in entityPatterns for placeholder "${p.placeholder}". ` +
|
|
779
896
|
`Only the flags g, i, m, s, u, y are permitted.`);
|
|
780
897
|
}
|
|
898
|
+
// Entity patterns are used with String.replace to mask every occurrence, which
|
|
899
|
+
// requires the `g` flag. Add it if the user forgot — a silently broken "only first
|
|
900
|
+
// match masked" regex would make template-detection rules (answer-first,
|
|
901
|
+
// citable-facts) miss shared openers.
|
|
902
|
+
const normalizedFlags = rawFlags.includes("g") ? rawFlags : `${rawFlags}g`;
|
|
781
903
|
try {
|
|
782
904
|
// Flags validated against SAFE_FLAGS_RE above; pattern is from trusted local config, not HTTP input.
|
|
783
|
-
return { placeholder: p.placeholder, pattern: new RegExp(p.pattern,
|
|
905
|
+
return { placeholder: p.placeholder, pattern: new RegExp(p.pattern, normalizedFlags) }; // nosemgrep
|
|
784
906
|
}
|
|
785
907
|
catch (err) {
|
|
786
908
|
throw new Error(`Invalid regex pattern for placeholder "${p.placeholder}": ${err.message}`);
|
|
@@ -793,8 +915,9 @@ export async function auditSource(source, options) {
|
|
|
793
915
|
const allFindings = [...duplicateUrlFindings];
|
|
794
916
|
const groupScores = {};
|
|
795
917
|
const groupPageCounts = {};
|
|
918
|
+
const auditMode = options?.mode ?? "full";
|
|
796
919
|
// Site-wide rules (run once, outside group loop)
|
|
797
|
-
if (sitemapUrlSet && sitemapUrlSet.size > 0) {
|
|
920
|
+
if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
|
|
798
921
|
const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
|
|
799
922
|
allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
800
923
|
if (robotsTxtContent) {
|
|
@@ -802,13 +925,24 @@ export async function auditSource(source, options) {
|
|
|
802
925
|
allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
803
926
|
}
|
|
804
927
|
}
|
|
928
|
+
// AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
|
|
929
|
+
// and robots-compliance); page-group rule lists govern per-page AEO rules only.
|
|
930
|
+
const llmsFindings = await llmsTxtRule(source, { timeoutMs });
|
|
931
|
+
allFindings.push(...llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
932
|
+
if (robotsTxtContent) {
|
|
933
|
+
const crawlerFindings = crawlerAccessRule(robotsTxtContent);
|
|
934
|
+
allFindings.push(...crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
935
|
+
}
|
|
805
936
|
// Data source comparison rules
|
|
806
937
|
if (options?.dataSource?.records && options.dataSource.records.length > 0) {
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
...
|
|
810
|
-
|
|
811
|
-
|
|
938
|
+
if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
|
|
939
|
+
const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
|
|
940
|
+
allFindings.push(...dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
941
|
+
}
|
|
942
|
+
if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
|
|
943
|
+
const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
|
|
944
|
+
allFindings.push(...dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
945
|
+
}
|
|
812
946
|
}
|
|
813
947
|
for (const [groupName, groupPages] of classified) {
|
|
814
948
|
if (groupPages.length === 0)
|
|
@@ -818,7 +952,7 @@ export async function auditSource(source, options) {
|
|
|
818
952
|
continue;
|
|
819
953
|
const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
|
|
820
954
|
const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
|
|
821
|
-
const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides);
|
|
955
|
+
const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
|
|
822
956
|
allFindings.push(...findings);
|
|
823
957
|
groupPageCounts[groupName] = groupPages.length;
|
|
824
958
|
const { score } = scoreFromFindings(findings);
|