@pseolint/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/algorithms/entity-mask.d.ts +3 -0
- package/dist/algorithms/entity-mask.d.ts.map +1 -0
- package/dist/algorithms/entity-mask.js +8 -0
- package/dist/algorithms/entity-mask.js.map +1 -0
- package/dist/algorithms/entity-mask.test.d.ts +2 -0
- package/dist/algorithms/entity-mask.test.d.ts.map +1 -0
- package/dist/algorithms/entity-mask.test.js +23 -0
- package/dist/algorithms/entity-mask.test.js.map +1 -0
- package/dist/algorithms/simhash.d.ts +4 -0
- package/dist/algorithms/simhash.d.ts.map +1 -0
- package/dist/algorithms/simhash.js +64 -0
- package/dist/algorithms/simhash.js.map +1 -0
- package/dist/algorithms/simhash.test.d.ts +2 -0
- package/dist/algorithms/simhash.test.d.ts.map +1 -0
- package/dist/algorithms/simhash.test.js +23 -0
- package/dist/algorithms/simhash.test.js.map +1 -0
- package/dist/algorithms/tf-idf.d.ts +8 -0
- package/dist/algorithms/tf-idf.d.ts.map +1 -0
- package/dist/algorithms/tf-idf.js +55 -0
- package/dist/algorithms/tf-idf.js.map +1 -0
- package/dist/auditor.d.ts +3 -0
- package/dist/auditor.d.ts.map +1 -0
- package/dist/auditor.js +730 -0
- package/dist/auditor.js.map +1 -0
- package/dist/auditor.test.d.ts +2 -0
- package/dist/auditor.test.d.ts.map +1 -0
- package/dist/auditor.test.js +134 -0
- package/dist/auditor.test.js.map +1 -0
- package/dist/enrich-findings.d.ts +9 -0
- package/dist/enrich-findings.d.ts.map +1 -0
- package/dist/enrich-findings.js +436 -0
- package/dist/enrich-findings.js.map +1 -0
- package/dist/formatters/console.d.ts +6 -0
- package/dist/formatters/console.d.ts.map +1 -0
- package/dist/formatters/console.js +237 -0
- package/dist/formatters/console.js.map +1 -0
- package/dist/formatters/html.d.ts +3 -0
- package/dist/formatters/html.d.ts.map +1 -0
- package/dist/formatters/html.js +170 -0
- package/dist/formatters/html.js.map +1 -0
- package/dist/formatters/index.d.ts +6 -0
- package/dist/formatters/index.d.ts.map +1 -0
- package/dist/formatters/index.js +5 -0
- package/dist/formatters/index.js.map +1 -0
- package/dist/formatters/json.d.ts +3 -0
- package/dist/formatters/json.d.ts.map +1 -0
- package/dist/formatters/json.js +4 -0
- package/dist/formatters/json.js.map +1 -0
- package/dist/formatters/markdown.d.ts +3 -0
- package/dist/formatters/markdown.d.ts.map +1 -0
- package/dist/formatters/markdown.js +93 -0
- package/dist/formatters/markdown.js.map +1 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +45 -0
- package/dist/index.js.map +1 -0
- package/dist/page-classifier.d.ts +4 -0
- package/dist/page-classifier.d.ts.map +1 -0
- package/dist/page-classifier.js +133 -0
- package/dist/page-classifier.js.map +1 -0
- package/dist/parser.d.ts +3 -0
- package/dist/parser.d.ts.map +1 -0
- package/dist/parser.js +131 -0
- package/dist/parser.js.map +1 -0
- package/dist/parser.test.d.ts +2 -0
- package/dist/parser.test.d.ts.map +1 -0
- package/dist/parser.test.js +37 -0
- package/dist/parser.test.js.map +1 -0
- package/dist/renderer.d.ts +15 -0
- package/dist/renderer.d.ts.map +1 -0
- package/dist/renderer.js +124 -0
- package/dist/renderer.js.map +1 -0
- package/dist/rule-references.d.ts +2 -0
- package/dist/rule-references.d.ts.map +1 -0
- package/dist/rule-references.js +35 -0
- package/dist/rule-references.js.map +1 -0
- package/dist/rules/cannibal/keyword-collision.d.ts +3 -0
- package/dist/rules/cannibal/keyword-collision.d.ts.map +1 -0
- package/dist/rules/cannibal/keyword-collision.js +25 -0
- package/dist/rules/cannibal/keyword-collision.js.map +1 -0
- package/dist/rules/cannibal/title-overlap.d.ts +3 -0
- package/dist/rules/cannibal/title-overlap.d.ts.map +1 -0
- package/dist/rules/cannibal/title-overlap.js +43 -0
- package/dist/rules/cannibal/title-overlap.js.map +1 -0
- package/dist/rules/cannibal/url-pattern.d.ts +3 -0
- package/dist/rules/cannibal/url-pattern.d.ts.map +1 -0
- package/dist/rules/cannibal/url-pattern.js +48 -0
- package/dist/rules/cannibal/url-pattern.js.map +1 -0
- package/dist/rules/content/eeat-signals.d.ts +3 -0
- package/dist/rules/content/eeat-signals.d.ts.map +1 -0
- package/dist/rules/content/eeat-signals.js +46 -0
- package/dist/rules/content/eeat-signals.js.map +1 -0
- package/dist/rules/content/heading-uniqueness.d.ts +3 -0
- package/dist/rules/content/heading-uniqueness.d.ts.map +1 -0
- package/dist/rules/content/heading-uniqueness.js +56 -0
- package/dist/rules/content/heading-uniqueness.js.map +1 -0
- package/dist/rules/content/meta-uniqueness.d.ts +3 -0
- package/dist/rules/content/meta-uniqueness.d.ts.map +1 -0
- package/dist/rules/content/meta-uniqueness.js +28 -0
- package/dist/rules/content/meta-uniqueness.js.map +1 -0
- package/dist/rules/content/missing-author.d.ts +3 -0
- package/dist/rules/content/missing-author.d.ts.map +1 -0
- package/dist/rules/content/missing-author.js +26 -0
- package/dist/rules/content/missing-author.js.map +1 -0
- package/dist/rules/content/unique-value.d.ts +3 -0
- package/dist/rules/content/unique-value.d.ts.map +1 -0
- package/dist/rules/content/unique-value.js +26 -0
- package/dist/rules/content/unique-value.js.map +1 -0
- package/dist/rules/links/cluster-connectivity.d.ts +7 -0
- package/dist/rules/links/cluster-connectivity.d.ts.map +1 -0
- package/dist/rules/links/cluster-connectivity.js +73 -0
- package/dist/rules/links/cluster-connectivity.js.map +1 -0
- package/dist/rules/links/cluster-key.d.ts +3 -0
- package/dist/rules/links/cluster-key.d.ts.map +1 -0
- package/dist/rules/links/cluster-key.js +22 -0
- package/dist/rules/links/cluster-key.js.map +1 -0
- package/dist/rules/links/dead-ends.d.ts +3 -0
- package/dist/rules/links/dead-ends.d.ts.map +1 -0
- package/dist/rules/links/dead-ends.js +13 -0
- package/dist/rules/links/dead-ends.js.map +1 -0
- package/dist/rules/links/hub-pages.d.ts +7 -0
- package/dist/rules/links/hub-pages.d.ts.map +1 -0
- package/dist/rules/links/hub-pages.js +73 -0
- package/dist/rules/links/hub-pages.js.map +1 -0
- package/dist/rules/links/link-depth.d.ts +3 -0
- package/dist/rules/links/link-depth.d.ts.map +1 -0
- package/dist/rules/links/link-depth.js +46 -0
- package/dist/rules/links/link-depth.js.map +1 -0
- package/dist/rules/links/orphan-pages.d.ts +3 -0
- package/dist/rules/links/orphan-pages.d.ts.map +1 -0
- package/dist/rules/links/orphan-pages.js +19 -0
- package/dist/rules/links/orphan-pages.js.map +1 -0
- package/dist/rules/schema/consistency.d.ts +3 -0
- package/dist/rules/schema/consistency.d.ts.map +1 -0
- package/dist/rules/schema/consistency.js +44 -0
- package/dist/rules/schema/consistency.js.map +1 -0
- package/dist/rules/schema/json-ld-valid.d.ts +3 -0
- package/dist/rules/schema/json-ld-valid.d.ts.map +1 -0
- package/dist/rules/schema/json-ld-valid.js +47 -0
- package/dist/rules/schema/json-ld-valid.js.map +1 -0
- package/dist/rules/schema/required-fields.d.ts +3 -0
- package/dist/rules/schema/required-fields.d.ts.map +1 -0
- package/dist/rules/schema/required-fields.js +60 -0
- package/dist/rules/schema/required-fields.js.map +1 -0
- package/dist/rules/spam/boilerplate-ratio.d.ts +3 -0
- package/dist/rules/spam/boilerplate-ratio.d.ts.map +1 -0
- package/dist/rules/spam/boilerplate-ratio.js +50 -0
- package/dist/rules/spam/boilerplate-ratio.js.map +1 -0
- package/dist/rules/spam/doorway-pattern.d.ts +4 -0
- package/dist/rules/spam/doorway-pattern.d.ts.map +1 -0
- package/dist/rules/spam/doorway-pattern.js +47 -0
- package/dist/rules/spam/doorway-pattern.js.map +1 -0
- package/dist/rules/spam/entity-swap.d.ts +7 -0
- package/dist/rules/spam/entity-swap.d.ts.map +1 -0
- package/dist/rules/spam/entity-swap.js +26 -0
- package/dist/rules/spam/entity-swap.js.map +1 -0
- package/dist/rules/spam/near-duplicate.d.ts +11 -0
- package/dist/rules/spam/near-duplicate.d.ts.map +1 -0
- package/dist/rules/spam/near-duplicate.js +25 -0
- package/dist/rules/spam/near-duplicate.js.map +1 -0
- package/dist/rules/spam/publication-velocity.d.ts +3 -0
- package/dist/rules/spam/publication-velocity.d.ts.map +1 -0
- package/dist/rules/spam/publication-velocity.js +25 -0
- package/dist/rules/spam/publication-velocity.js.map +1 -0
- package/dist/rules/spam/template-coverage.d.ts +3 -0
- package/dist/rules/spam/template-coverage.d.ts.map +1 -0
- package/dist/rules/spam/template-coverage.js +87 -0
- package/dist/rules/spam/template-coverage.js.map +1 -0
- package/dist/rules/spam/template-diversity.d.ts +3 -0
- package/dist/rules/spam/template-diversity.d.ts.map +1 -0
- package/dist/rules/spam/template-diversity.js +19 -0
- package/dist/rules/spam/template-diversity.js.map +1 -0
- package/dist/rules/spam/thin-content.d.ts +6 -0
- package/dist/rules/spam/thin-content.d.ts.map +1 -0
- package/dist/rules/spam/thin-content.js +22 -0
- package/dist/rules/spam/thin-content.js.map +1 -0
- package/dist/rules/tech/canonical-consistency.d.ts +4 -0
- package/dist/rules/tech/canonical-consistency.d.ts.map +1 -0
- package/dist/rules/tech/canonical-consistency.js +78 -0
- package/dist/rules/tech/canonical-consistency.js.map +1 -0
- package/dist/rules/tech/canonical-noindex-conflict.d.ts +3 -0
- package/dist/rules/tech/canonical-noindex-conflict.d.ts.map +1 -0
- package/dist/rules/tech/canonical-noindex-conflict.js +27 -0
- package/dist/rules/tech/canonical-noindex-conflict.js.map +1 -0
- package/dist/rules/tech/hreflang-consistency.d.ts +3 -0
- package/dist/rules/tech/hreflang-consistency.d.ts.map +1 -0
- package/dist/rules/tech/hreflang-consistency.js +99 -0
- package/dist/rules/tech/hreflang-consistency.js.map +1 -0
- package/dist/rules/tech/og-completeness.d.ts +3 -0
- package/dist/rules/tech/og-completeness.d.ts.map +1 -0
- package/dist/rules/tech/og-completeness.js +35 -0
- package/dist/rules/tech/og-completeness.js.map +1 -0
- package/dist/rules/tech/redirect-chain.d.ts +3 -0
- package/dist/rules/tech/redirect-chain.d.ts.map +1 -0
- package/dist/rules/tech/redirect-chain.js +20 -0
- package/dist/rules/tech/redirect-chain.js.map +1 -0
- package/dist/rules/tech/robots-noindex-conflict.d.ts +3 -0
- package/dist/rules/tech/robots-noindex-conflict.d.ts.map +1 -0
- package/dist/rules/tech/robots-noindex-conflict.js +30 -0
- package/dist/rules/tech/robots-noindex-conflict.js.map +1 -0
- package/dist/rules/tech/robots-sitemap-presence.d.ts +3 -0
- package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -0
- package/dist/rules/tech/robots-sitemap-presence.js +61 -0
- package/dist/rules/tech/robots-sitemap-presence.js.map +1 -0
- package/dist/rules/tech/sitemap-completeness.d.ts +3 -0
- package/dist/rules/tech/sitemap-completeness.d.ts.map +1 -0
- package/dist/rules/tech/sitemap-completeness.js +40 -0
- package/dist/rules/tech/sitemap-completeness.js.map +1 -0
- package/dist/rules/tech/soft-404.d.ts +3 -0
- package/dist/rules/tech/soft-404.d.ts.map +1 -0
- package/dist/rules/tech/soft-404.js +24 -0
- package/dist/rules/tech/soft-404.js.map +1 -0
- package/dist/types.d.ts +170 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/url-normalize.d.ts +10 -0
- package/dist/url-normalize.d.ts.map +1 -0
- package/dist/url-normalize.js +52 -0
- package/dist/url-normalize.js.map +1 -0
- package/package.json +46 -0
package/dist/auditor.js
ADDED
|
@@ -0,0 +1,730 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { readdir, readFile, stat } from "node:fs/promises";
|
|
3
|
+
import { extname, join, resolve } from "node:path";
|
|
4
|
+
import { parseHtmlPage } from "./parser.js";
|
|
5
|
+
import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
|
|
6
|
+
import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
|
|
7
|
+
import { headingUniquenessRule } from "./rules/content/heading-uniqueness.js";
|
|
8
|
+
import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
|
|
9
|
+
import { missingAuthorRule } from "./rules/content/missing-author.js";
|
|
10
|
+
import { uniqueValueRule } from "./rules/content/unique-value.js";
|
|
11
|
+
import { boilerplateRatioRule } from "./rules/spam/boilerplate-ratio.js";
|
|
12
|
+
import { doorwayPatternRule } from "./rules/spam/doorway-pattern.js";
|
|
13
|
+
import { entitySwapRule } from "./rules/spam/entity-swap.js";
|
|
14
|
+
import { nearDuplicateRule } from "./rules/spam/near-duplicate.js";
|
|
15
|
+
import { publicationVelocityRule } from "./rules/spam/publication-velocity.js";
|
|
16
|
+
import { templateDiversityRule } from "./rules/spam/template-diversity.js";
|
|
17
|
+
import { thinContentRule } from "./rules/spam/thin-content.js";
|
|
18
|
+
import { deadEndsRule } from "./rules/links/dead-ends.js";
|
|
19
|
+
import { linkDepthRule } from "./rules/links/link-depth.js";
|
|
20
|
+
import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
|
|
21
|
+
import { hubPagesRule } from "./rules/links/hub-pages.js";
|
|
22
|
+
import { orphanPagesRule } from "./rules/links/orphan-pages.js";
|
|
23
|
+
import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
|
|
24
|
+
import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
|
|
25
|
+
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
26
|
+
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
27
|
+
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
28
|
+
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
29
|
+
import { redirectChainRule } from "./rules/tech/redirect-chain.js";
|
|
30
|
+
import { soft404Rule } from "./rules/tech/soft-404.js";
|
|
31
|
+
import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
|
|
32
|
+
import { requiredFieldsRule } from "./rules/schema/required-fields.js";
|
|
33
|
+
import { schemaConsistencyRule } from "./rules/schema/consistency.js";
|
|
34
|
+
import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
|
|
35
|
+
import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
|
|
36
|
+
import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
|
|
37
|
+
import { templateCoverageRule } from "./rules/spam/template-coverage.js";
|
|
38
|
+
import { classifyPages, isRuleEnabled } from "./page-classifier.js";
|
|
39
|
+
import { RULE_REFERENCES } from "./rule-references.js";
|
|
40
|
+
import { enrichFindings } from "./enrich-findings.js";
|
|
41
|
+
/**
 * Built-in rule settings. Each value is handed to the rule of the matching
 * name by runRulesOnPages unless a group override replaces it.
 */
const DEFAULTS = {
    // spam/*
    nearDuplicateThreshold: 0.85,
    entitySwapThreshold: 0.95,
    thinContentMinWords: 300,
    publicationVelocityMaxPerDay: 100,
    boilerplateMaxRatio: 0.7,
    templateDiversityMinUniqueRatio: 0.35,
    // content/*
    uniqueValueMinWords: 100,
    metaUniquenessMinJaccard: 0.9,
    // links/*
    linkDepthMaxClicks: 3,
    hubPagesMinSiblings: 4,
    hubPagesMaxSiblings: 50,
    // cannibal/*
    titleOverlapThreshold: 0.8,
    keywordCollisionMinShared: 6,
    // spam/template-coverage
    templateCoverageMinPages: 5,
};
|
|
57
|
+
/**
 * Share of each rule category in the composite score computed by
 * scoreFromFindings. The six scored categories add up to 1.
 */
const CATEGORY_WEIGHTS = {
    spam: 0.4,
    content: 0.25,
    links: 0.15,
    tech: 0.1,
    schema: 0.05,
    cannibal: 0.05,
    /** Dedup / crawl hygiene; does not affect composite score. */
    audit: 0,
};
|
|
67
|
+
/**
 * Default entity patterns: each entry pairs a placeholder token with the
 * regular expression whose matches it stands for. Both expressions carry the
 * global flag; the state-name pattern is also case-insensitive.
 */
const DEFAULT_ENTITY_PATTERNS = [
    {
        placeholder: "[STATE]",
        // Whole-word match on any of the 50 US state names.
        pattern: new RegExp("\\b(" + [
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
            "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
            "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
            "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri",
            "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
            "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
            "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
            "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
            "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming",
        ].join("|") + ")\\b", "gi"),
    },
    // Any standalone run of exactly five digits.
    { placeholder: "[ZIP]", pattern: new RegExp("\\b\\d{5}\\b", "g") },
];
|
|
74
|
+
/**
 * Applies per-group overrides on top of a base rule-settings object.
 *
 * @param {object} baseRules - Settings to start from. Never mutated.
 * @param {object} [overrides] - Object whose values are objects of setting
 *   overrides. The outer keys are ignored; later groups win over earlier ones.
 * @returns {object} `baseRules` itself when there are no overrides, otherwise
 *   a shallow copy with every recognised override applied.
 */
function resolveGroupRules(baseRules, overrides) {
    if (!overrides) {
        return baseRules;
    }
    const result = { ...baseRules };
    for (const values of Object.values(overrides)) {
        // A group without settings contributes nothing instead of throwing.
        if (values == null) {
            continue;
        }
        for (const [key, value] of Object.entries(values)) {
            // Own keys only: `in` would also accept inherited names such as
            // "toString" or "__proto__", letting an override shadow prototype
            // members or replace the prototype of the result.
            if (Object.hasOwn(result, key)) {
                result[key] = value;
            }
        }
    }
    return result;
}
|
|
87
|
+
/**
 * Runs every enabled rule against one group of pages and returns the findings
 * in rule order.
 *
 * Each finding is tagged with the group name (omitted for the "__default"
 * group) and a reference taken from the finding itself or RULE_REFERENCES.
 *
 * @param pages - Pages of the group under audit.
 * @param resolvedRules - Rule settings (thresholds, limits) for this group.
 * @param isEnabled - Predicate taking a rule id such as "spam/thin-content".
 * @param groupName - Group label; "__default" means "no group".
 * @param knownUrls - Passed to the link and canonical rules.
 * @param adjacency - Passed to the link-depth rule.
 * @param inbound - Passed to the orphan, link-depth and robots rules.
 * @param rootUrl - Root URL; link-depth is skipped when it is falsy.
 * @param normalizeUrlOptions - Passed to the canonical and hreflang rules.
 * @param source - Not used by this function.
 * @param entityPatterns - Patterns passed to the entity-aware rules.
 * @returns Array of tagged findings.
 */
function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns) {
    const findings = [];
    const group = groupName === "__default" ? undefined : groupName;
    const tag = (results) => results.map((result) => ({
        ...result,
        group,
        ref: result.ref ?? RULE_REFERENCES[result.ruleId],
    }));
    // Evaluates `produce` and records its findings only when the rule is on.
    const collect = (ruleId, produce) => {
        if (isEnabled(ruleId)) {
            findings.push(...tag(produce()));
        }
    };

    // Spam rules — the first three always run because doorway-pattern needs
    // their cross-page data; their findings are still gated by isEnabled.
    const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
    collect("spam/near-duplicate", () => nearDuplicate.findings);
    const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
    collect("spam/entity-swap", () => entitySwap.findings);
    const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
    collect("spam/thin-content", () => thinContent.findings);
    collect("spam/doorway-pattern", () => doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages));
    collect("spam/publication-velocity", () => publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay));
    collect("spam/boilerplate-ratio", () => boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio));
    collect("spam/template-diversity", () => templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio));
    collect("spam/template-coverage", () => templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages));

    // Content rules
    collect("content/unique-value", () => uniqueValueRule(pages, resolvedRules.uniqueValueMinWords));
    collect("content/heading-uniqueness", () => headingUniquenessRule(pages, entityPatterns));
    collect("content/meta-uniqueness", () => metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard));
    collect("content/missing-author", () => missingAuthorRule(pages));
    collect("content/eeat-signals", () => eeatSignalsRule(pages));

    // Link rules — use the global link graph
    collect("links/orphan-pages", () => orphanPagesRule(pages, inbound, rootUrl));
    collect("links/dead-ends", () => deadEndsRule(pages, knownUrls, rootUrl));
    // Link depth is measured from the root, so it is skipped without one.
    collect("links/link-depth", () => (rootUrl
        ? linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound)
        : []));
    collect("links/cluster-connectivity", () => clusterConnectivityRule(pages, knownUrls));
    collect("links/hub-pages", () => hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings));

    // Tech rules
    collect("tech/canonical-consistency", () => canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions));
    collect("tech/canonical-noindex-conflict", () => canonicalNoindexConflictRule(pages, normalizeUrlOptions));
    collect("tech/robots-noindex-conflict", () => robotsNoindexConflictRule(pages, inbound));
    collect("tech/redirect-chain", () => redirectChainRule(pages));
    collect("tech/soft-404", () => soft404Rule(pages));
    collect("tech/og-completeness", () => ogCompletenessRule(pages));
    collect("tech/hreflang-consistency", () => hreflangConsistencyRule(pages, normalizeUrlOptions));

    // Schema rules
    collect("schema/json-ld-valid", () => jsonLdValidRule(pages));
    collect("schema/required-fields", () => requiredFieldsRule(pages));
    collect("schema/consistency", () => schemaConsistencyRule(pages));

    // Cannibal rules
    collect("cannibal/title-overlap", () => titleOverlapRule(pages, entityPatterns, resolvedRules.titleOverlapThreshold));
    collect("cannibal/keyword-collision", () => keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared));
    collect("cannibal/url-pattern", () => urlPatternRule(pages));

    return findings;
}
|
|
200
|
+
/**
 * Returns the SHA-256 digest of an HTML string as lowercase hex.
 *
 * @param {string} html - Markup to fingerprint; encoded as UTF-8.
 * @returns {string} 64-character hexadecimal digest.
 */
function hashHtml(html) {
    const hasher = createHash("sha256");
    hasher.update(html, "utf8");
    return hasher.digest("hex");
}
|
|
203
|
+
/**
 * Turns a list of findings into per-category scores and a composite score.
 *
 * Every finding adds a severity-dependent penalty to the category named by
 * the prefix of its rule id ("spam/thin-content" -> "spam"); each category is
 * capped at 100. The composite is the CATEGORY_WEIGHTS-weighted sum of the
 * categories, capped at 100 and rounded.
 *
 * @param {Array<{ruleId: string, severity: string}>} findings
 * @returns {{score: number, categoryScores: object}} `categoryScores` covers
 *   the six scored categories; the "audit" bucket is tracked but not returned.
 */
function scoreFromFindings(findings) {
    const severityWeights = {
        critical: 40,
        error: 25,
        warning: 12,
        info: 5,
    };
    const raw = {
        spam: 0,
        content: 0,
        links: 0,
        tech: 0,
        schema: 0,
        cannibal: 0,
        audit: 0,
    };
    for (const finding of findings) {
        const category = finding.ruleId.split("/")[0];
        // Own keys only, so a rule id such as "toString/x" cannot reach
        // members inherited from Object.prototype.
        if (!Object.hasOwn(raw, category)) {
            continue;
        }
        // An unrecognised severity contributes nothing; adding `undefined`
        // would turn the category (and the composite score) into NaN.
        const penalty = Object.hasOwn(severityWeights, finding.severity)
            ? severityWeights[finding.severity]
            : 0;
        raw[category] = Math.min(100, raw[category] + penalty);
    }
    const weighted = raw.spam * CATEGORY_WEIGHTS.spam +
        raw.content * CATEGORY_WEIGHTS.content +
        raw.links * CATEGORY_WEIGHTS.links +
        raw.tech * CATEGORY_WEIGHTS.tech +
        raw.schema * CATEGORY_WEIGHTS.schema +
        raw.cannibal * CATEGORY_WEIGHTS.cannibal +
        raw.audit * CATEGORY_WEIGHTS.audit;
    return {
        score: Math.round(Math.min(100, weighted)),
        categoryScores: {
            spam: raw.spam,
            content: raw.content,
            links: raw.links,
            tech: raw.tech,
            schema: raw.schema,
            cannibal: raw.cannibal,
        },
    };
}
|
|
245
|
+
/**
 * Recursively lists the `.html` / `.htm` files below a directory.
 *
 * @param {string} directory - Directory to scan.
 * @returns {Promise<string[]>} Paths built by joining `directory` with each
 *   entry name, in directory-listing order; the extension check ignores case.
 */
async function collectHtmlFiles(directory) {
    const htmlExtensions = [".html", ".htm"];
    const entries = await readdir(directory, { withFileTypes: true });
    const nested = await Promise.all(entries.map(async (entry) => {
        const fullPath = join(directory, entry.name);
        if (entry.isDirectory()) {
            return collectHtmlFiles(fullPath);
        }
        const isHtml = htmlExtensions.includes(extname(entry.name).toLowerCase());
        return isHtml ? [fullPath] : [];
    }));
    return nested.flat();
}
|
|
260
|
+
/**
 * Best-effort GET that never throws. Despite the name it makes a single
 * attempt; any failure is reported as `null`.
 *
 * @param {string} url - Address to request.
 * @param {number} timeoutMs - Abort the request after this many milliseconds.
 * @returns {Promise<{text: string, contentType: string} | null>} Body text
 *   and lower-cased content type ("" when the header is absent), or `null`
 *   on a network error, timeout or non-2xx status.
 */
async function fetchWithRetry(url, timeoutMs) {
    try {
        const signal = AbortSignal.timeout(timeoutMs);
        const response = await fetch(url, { signal });
        if (response.ok) {
            const text = await response.text();
            const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
            return { text, contentType };
        }
        return null;
    }
    catch {
        return null;
    }
}
|
|
275
|
+
/**
 * Fetches a page while following redirects by hand so the chain can be
 * recorded.
 *
 * At most 10 requests are made. Each 3xx response with a `Location` header
 * appends the current URL to the chain and moves on to the resolved target.
 * The first non-3xx response (whatever its status) is returned with its body.
 *
 * @param {string} url - Address originally requested.
 * @param {number} timeoutMs - Per-request timeout in milliseconds.
 * @returns {Promise<object | null>} `{ url, html, httpMeta }`, or `null` when
 *   a request or body read fails, a redirect has no usable `Location`, or the
 *   hop limit is exhausted.
 */
async function fetchPageWithMeta(url, timeoutMs) {
    const maxRequests = 10;
    const redirectChain = [];
    let currentUrl = url;
    let requestsMade = 0;
    while (requestsMade < maxRequests) {
        requestsMade += 1;
        let response;
        try {
            response = await fetch(currentUrl, {
                redirect: "manual",
                signal: AbortSignal.timeout(timeoutMs),
            });
        }
        catch {
            return null;
        }
        const status = response.status;
        const isRedirect = status >= 300 && status < 400;
        if (!isRedirect) {
            let html;
            try {
                html = await response.text();
            }
            catch {
                return null;
            }
            const httpMeta = {
                statusCode: status,
                finalUrl: currentUrl,
                redirectChain,
                xRobotsTag: response.headers.get("x-robots-tag") ?? "",
                linkHeader: response.headers.get("link") ?? "",
            };
            return { url, html, httpMeta };
        }
        const location = response.headers.get("location");
        if (!location) {
            // A redirect without a target cannot be followed.
            return null;
        }
        redirectChain.push(currentUrl);
        try {
            // Location may be relative; resolve it against the current hop.
            currentUrl = new URL(location, currentUrl).href;
        }
        catch {
            return null;
        }
    }
    return null;
}
|
|
324
|
+
/**
 * Fetches a URL and fails loudly when it cannot be read.
 *
 * @param {string} url - Address to request.
 * @param {number} timeoutMs - Abort the request after this many milliseconds.
 * @returns {Promise<{text: string, contentType: string}>} Body text and
 *   lower-cased content type ("" when the header is absent).
 * @throws {Error} On a non-2xx status; network and timeout errors from
 *   `fetch` propagate unchanged.
 */
async function fetchTextStrict(url, timeoutMs) {
    const signal = AbortSignal.timeout(timeoutMs);
    const response = await fetch(url, { signal });
    if (response.ok) {
        const text = await response.text();
        const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
        return { text, contentType };
    }
    throw new Error(`Failed to fetch source: ${response.status} ${response.statusText}`);
}
|
|
334
|
+
/**
 * Calls `fn` once per item with at most `limit` calls in flight.
 *
 * Workers pull the next unclaimed index from a shared cursor, so items are
 * started in array order. Results are discarded; a rejection from `fn`
 * rejects the returned promise.
 *
 * @param {Array} items - Work items.
 * @param {number} limit - Maximum number of concurrent calls. Values below 1
 *   (or NaN) fall back to 1 so that work is never silently skipped.
 * @param {(item: any) => Promise<void>} fn - Async task run for each item.
 * @returns {Promise<void>} Resolves once every item has been processed.
 */
async function runWithConcurrency(items, limit, fn) {
    let index = 0;
    async function next() {
        while (index < items.length) {
            const current = index;
            index += 1;
            await fn(items[current]);
        }
    }
    // A limit of 0, a negative number or NaN previously produced zero
    // workers, which resolved immediately without touching any item.
    const requested = Math.floor(Number(limit));
    const workerCount = Math.min(items.length, requested >= 1 ? requested : 1);
    const workers = Array.from({ length: workerCount }, () => next());
    await Promise.all(workers);
}
|
|
346
|
+
/**
 * Extracts the contents of every `<loc>` element from sitemap XML.
 *
 * Sitemap values are XML text, so `&amp;` and the other predefined entities
 * are decoded, and values wrapped in a CDATA section are accepted as-is.
 * Values containing whitespace are skipped.
 *
 * @param {string} xml - Sitemap or sitemap-index document.
 * @returns {string[]} URLs in document order.
 */
function parseSitemapUrls(xml) {
    const entities = { amp: "&", lt: "<", gt: ">", quot: "\"", apos: "'" };
    // Single pass, so "&amp;lt;" decodes to "&lt;" rather than "<".
    const decode = (value) => value.replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => entities[name]);
    // Group 1: CDATA payload (taken verbatim). Group 2: plain text value.
    const locPattern = /<loc>\s*(?:<!\[CDATA\[([^<]*?)\]\]>|([^<\s]+))\s*<\/loc>/gi;
    const urls = [];
    for (const match of xml.matchAll(locPattern)) {
        const url = match[1] !== undefined ? match[1].trim() : decode(match[2]);
        if (url && !/\s/.test(url)) {
            urls.push(url);
        }
    }
    return urls;
}
|
|
350
|
+
/**
 * Reports whether text contains a sitemap root tag (`<urlset` or
 * `<sitemapindex`), ignoring case.
 *
 * @param {string} text - Document body to inspect.
 * @returns {boolean}
 */
function looksLikeSitemap(text) {
    const haystack = text.toLowerCase();
    return ["<urlset", "<sitemapindex"].some((marker) => haystack.includes(marker));
}
|
|
354
|
+
/**
 * Reports whether text contains an HTML marker (`<html`, `<body` or
 * `<!doctype html`), ignoring case.
 *
 * @param {string} text - Document body to inspect.
 * @returns {boolean}
 */
function looksLikeHtml(text) {
    const haystack = text.toLowerCase();
    return ["<html", "<body", "<!doctype html"].some((marker) => haystack.includes(marker));
}
|
|
358
|
+
/**
 * Reports whether text contains a `<sitemapindex` tag, ignoring case.
 *
 * @param {string} text - Document body to inspect.
 * @returns {boolean}
 */
function isSitemapIndex(text) {
    const haystack = text.toLowerCase();
    return haystack.indexOf("<sitemapindex") !== -1;
}
|
|
361
|
+
/**
 * Tests a value against a glob pattern without building a RegExp.
 *
 * Supported wildcards:
 *   `**`  any run of characters, including `/` (a `/` directly after `**`
 *         is absorbed; a trailing `**` matches everything that remains)
 *   `*`   any run of characters that does not contain `/`
 * Every other character must match literally. Backslashes on both sides are
 * treated as `/` so Windows paths work with POSIX-style patterns.
 *
 * @param {string} pattern - Glob pattern.
 * @param {string} value - Path or URL to test.
 * @returns {boolean} True when the whole value matches the pattern.
 */
function matchGlob(pattern, value) {
    const normPattern = pattern.replace(/\\/g, "/");
    const normValue = value.replace(/\\/g, "/");
    // (pattern index, value index) states already known to fail. Without
    // this, patterns with several wildcards retry the same suffixes over and
    // over and the running time grows exponentially.
    const failed = new Set();
    const stride = normValue.length + 1;
    function match(pi, vi) {
        const state = pi * stride + vi;
        if (failed.has(state)) {
            return false;
        }
        const matched = matchFrom(pi, vi);
        if (!matched) {
            failed.add(state);
        }
        return matched;
    }
    function matchFrom(pi, vi) {
        while (pi < normPattern.length) {
            if (normPattern[pi] !== "*") {
                // Literal character: must be present and equal.
                if (vi >= normValue.length || normPattern[pi] !== normValue[vi]) {
                    return false;
                }
                pi += 1;
                vi += 1;
                continue;
            }
            if (normPattern[pi + 1] === "*") {
                pi += 2;
                // skip optional trailing separator after **
                if (normPattern[pi] === "/") {
                    pi += 1;
                }
                if (pi === normPattern.length) {
                    return true;
                }
                // try matching rest of pattern at every position in value
                for (let next = vi; next <= normValue.length; next += 1) {
                    if (match(pi, next)) {
                        return true;
                    }
                }
                return false;
            }
            // Single *: try every length from zero characters up to the end
            // of the current path segment. Consuming greedily with no way
            // back made patterns like "*.html" impossible to match.
            pi += 1;
            for (let next = vi; ; next += 1) {
                if (match(pi, next)) {
                    return true;
                }
                if (next >= normValue.length || normValue[next] === "/") {
                    return false;
                }
            }
        }
        return vi === normValue.length;
    }
    return match(0, 0);
}
|
|
404
|
+
/**
 * Reports whether a URL matches at least one ignore pattern.
 *
 * @param {string} url - URL or path to test.
 * @param {string[]} patterns - Glob patterns understood by matchGlob.
 * @returns {boolean} False for an empty pattern list.
 */
function shouldIgnore(url, patterns) {
    let ignored = false;
    if (patterns.length !== 0) {
        for (const glob of patterns) {
            if (matchGlob(glob, url)) {
                ignored = true;
                break;
            }
        }
    }
    return ignored;
}
|
|
413
|
+
/**
 * Draws a random sample without replacement using a partial Fisher–Yates
 * shuffle: only the last `n` slots of a copy are shuffled, then returned.
 *
 * @param {Iterable} items - Source items; never mutated.
 * @param {number} n - Requested sample size (expected to be an integer).
 *   Values above the number of items yield every item; values below 0
 *   yield an empty array.
 * @returns {Array} The sampled items.
 */
function fisherYatesSample(items, n) {
    const pool = [...items];
    // Clamp the request: with n > length, `length - n` went negative and
    // slice() then returned only the last (n - length) items instead of all.
    const size = Math.min(pool.length, Math.max(0, n));
    const firstPicked = pool.length - size;
    for (let i = pool.length - 1; i > 0 && i >= firstPicked; i -= 1) {
        const j = Math.floor(Math.random() * (i + 1));
        [pool[i], pool[j]] = [pool[j], pool[i]];
    }
    return pool.slice(firstPicked);
}
|
|
421
|
+
/**
 * Resolves a sitemap document to page URLs, descending into sitemap indexes.
 *
 * A plain sitemap yields its `<loc>` values directly. For a sitemap index,
 * each child not yet visited is fetched in turn; children that fail to load
 * or do not look like XML/sitemaps are skipped, the rest are expanded
 * recursively.
 *
 * @param {string} sitemapText - Body of the sitemap being processed.
 * @param {string} sitemapUrl - Its URL; added to `visited`.
 * @param {Set<string>} visited - Sitemap URLs already handled (mutated), which
 *   keeps indexes that reference each other from looping.
 * @param {number} timeoutMs - Timeout for each child fetch, in milliseconds.
 * @returns {Promise<string[]>} URLs gathered in document order.
 */
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs) {
    visited.add(sitemapUrl);
    const locs = parseSitemapUrls(sitemapText);
    if (isSitemapIndex(sitemapText)) {
        const collected = [];
        for (const childUrl of locs) {
            if (!visited.has(childUrl)) {
                const child = await fetchWithRetry(childUrl, timeoutMs);
                if (child) {
                    const isSitemapLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
                    if (isSitemapLike) {
                        const nested = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs);
                        collected.push(...nested);
                    }
                }
            }
        }
        return collected;
    }
    return locs;
}
|
|
442
|
+
/**
 * Loads the HTML pages to audit from a remote URL or a local path.
 *
 * Remote (`http://` / `https://`) sources:
 *   - The source is fetched with `fetchTextStrict`. If that fails and the URL
 *     contains "sitemap", the origin homepage is fetched instead and treated
 *     as an HTML start page.
 *   - A sitemap response is expanded with `collectUrlsFromSitemap` and every
 *     listed URL is fetched. With `crawlDiscovery`, same-origin links found
 *     on those pages that are not in the sitemap are fetched as well.
 *   - An HTML response becomes the first page. With `crawlDiscovery`,
 *     same-origin links are followed breadth-first for up to three rounds.
 *
 * Local sources: a file yields one page; a directory yields one page per
 * file returned by `collectHtmlFiles`; anything else yields no pages.
 *
 * @param {string} source - URL or filesystem path.
 * @param {number} concurrency - Limit passed to `runWithConcurrency`.
 * @param {number} timeoutMs - Timeout passed to every fetch helper.
 * @param {boolean} crawlDiscovery - Whether to follow links found in pages.
 * @returns {Promise<{pages: Array<{url: string, html: string, httpMeta?: object}>, sitemapUrls?: Set<string>}>}
 *   `sitemapUrls` is present only when the source was a sitemap.
 * @throws {Error} When the source cannot be fetched or accessed, or when a
 *   remote response is neither HTML nor a sitemap. The underlying failure is
 *   attached as `cause` where one exists.
 */
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery) {
    const hrefPattern = /href=["']([^"']+)["']/gi;
    const nonNavigablePattern = /^mailto:|^tel:|^javascript:|^data:/i;
    const nonPagePathPattern = /^\/_next\/|^\/api\/|^\/icon/i;

    // Origin of a URL, or "" when it cannot be parsed (so nothing matches it).
    const originOf = (url) => {
        try {
            return new URL(url).origin;
        }
        catch {
            return "";
        }
    };

    // Same-origin link targets of a page, with query and hash stripped so
    // they can be deduplicated. Paths matching `skipPath` are left out.
    const sameOriginLinks = (page, origin, skipPath) => {
        const links = [];
        const baseUrl = page.httpMeta?.finalUrl ?? page.url;
        for (const [, href] of page.html.matchAll(hrefPattern)) {
            if (!href || href.startsWith("#") || nonNavigablePattern.test(href))
                continue;
            let target;
            try {
                target = new URL(href, baseUrl);
            }
            catch {
                // Unparseable href: not a link we can follow.
                continue;
            }
            if (target.origin !== origin)
                continue;
            if (skipPath?.test(target.pathname))
                continue;
            target.search = "";
            target.hash = "";
            links.push(target.href);
        }
        return links;
    };

    // A discovered page is kept only when it came back with a 2xx status.
    const isSuccessful = (page) => Boolean(page && page.httpMeta)
        && page.httpMeta.statusCode >= 200
        && page.httpMeta.statusCode < 300;

    if (/^https?:\/\//i.test(source)) {
        let fetched;
        // URL of the document that was actually retrieved. Differs from
        // `source` only after falling back to the origin homepage.
        let startUrl = source;
        let fellBackToOrigin = false;
        try {
            fetched = await fetchTextStrict(source, timeoutMs);
        }
        catch (sourceError) {
            if (!source.includes("sitemap")) {
                throw new Error(`Failed to fetch source URL: ${source} — verify the URL is correct and returns a valid response.`, { cause: sourceError });
            }
            // Sitemap URL could not be fetched: fall back to crawling from
            // the origin homepage.
            try {
                startUrl = new URL(source).origin;
                fetched = await fetchTextStrict(startUrl, timeoutMs);
                fellBackToOrigin = true;
            }
            catch (fallbackError) {
                throw new Error(`Failed to fetch source URL: ${source} (and fallback to origin failed)`, { cause: fallbackError });
            }
        }
        const { text, contentType } = fetched;

        // A fallback response is the homepage, never the requested sitemap.
        const isSitemap = (contentType.includes("xml") || looksLikeSitemap(text)) && !fellBackToOrigin;
        if (isSitemap) {
            const urls = await collectUrlsFromSitemap(text, source, new Set(), timeoutMs);
            const sitemapUrls = new Set(urls);
            const pages = [];
            await runWithConcurrency(urls, concurrency, async (url) => {
                const page = await fetchPageWithMeta(url, timeoutMs);
                if (page) {
                    pages.push(page);
                }
            });
            // Crawl discovery: follow internal links to find pages that the
            // sitemap does not list (one hop only).
            if (crawlDiscovery) {
                const origin = originOf(source);
                const discovered = new Set();
                for (const page of pages) {
                    for (const link of sameOriginLinks(page, origin)) {
                        if (!sitemapUrls.has(link)) {
                            discovered.add(link);
                        }
                    }
                }
                if (discovered.size > 0) {
                    await runWithConcurrency([...discovered], concurrency, async (url) => {
                        const page = await fetchPageWithMeta(url, timeoutMs);
                        if (isSuccessful(page)) {
                            pages.push(page);
                        }
                    });
                }
            }
            return { pages, sitemapUrls };
        }

        if (contentType.includes("html") || looksLikeHtml(text)) {
            // Record the page under the URL it was really fetched from, so a
            // fallback homepage is not reported as the sitemap URL.
            const pages = [{ url: startUrl, html: text }];
            if (crawlDiscovery) {
                const origin = originOf(source);
                const visited = new Set([source, startUrl]);
                const maxDepth = 3;
                // Pages fetched in the previous round; only these are
                // scanned for new links in the next one.
                let level = [...pages];
                for (let depth = 0; depth < maxDepth; depth += 1) {
                    const frontier = new Set();
                    for (const page of level) {
                        for (const link of sameOriginLinks(page, origin, nonPagePathPattern)) {
                            if (!visited.has(link)) {
                                frontier.add(link);
                            }
                        }
                    }
                    if (frontier.size === 0)
                        break;
                    const fetchedPages = [];
                    await runWithConcurrency([...frontier], concurrency, async (url) => {
                        const page = await fetchPageWithMeta(url, timeoutMs);
                        // Mark as visited whether or not the fetch succeeded,
                        // so failing URLs are not retried every round.
                        visited.add(url);
                        if (isSuccessful(page)) {
                            fetchedPages.push(page);
                        }
                    });
                    if (fetchedPages.length === 0)
                        break;
                    pages.push(...fetchedPages);
                    level = fetchedPages;
                }
            }
            return { pages };
        }
        throw new Error(`Source URL does not look like HTML or sitemap XML: ${source}`);
    }

    const resolved = resolve(source);
    let sourceStat;
    try {
        sourceStat = await stat(resolved);
    }
    catch (statError) {
        throw new Error(`Unable to access source: ${resolved}`, { cause: statError });
    }
    if (sourceStat.isFile()) {
        return { pages: [{ url: resolved, html: await readFile(resolved, "utf-8") }] };
    }
    if (sourceStat.isDirectory()) {
        const htmlFiles = await collectHtmlFiles(resolved);
        const pages = await Promise.all(htmlFiles.map(async (filePath) => ({
            url: filePath,
            html: await readFile(filePath, "utf-8")
        })));
        return { pages };
    }
    return { pages: [] };
}
|
|
615
|
+
/**
 * Runs a full audit over a source: loads its pages, removes duplicate URLs,
 * applies ignore patterns and optional sampling, parses every page, builds
 * the internal link graph, runs the rules per page group and scores the
 * enriched findings.
 *
 * Options read here: `concurrency` (default 5), `timeout` (default 30000),
 * `ignore` (default none), `sampleSize` (default 0, meaning no sampling),
 * `rules` (threshold overrides plus `stripUrlQuery` / `stripWwwHost`),
 * `crawlDiscovery` (default on; only applies to http(s) sources),
 * `pageGroups` and `templateGenerated`.
 *
 * @param {string} source - URL or filesystem path to audit.
 * @param {object} [options] - Audit options, see above.
 * @returns {Promise<object>} Overall `score`, `categoryScores`, per-group
 *   scores and page counts (only when `pageGroups` was supplied),
 *   `pageCount`, `findings`, `templateDetected` and `rawFindingCount`.
 */
export async function auditSource(source, options) {
    const opts = options ?? {};
    const concurrency = opts.concurrency ?? 5;
    const timeoutMs = opts.timeout ?? 30000;
    const ignorePatterns = opts.ignore ?? [];
    const sampleSize = opts.sampleSize ?? 0;

    // Each threshold comes from the caller's overrides, else from DEFAULTS.
    const thresholdNames = [
        "nearDuplicateThreshold",
        "entitySwapThreshold",
        "thinContentMinWords",
        "publicationVelocityMaxPerDay",
        "boilerplateMaxRatio",
        "templateDiversityMinUniqueRatio",
        "uniqueValueMinWords",
        "metaUniquenessMinJaccard",
        "linkDepthMaxClicks",
        "hubPagesMinSiblings",
        "hubPagesMaxSiblings",
        "titleOverlapThreshold",
        "keywordCollisionMinShared",
        "templateCoverageMinPages"
    ];
    const resolvedRules = {};
    for (const name of thresholdNames) {
        resolvedRules[name] = opts.rules?.[name] ?? DEFAULTS[name];
    }
    const normalizeUrlOptions = mergeNormalizeUrlOptions({
        stripQuery: opts.rules?.stripUrlQuery ?? true,
        stripWwwHost: opts.rules?.stripWwwHost ?? false
    });

    // Link discovery only makes sense for remote sources.
    const isRemoteSource = /^https?:\/\//i.test(source);
    const crawlDiscovery = isRemoteSource && (opts.crawlDiscovery ?? true);
    const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery);
    const sitemapUrlSet = loaded.sitemapUrls;

    // Keep the first page seen per normalized URL. A later copy with a
    // different body is reported once per URL as an informational finding.
    const uniquePages = [];
    const firstDigestByUrl = new Map();
    const duplicateUrlFindings = [];
    const reportedConflicts = new Set();
    for (const page of loaded.pages) {
        const key = normalizeAuditUrl(page.url, normalizeUrlOptions);
        const digest = hashHtml(page.html);
        const firstDigest = firstDigestByUrl.get(key);
        if (firstDigest === undefined) {
            firstDigestByUrl.set(key, digest);
            uniquePages.push({ url: key, html: page.html, httpMeta: page.httpMeta });
        }
        else if (firstDigest !== digest && !reportedConflicts.has(key)) {
            reportedConflicts.add(key);
            duplicateUrlFindings.push({
                ruleId: "audit/duplicate-url",
                severity: "info",
                message: `Duplicate crawl URL ${key} appeared with different HTML bodies; only the first occurrence was audited.`,
                pageUrl: key
            });
        }
    }

    let candidates = uniquePages;
    if (ignorePatterns.length > 0) {
        candidates = uniquePages.filter((page) => !shouldIgnore(page.url, ignorePatterns));
    }
    // Sample only when a positive size smaller than the page set was requested.
    let selected = candidates;
    if (sampleSize > 0 && sampleSize < candidates.length) {
        selected = fisherYatesSample(candidates, sampleSize);
    }

    const parsedPages = selected.map(({ url, html, httpMeta }) => {
        const parsed = parseHtmlPage(html, url, { normalizeUrl: normalizeUrlOptions });
        if (httpMeta) {
            parsed.httpMeta = httpMeta;
        }
        return parsed;
    });

    // Link graph restricted to audited pages: outgoing targets per page plus
    // an inbound-link counter for every known URL.
    const knownUrls = new Set(parsedPages.map((parsed) => parsed.url));
    const indexPage = parsedPages.find((parsed) => /(^|[\\/])index\.html?$/i.test(parsed.url));
    const rootUrl = indexPage?.url ?? parsedPages[0]?.url ?? "";
    const adjacency = new Map();
    const inbound = new Map();
    for (const url of knownUrls) {
        inbound.set(url, 0);
    }
    for (const page of parsedPages) {
        const targets = new Set();
        for (const href of page.resolvedHrefs) {
            if (knownUrls.has(href)) {
                targets.add(href);
            }
        }
        adjacency.set(page.url, targets);
        for (const target of targets) {
            inbound.set(target, (inbound.get(target) ?? 0) + 1);
        }
    }

    // Classify pages into groups; only the rules enabled for a group run on it.
    const classified = classifyPages(parsedPages, opts.pageGroups);
    const allFindings = [...duplicateUrlFindings];
    const groupScores = {};
    const groupPageCounts = {};

    // Site-wide rule: runs once, outside the group loop.
    if (sitemapUrlSet && sitemapUrlSet.size > 0) {
        for (const finding of sitemapCompletenessRule(parsedPages, sitemapUrlSet)) {
            allFindings.push({ ...finding, ref: finding.ref ?? RULE_REFERENCES[finding.ruleId] });
        }
    }

    for (const [groupName, groupPages] of classified) {
        if (groupPages.length === 0)
            continue;
        const groupConfig = groupName === "__default" ? undefined : opts.pageGroups?.[groupName];
        // An explicitly empty rule list switches the whole group off.
        if (groupConfig?.rules !== undefined && groupConfig.rules.length === 0)
            continue;
        const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
        const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
        const groupFindings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS);
        allFindings.push(...groupFindings);
        groupPageCounts[groupName] = groupPages.length;
        groupScores[groupName] = scoreFromFindings(groupFindings).score;
    }

    // Enrich findings: cluster pairwise, detect templates, assign effort.
    const enriched = enrichFindings(allFindings, parsedPages, {
        templateGenerated: opts.templateGenerated,
    });
    const overall = scoreFromFindings(enriched.findings);
    let auditedPageCount = 0;
    for (const count of Object.values(groupPageCounts)) {
        auditedPageCount += count;
    }
    const hasGroups = opts.pageGroups;
    return {
        score: overall.score,
        categoryScores: overall.categoryScores,
        groupScores: hasGroups ? groupScores : undefined,
        groupPageCounts: hasGroups ? groupPageCounts : undefined,
        // Fall back to the parsed total when no group was audited.
        pageCount: auditedPageCount || parsedPages.length,
        findings: enriched.findings,
        templateDetected: enriched.templateDetected,
        rawFindingCount: enriched.rawFindingCount,
    };
}
|
|
730
|
+
//# sourceMappingURL=auditor.js.map
|