@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
 * Supported heading levels within HTML content.
 *
 * Each value is the digit of the corresponding `<h1>` … `<h6>` tag.
 */
export type HeadingLevel = 1 | 2 | 3 | 4 | 5 | 6;
|
|
5
|
+
/**
 * Represents a normalized heading node extracted from the DOM.
 */
export interface HeadingNode {
    /** Heading level taken from the tag name (1 for `<h1>`, … 6 for `<h6>`). */
    level: HeadingLevel;
    /** Heading text with inner tags removed and whitespace collapsed. */
    text: string;
    /** Zero-based position of the heading in document order. */
    index: number;
    /**
     * `index` of the nearest preceding heading with a numerically smaller level.
     * Left unset when no such heading exists.
     */
    parentIndex?: number;
}
|
|
14
|
+
/**
 * Represents content statistics for a section under a heading.
 *
 * A section is the markup between the end of one heading element and the
 * start of the next heading (or the end of the document).
 */
export interface SectionMetrics {
    /** `index` of the heading that opens this section. */
    headingIndex: number;
    /** Normalized text of the heading that opens this section. */
    headingText: string;
    /** Number of tokens in the section (stopwords and tokens of 1–2 characters are not counted). */
    words: number;
    /**
     * Sum of the page-wide frequencies of the heading's tokens divided by `words`,
     * rounded to 3 decimals; 0 when the section has no tokens.
     */
    keywordConcentration: number;
    /** True when the section has at least one token but fewer than 80. */
    thin: boolean;
    /** 0 after per-page analysis; set to a value in [0, 1] by `enrichDuplicateRisk`. */
    duplicateRisk: number;
}
|
|
25
|
+
/**
 * Raw heading analysis generated for a single URL.
 */
export interface LocalPageAnalysis {
    /** Page URL. `analyzeHeadingHealth` returns an empty string here; the caller fills it in. */
    url: string;
    /** Headings in document order, with parent links. */
    headingNodes: HeadingNode[];
    /** One entry per heading, describing the content that follows it. */
    sections: SectionMetrics[];
    /** Lowercased, normalized text of the first H1, or an empty string when the page has no H1. */
    h1Norm: string;
    /** 16-hex-character hash of the sorted, normalized, non-empty H2 texts joined with `|`. */
    h2SetHash: string;
    /** 16-hex-character hash of the heading level sequence joined with `>`. */
    patternHash: string;
    /** Human-readable findings produced by the per-page checks. */
    issues: string[];
    metrics: {
        /** Shannon entropy (bits) of the distribution of headings over the six levels, rounded to 3 decimals. */
        entropy: number;
        /** Largest heading level present; 0 when the page has no headings. */
        maxDepth: number;
        /** Mean heading level, rounded to 2 decimals; 0 when the page has no headings. */
        avgDepth: number;
        /** Heading count divided by the page token count, rounded to 4 decimals; 0 when there are no tokens. */
        headingDensity: number;
        /** Share of headings at level 1 or 2, rounded to 3 decimals. */
        fragmentation: number;
        /** Mean absolute level change between consecutive headings, rounded to 3 decimals. */
        levelVolatility: number;
        /** Number of consecutive heading pairs where the level increases by more than one. */
        hierarchySkips: number;
        /** Number of consecutive heading pairs where the level decreases by more than one. */
        reverseJumps: number;
        /** 1 when the page has no H1, otherwise 0. */
        missingH1: number;
        /** 1 when the page has more than one H1, otherwise 0. */
        multipleH1: number;
    };
}
|
|
49
|
+
/**
 * Final heading-health payload attached to a page node.
 *
 * Most numeric fields are copied from `LocalPageAnalysis.metrics` under snake_case names.
 */
export interface HeadingHealthPayload {
    /** Health score clamped to the range 0–100 (100 is best). */
    score: number;
    /** `Healthy` for a score of 80 or more, `Moderate` for 55–79, otherwise `Poor`. */
    status: 'Healthy' | 'Moderate' | 'Poor';
    /** Findings carried over from the per-page analysis. */
    issues: string[];
    /** Headings in document order. */
    map: HeadingNode[];
    /** 1 when the page has no H1, otherwise 0. */
    missing_h1: number;
    /** 1 when the page has more than one H1, otherwise 0. */
    multiple_h1: number;
    entropy: number;
    max_depth: number;
    avg_depth: number;
    heading_density: number;
    fragmentation: number;
    /** Copied from `metrics.levelVolatility`. */
    volatility: number;
    hierarchy_skips: number;
    reverse_jumps: number;
    /** Number of sections flagged as thin. */
    thin_sections: number;
    /** Number of analyzed pages sharing this page's normalized first H1; 0 when the page has no H1. */
    duplicate_h1_group: number;
    /**
     * Number of distinct normalized H1 strings (including this page's own) whose
     * Jaccard similarity to this page's H1 is at least 0.7; 0 when the page has no H1.
     */
    similar_h1_group: number;
    /** Number of analyzed pages with the same `h2SetHash` (at least 1). */
    identical_h2_set_group: number;
    /** Number of analyzed pages with the same `patternHash` (at least 1). */
    duplicate_pattern_group: number;
    /** Combined template-reuse signal in [0, 1], rounded to 3 decimals. */
    template_risk: number;
}
|
|
74
|
+
/**
 * Snapshot-level summary emitted by the plugin.
 */
export interface HeadingHealthSummary {
    /** Mean page score rounded to the nearest integer; 0 when no page was evaluated. */
    avgScore: number;
    /** Number of pages that were analyzed. */
    evaluatedPages: number;
    /** Number of evaluated pages without an H1. */
    totalMissing: number;
    /** Number of evaluated pages with more than one H1. */
    totalMultiple: number;
    /** Sum of `hierarchySkips` over all evaluated pages. */
    totalSkips: number;
    /** Sum of `reverseJumps` over all evaluated pages. */
    totalReverseJumps: number;
    /** Total number of thin sections over all evaluated pages. */
    totalThinSections: number;
    /** Mean page entropy rounded to 3 decimals; 0 when no page was evaluated. */
    avgEntropy: number;
    /** Number of evaluated pages whose status is `Poor`. */
    poorPages: number;
}
|
|
88
|
+
/**
 * Calculates token-level Jaccard similarity between two text values.
 *
 * @param a First text value.
 * @param b Second text value.
 * @returns Size of the shared token set divided by the size of the combined
 *          token set; 0 when either text yields no tokens.
 */
export declare function jaccardSimilarity(a: string, b: string): number;
|
|
92
|
+
/**
 * Performs per-page heading extraction and structural scoring signal generation.
 *
 * @param html Raw HTML of the page.
 * @param fallbackTitle Title compared against the first H1; when omitted or empty,
 *                      the document's own `<title>` element is used instead.
 * @returns The page analysis. `url` is returned as an empty string and is expected
 *          to be assigned by the caller.
 */
export declare function analyzeHeadingHealth(html: string, fallbackTitle?: string): LocalPageAnalysis;
|
|
96
|
+
/**
 * Enriches section-level duplicate risk by comparing normalized section signatures across pages.
 *
 * A signature combines the section's normalized heading text and its word count.
 * The function mutates `duplicateRisk` on every section of every page in place.
 *
 * @param pages Page analyses whose sections are updated.
 */
export declare function enrichDuplicateRisk(pages: LocalPageAnalysis[]): void;
|
|
100
|
+
/**
 * Coordinates heading analysis across graph nodes and builds report-safe payloads.
 */
export declare class HeadingHealthService {
    /**
     * Builds page-level payloads plus a snapshot summary for every eligible node.
     *
     * Nodes whose `status` is below 200 or at least 300, or that lack `html` or
     * `url`, are skipped.
     *
     * @param nodes Graph nodes to evaluate.
     * @returns `payloadsByUrl`, keyed by node URL, and the aggregated `summary`.
     */
    evaluateNodes(nodes: Array<Record<string, any>>): {
        payloadsByUrl: Map<string, HeadingHealthPayload>;
        summary: HeadingHealthSummary;
    };
    // Internal helpers; the declaration file exposes their names only.
    private collectAnalyses;
    private buildBuckets;
    private computeSimilarH1GroupSizes;
    private scoreHealth;
    private computeTemplateRisk;
}
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
/** Common English function words that are ignored when tokenizing text. */
const STOPWORDS = new Set(['the', 'and', 'for', 'with', 'from', 'that', 'this', 'your', 'about', 'into', 'over', 'under', 'are', 'was', 'were', 'can', 'has', 'have', 'had', 'you', 'our', 'out', 'all']);
/** A section with at least one token but fewer than this many is flagged as thin. */
const THIN_SECTION_WORDS = 80;
/** Matches a complete <h1>…<h6> element: group 1 is the level digit, group 2 the inner markup. */
const HEADING_PATTERN = /<h([1-6])\b[^<>]*>([\s\S]*?)<\/h\1>/gi;
/** Matches the first <title> element: group 1 is its inner markup. */
const TITLE_PATTERN = /<title\b[^<>]*>([\s\S]*?)<\/title>/i;
/**
 * Converts an HTML fragment to plain text: tags become spaces, the `&nbsp;`
 * and `&amp;` entities are decoded, whitespace runs collapse to a single
 * space and the result is trimmed.
 *
 * `&nbsp;` is decoded before `&amp;` so that an escaped `&amp;nbsp;` ends up
 * as the literal text `&nbsp;` rather than being decoded twice.
 */
const normalizeText = (input) => input
    .replace(/<[^<>]*>/g, ' ')
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&')
    .replace(/\s+/g, ' ')
    .trim();
/** Lowercased form of `normalizeText`, used for case-insensitive comparison. */
const normalizeComparable = (input) => normalizeText(input).toLowerCase();
/**
 * Splits text into lowercase alphanumeric tokens, dropping stopwords and
 * tokens of one or two characters.
 */
const tokenize = (input) => normalizeComparable(input).replace(/[^a-z0-9\s]/g, ' ').split(/\s+/).filter((token) => token.length > 2 && !STOPWORDS.has(token));
/** First 16 hex characters of the SHA-1 digest of `input`. */
const stableHash = (input) => createHash('sha1').update(input).digest('hex').slice(0, 16);
|
|
10
|
+
/**
 * Calculates token-level Jaccard similarity between two text values.
 *
 * Both inputs are tokenized and de-duplicated; the result is the number of
 * shared tokens divided by the number of distinct tokens overall. Returns 0
 * when either input yields no tokens.
 */
export function jaccardSimilarity(a, b) {
    const left = new Set(tokenize(a));
    const right = new Set(tokenize(b));
    if (left.size === 0 || right.size === 0) {
        return 0;
    }
    const shared = [...left].filter((token) => right.has(token)).length;
    const unionSize = left.size + right.size - shared;
    return shared / unionSize;
}
|
|
27
|
+
/**
 * Performs per-page heading extraction and structural scoring signal generation.
 *
 * Steps: locate heading elements, link each heading to its parent, measure
 * the content following each heading, derive structural metrics from the
 * level sequence, and collect human-readable issues. The returned `url` is
 * an empty string; callers assign it.
 *
 * @param html Raw page HTML.
 * @param fallbackTitle Title used for the H1 comparison; when falsy, the
 *                      document's <title> element is read instead.
 */
export function analyzeHeadingHealth(html, fallbackTitle) {
    // Heading elements with their character offsets in the source markup.
    const segments = Array.from(html.matchAll(HEADING_PATTERN), (found) => {
        const start = found.index || 0;
        return {
            level: Number(found[1]),
            text: normalizeText(found[2] || ''),
            start,
            end: start + found[0].length
        };
    });
    // Outline: the parent is the nearest earlier heading with a smaller level.
    const headingNodes = [];
    const ancestors = [];
    for (const [index, segment] of segments.entries()) {
        const node = { level: segment.level, text: segment.text, index };
        while (ancestors.length > 0 && ancestors[ancestors.length - 1].level >= node.level) {
            ancestors.pop();
        }
        const parent = ancestors[ancestors.length - 1];
        if (parent) {
            node.parentIndex = parent.index;
        }
        ancestors.push(node);
        headingNodes.push(node);
    }
    // Page-wide token frequencies feed the per-section keyword concentration.
    const pageWords = tokenize(html);
    const frequency = new Map();
    pageWords.forEach((word) => {
        frequency.set(word, (frequency.get(word) || 0) + 1);
    });
    const sections = segments.map((current, i) => {
        const following = segments[i + 1];
        const sectionEnd = following ? following.start : html.length;
        const wordCount = tokenize(html.slice(current.end, sectionEnd)).length;
        let concentration = 0;
        for (const token of tokenize(current.text)) {
            concentration += frequency.get(token) || 0;
        }
        return {
            headingIndex: headingNodes[i]?.index ?? i,
            headingText: current.text,
            words: wordCount,
            keywordConcentration: wordCount > 0 ? Number((concentration / wordCount).toFixed(3)) : 0,
            thin: wordCount > 0 && wordCount < THIN_SECTION_WORDS,
            duplicateRisk: 0
        };
    });
    // Distribution of headings over the six levels.
    const levelCounts = [0, 0, 0, 0, 0, 0];
    for (const node of headingNodes) {
        levelCounts[node.level - 1] += 1;
    }
    const entropyScore = Number(calculateEntropy(levelCounts).toFixed(3));
    const h1Count = levelCounts[0];
    const missingH1 = h1Count === 0 ? 1 : 0;
    const multipleH1 = h1Count > 1 ? 1 : 0;
    // Level changes between consecutive headings.
    let hierarchySkips = 0;
    let reverseJumps = 0;
    let volatilitySum = 0;
    headingNodes.slice(1).forEach((node, previousIndex) => {
        const delta = node.level - headingNodes[previousIndex].level;
        volatilitySum += Math.abs(delta);
        if (delta > 1) {
            hierarchySkips += 1;
        }
        else if (delta < -1) {
            reverseJumps += 1;
        }
    });
    const headingCount = headingNodes.length;
    const levels = headingNodes.map((node) => node.level);
    const levelTotal = levels.reduce((sum, level) => sum + level, 0);
    const topLevelCount = levels.filter((level) => level <= 2).length;
    const maxDepth = headingCount ? Math.max(...levels) : 0;
    const avgDepth = headingCount ? Number((levelTotal / headingCount).toFixed(2)) : 0;
    const headingDensity = pageWords.length ? Number((headingCount / pageWords.length).toFixed(4)) : 0;
    const fragmentation = headingCount ? Number((topLevelCount / headingCount).toFixed(3)) : 0;
    const levelVolatility = headingCount > 1 ? Number((volatilitySum / (headingCount - 1)).toFixed(3)) : 0;
    // Issues are appended in a fixed order.
    const h1Nodes = headingNodes.filter((node) => node.level === 1);
    const firstH1 = h1Nodes[0];
    const issues = [];
    if (missingH1) {
        issues.push('Missing H1');
    }
    if (multipleH1) {
        issues.push('Multiple H1 found');
    }
    if (h1Nodes.some((node) => node.text.length < 6)) {
        issues.push('Empty or near-empty H1');
    }
    const title = fallbackTitle || getTitleFromHtml(html);
    if (title && firstH1 && jaccardSimilarity(title, firstH1.text) < 0.3) {
        issues.push('H1 diverges from <title>');
    }
    if (hierarchySkips > 0) {
        issues.push(`${hierarchySkips} hierarchy skips detected`);
    }
    if (reverseJumps > 0) {
        issues.push(`${reverseJumps} reverse hierarchy jumps detected`);
    }
    // Only the first two thin sections are reported.
    sections
        .filter((section) => section.thin)
        .slice(0, 2)
        .forEach((thin) => {
        issues.push(`Thin section under "${thin.headingText || 'Untitled heading'}"`);
    });
    if (entropyScore > 2.1) {
        issues.push('High structural entropy');
    }
    if (fragmentation > 0.65) {
        issues.push('Section fragmentation is high');
    }
    // Signatures used later to group pages with matching headings.
    const h1Norm = normalizeComparable(firstH1?.text || '');
    const h2Texts = headingNodes
        .filter((node) => node.level === 2)
        .map((node) => normalizeComparable(node.text))
        .filter(Boolean)
        .sort();
    const h2SetHash = stableHash(h2Texts.join('|'));
    const patternHash = stableHash(levels.join('>'));
    return {
        url: '',
        headingNodes,
        sections,
        h1Norm,
        h2SetHash,
        patternHash,
        issues,
        metrics: {
            entropy: entropyScore,
            maxDepth,
            avgDepth,
            headingDensity,
            fragmentation,
            levelVolatility,
            hierarchySkips,
            reverseJumps,
            missingH1,
            multipleH1
        }
    };
}
|
|
154
|
+
/**
 * Enriches section-level duplicate risk by comparing normalized section signatures across pages.
 *
 * A section's signature is a hash of its normalized heading text and word
 * count. Every occurrence of a signature is counted, and each section's
 * `duplicateRisk` is set in place to min(1, (occurrences - 1) / 5), rounded
 * to 3 decimals.
 */
export function enrichDuplicateRisk(pages) {
    const signatureOf = (section) => stableHash(`${normalizeComparable(section.headingText)}:${section.words}`);
    // First pass: count how often each signature occurs.
    const occurrences = new Map();
    for (const page of pages) {
        for (const section of page.sections) {
            const signature = signatureOf(section);
            occurrences.set(signature, (occurrences.get(signature) || 0) + 1);
        }
    }
    // Second pass: translate the counts into a bounded risk value.
    for (const page of pages) {
        for (const section of page.sections) {
            const size = occurrences.get(signatureOf(section)) || 0;
            section.duplicateRisk = Number(Math.min(1, (size - 1) / 5).toFixed(3));
        }
    }
}
|
|
175
|
+
/**
 * Shannon entropy, in bits, of a list of counts.
 *
 * Zero counts contribute nothing; an empty list or an all-zero list yields 0.
 */
function calculateEntropy(values) {
    let total = 0;
    for (const value of values) {
        total += value;
    }
    if (!total) {
        return 0;
    }
    let entropy = 0;
    for (const value of values) {
        if (value === 0) {
            continue;
        }
        const probability = value / total;
        entropy = entropy - probability * Math.log2(probability);
    }
    return entropy;
}
|
|
188
|
+
/**
 * Returns the normalized text of the first <title> element in `html`,
 * or an empty string when the document has none.
 */
function getTitleFromHtml(html) {
    const found = html.match(TITLE_PATTERN);
    if (!found) {
        return '';
    }
    return normalizeText(found[1]);
}
|
|
192
|
+
/**
 * Coordinates heading analysis across graph nodes and builds report-safe payloads.
 */
export class HeadingHealthService {
    /**
     * Builds page-level payloads plus a snapshot summary for every eligible node.
     *
     * @param nodes Graph nodes; see `collectAnalyses` for the eligibility rules.
     * @returns `payloadsByUrl` (a Map keyed by node URL) and the aggregated `summary`.
     */
    evaluateNodes(nodes) {
        const analyzedPages = this.collectAnalyses(nodes);
        // Fills in `duplicateRisk` on every section in place.
        enrichDuplicateRisk(analyzedPages);
        const exactH1Buckets = this.buildBuckets(analyzedPages, (page) => page.h1Norm);
        const h2SetBuckets = this.buildBuckets(analyzedPages, (page) => page.h2SetHash);
        const patternBuckets = this.buildBuckets(analyzedPages, (page) => page.patternHash);
        const similarH1GroupSizes = this.computeSimilarH1GroupSizes(analyzedPages);
        const payloadsByUrl = new Map();
        let totalScore = 0;
        let totalMissing = 0;
        let totalMultiple = 0;
        let totalSkips = 0;
        let totalReverseJumps = 0;
        let totalThinSections = 0;
        let totalEntropy = 0;
        let poorPages = 0;
        for (const page of analyzedPages) {
            // Pages without an H1 report an H1 group size of 0.
            const duplicateH1GroupSize = page.h1Norm ? exactH1Buckets.get(page.h1Norm)?.length || 1 : 0;
            const similarH1GroupSize = similarH1GroupSizes.get(page.url) || 0;
            const identicalH2SetGroupSize = h2SetBuckets.get(page.h2SetHash)?.length || 1;
            const duplicatePatternGroupSize = patternBuckets.get(page.patternHash)?.length || 1;
            const templateRisk = this.computeTemplateRisk(similarH1GroupSize, identicalH2SetGroupSize, duplicatePatternGroupSize);
            const thinSectionCount = page.sections.filter((section) => section.thin).length;
            const health = this.scoreHealth({
                metrics: page.metrics,
                thinSectionCount,
                duplicateH1GroupSize,
                similarH1GroupSize,
                identicalH2SetGroupSize,
                duplicatePatternGroupSize,
                templateRisk,
                issues: page.issues
            });
            if (health.status === 'Poor') {
                poorPages += 1;
            }
            totalScore += health.score;
            totalMissing += page.metrics.missingH1;
            totalMultiple += page.metrics.multipleH1;
            totalSkips += page.metrics.hierarchySkips;
            totalReverseJumps += page.metrics.reverseJumps;
            totalThinSections += thinSectionCount;
            totalEntropy += page.metrics.entropy;
            payloadsByUrl.set(page.url, {
                score: health.score,
                status: health.status,
                issues: health.issues,
                map: page.headingNodes,
                missing_h1: page.metrics.missingH1,
                multiple_h1: page.metrics.multipleH1,
                entropy: page.metrics.entropy,
                max_depth: page.metrics.maxDepth,
                avg_depth: page.metrics.avgDepth,
                heading_density: page.metrics.headingDensity,
                fragmentation: page.metrics.fragmentation,
                volatility: page.metrics.levelVolatility,
                hierarchy_skips: page.metrics.hierarchySkips,
                reverse_jumps: page.metrics.reverseJumps,
                thin_sections: thinSectionCount,
                duplicate_h1_group: duplicateH1GroupSize,
                similar_h1_group: similarH1GroupSize,
                identical_h2_set_group: identicalH2SetGroupSize,
                duplicate_pattern_group: duplicatePatternGroupSize,
                template_risk: templateRisk
            });
        }
        const evaluatedPages = analyzedPages.length;
        const summary = {
            avgScore: evaluatedPages ? Math.round(totalScore / evaluatedPages) : 0,
            evaluatedPages,
            totalMissing,
            totalMultiple,
            totalSkips,
            totalReverseJumps,
            totalThinSections,
            avgEntropy: evaluatedPages ? Number((totalEntropy / evaluatedPages).toFixed(3)) : 0,
            poorPages
        };
        return { payloadsByUrl, summary };
    }
    /**
     * Runs the per-page analysis for every eligible node.
     *
     * A node is skipped when its `status` is below 200 or at least 300, or when
     * it has no `html` or no `url`. The node's `title` (or `rawTitle`) is passed
     * as the fallback title, and the node URL is written onto the analysis.
     */
    collectAnalyses(nodes) {
        const analyses = [];
        for (const node of nodes) {
            if (node.status < 200 || node.status >= 300 || !node.html || !node.url) {
                continue;
            }
            const analysis = analyzeHeadingHealth(node.html, node.title || node.rawTitle);
            analysis.url = node.url;
            analyses.push(analysis);
        }
        return analyses;
    }
    /**
     * Groups page URLs by the key returned from `selector`.
     *
     * Pages whose key is falsy (for example an empty H1) are left out.
     *
     * @returns Map from key to the URLs of the pages sharing it, in input order.
     */
    buildBuckets(pages, selector) {
        const buckets = new Map();
        for (const page of pages) {
            const key = selector(page);
            if (!key) {
                continue;
            }
            const bucket = buckets.get(key);
            if (bucket) {
                // Append in place rather than re-copying the array on every insert.
                bucket.push(page.url);
            }
            else {
                buckets.set(key, [page.url]);
            }
        }
        return buckets;
    }
    /**
     * For every page, counts the distinct normalized H1 strings (its own
     * included) whose Jaccard similarity to the page's H1 is at least 0.7.
     *
     * @returns Map from page URL to that count; 0 for pages without an H1.
     */
    computeSimilarH1GroupSizes(pages) {
        const uniqueH1 = Array.from(new Set(pages.map((page) => page.h1Norm).filter(Boolean)));
        const similarBuckets = new Map();
        for (const h1 of uniqueH1) {
            similarBuckets.set(h1, new Set([h1]));
        }
        // Pairwise comparison over distinct H1 strings; similarity is symmetric.
        for (let i = 0; i < uniqueH1.length; i += 1) {
            for (let j = i + 1; j < uniqueH1.length; j += 1) {
                const a = uniqueH1[i];
                const b = uniqueH1[j];
                if (jaccardSimilarity(a, b) >= 0.7) {
                    similarBuckets.get(a)?.add(b);
                    similarBuckets.get(b)?.add(a);
                }
            }
        }
        const groupSizes = new Map();
        for (const page of pages) {
            groupSizes.set(page.url, similarBuckets.get(page.h1Norm)?.size || (page.h1Norm ? 1 : 0));
        }
        return groupSizes;
    }
    /**
     * Turns a page's structural metrics and duplication signals into a score.
     *
     * The score starts at 100, loses points per signal and is clamped to
     * 0–100. Status is `Healthy` from 80, `Moderate` from 55, otherwise `Poor`.
     *
     * @returns `{ score, status, issues }`, with `issues` passed through unchanged.
     */
    scoreHealth(input) {
        let score = 100;
        const metrics = input.metrics;
        if (metrics.missingH1)
            score -= 20;
        if (metrics.multipleH1)
            score -= 6;
        score -= metrics.hierarchySkips * 8;
        score -= metrics.reverseJumps * 6;
        score -= Math.round(metrics.entropy * 7);
        score -= Math.round(metrics.fragmentation * 20);
        score -= Math.round(metrics.levelVolatility * 6);
        score -= input.thinSectionCount * 4;
        // Duplication penalties grow with group size but are capped per signal.
        if (input.duplicateH1GroupSize > 1)
            score -= Math.min(16, (input.duplicateH1GroupSize - 1) * 3);
        if (input.similarH1GroupSize > 1)
            score -= Math.min(8, (input.similarH1GroupSize - 1) * 2);
        if (input.identicalH2SetGroupSize > 1)
            score -= Math.min(10, (input.identicalH2SetGroupSize - 1) * 2);
        if (input.duplicatePatternGroupSize > 1)
            score -= Math.min(12, (input.duplicatePatternGroupSize - 1) * 2);
        score -= Math.round(input.templateRisk * 12);
        score = Math.max(0, Math.min(100, score));
        return {
            score,
            status: score >= 80 ? 'Healthy' : score >= 55 ? 'Moderate' : 'Poor',
            issues: input.issues
        };
    }
    /**
     * Combines the three group sizes into a template-reuse risk in [0, 1],
     * rounded to 3 decimals.
     *
     * Only members beyond the page itself count. Each excess is clamped at 0
     * so that a group size of 0 (a page without an H1) cannot contribute a
     * negative term that cancels the other signals.
     */
    computeTemplateRisk(similar, h2set, pattern) {
        const excess = (size) => Math.max(0, size - 1);
        const raw = (excess(similar) * 0.15) + (excess(h2set) * 0.2) + (excess(pattern) * 0.2);
        return Number(Math.max(0, Math.min(1, raw)).toFixed(3));
    }
}
|
package/dist/analysis/images.js
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import { load } from 'cheerio';
|
|
2
|
-
export function analyzeImageAlts(
|
|
3
|
-
const
|
|
2
|
+
export function analyzeImageAlts($) {
|
|
3
|
+
const isString = typeof $ === 'string';
|
|
4
|
+
const cheerioObj = isString ? load($ || '<html></html>') : $;
|
|
4
5
|
let missingAlt = 0;
|
|
5
6
|
let emptyAlt = 0;
|
|
6
|
-
|
|
7
|
-
const alt =
|
|
7
|
+
cheerioObj('img').each((_idx, el) => {
|
|
8
|
+
const alt = cheerioObj(el).attr('alt');
|
|
8
9
|
if (alt === undefined) {
|
|
9
10
|
missingAlt += 1;
|
|
10
11
|
return;
|
|
@@ -13,6 +14,6 @@ export function analyzeImageAlts(html) {
|
|
|
13
14
|
emptyAlt += 1;
|
|
14
15
|
}
|
|
15
16
|
});
|
|
16
|
-
const totalImages =
|
|
17
|
+
const totalImages = cheerioObj('img').length;
|
|
17
18
|
return { totalImages, missingAlt, emptyAlt };
|
|
18
19
|
}
|
package/dist/analysis/links.d.ts
CHANGED
|
@@ -4,4 +4,4 @@ export interface LinkRatioAnalysis {
|
|
|
4
4
|
nofollowCount: number;
|
|
5
5
|
externalRatio: number;
|
|
6
6
|
}
|
|
7
|
-
export declare function analyzeLinks(
|
|
7
|
+
export declare function analyzeLinks($: any, pageUrl: string, rootUrl: string): LinkRatioAnalysis;
|
package/dist/analysis/links.js
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
import { load } from 'cheerio';
|
|
2
|
-
import { normalizeUrl } from '../crawler/normalize.js';
|
|
3
|
-
export function analyzeLinks(
|
|
4
|
-
const
|
|
5
|
-
const
|
|
2
|
+
import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
|
|
3
|
+
export function analyzeLinks($, pageUrl, rootUrl) {
|
|
4
|
+
const isString = typeof $ === 'string';
|
|
5
|
+
const cheerioObj = isString ? load($ || '<html></html>') : $;
|
|
6
6
|
let internalLinks = 0;
|
|
7
7
|
let externalLinks = 0;
|
|
8
8
|
let nofollowCount = 0;
|
|
9
|
-
|
|
10
|
-
const href =
|
|
9
|
+
cheerioObj('a[href]').each((_idx, el) => {
|
|
10
|
+
const href = cheerioObj(el).attr('href');
|
|
11
11
|
if (!href)
|
|
12
12
|
return;
|
|
13
13
|
const normalized = normalizeUrl(href, pageUrl, { stripQuery: false });
|
|
14
14
|
if (!normalized)
|
|
15
15
|
return;
|
|
16
|
-
const rel = (
|
|
16
|
+
const rel = (cheerioObj(el).attr('rel') || '').toLowerCase();
|
|
17
17
|
if (rel.includes('nofollow')) {
|
|
18
18
|
nofollowCount += 1;
|
|
19
19
|
}
|
|
20
|
-
if (
|
|
20
|
+
if (UrlUtil.isInternal(normalized, rootUrl)) {
|
|
21
21
|
internalLinks += 1;
|
|
22
22
|
}
|
|
23
23
|
else {
|
|
@@ -1,26 +1,15 @@
|
|
|
1
|
-
|
|
2
|
-
export
|
|
3
|
-
export interface SitegraphNode {
|
|
4
|
-
url: string;
|
|
5
|
-
depth: number;
|
|
6
|
-
inLinks: number;
|
|
7
|
-
outLinks: number;
|
|
8
|
-
status: number;
|
|
9
|
-
discoveredViaSitemap?: boolean;
|
|
10
|
-
robotsExcluded?: boolean;
|
|
11
|
-
canonicalUrl?: string;
|
|
12
|
-
isHomepage?: boolean;
|
|
13
|
-
wordCount?: number;
|
|
14
|
-
hasStructuredData?: boolean;
|
|
1
|
+
import type { GraphNode, GraphEdge } from '../graph/graph.js';
|
|
2
|
+
export interface ExtendedGraphNode extends GraphNode {
|
|
15
3
|
pageType?: string;
|
|
16
|
-
|
|
17
|
-
duplicateContent?: boolean;
|
|
4
|
+
hasStructuredData?: boolean;
|
|
18
5
|
isProductOrCommercial?: boolean;
|
|
6
|
+
duplicateContent?: boolean;
|
|
7
|
+
isHomepage?: boolean;
|
|
8
|
+
robotsExcluded?: boolean;
|
|
9
|
+
discoveredViaSitemap?: boolean;
|
|
19
10
|
}
|
|
20
|
-
export
|
|
21
|
-
|
|
22
|
-
target: string;
|
|
23
|
-
}
|
|
11
|
+
export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
|
|
12
|
+
export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
|
|
24
13
|
export interface OrphanScoringOptions {
|
|
25
14
|
enabled: boolean;
|
|
26
15
|
severityEnabled: boolean;
|
|
@@ -28,12 +17,12 @@ export interface OrphanScoringOptions {
|
|
|
28
17
|
minInbound: number;
|
|
29
18
|
rootUrl?: string;
|
|
30
19
|
}
|
|
31
|
-
export type AnnotatedNode =
|
|
20
|
+
export type AnnotatedNode = GraphNode & {
|
|
32
21
|
orphan: boolean;
|
|
33
22
|
orphanType?: OrphanType;
|
|
34
23
|
orphanSeverity?: number;
|
|
35
24
|
impactLevel?: ImpactLevel;
|
|
36
25
|
};
|
|
37
26
|
export declare function mapImpactLevel(score: number): ImpactLevel;
|
|
38
|
-
export declare function calculateOrphanSeverity(orphanType: OrphanType, node:
|
|
39
|
-
export declare function annotateOrphans(nodes:
|
|
27
|
+
export declare function calculateOrphanSeverity(orphanType: OrphanType, node: ExtendedGraphNode): number;
|
|
28
|
+
export declare function annotateOrphans(nodes: ExtendedGraphNode[], edges: GraphEdge[], options: OrphanScoringOptions): AnnotatedNode[];
|
|
@@ -33,7 +33,8 @@ export function calculateOrphanSeverity(orphanType, node) {
|
|
|
33
33
|
score = 90;
|
|
34
34
|
break;
|
|
35
35
|
case 'crawl-only':
|
|
36
|
-
|
|
36
|
+
// Sitemap-only URLs are less severe if we haven't even tried to crawl them yet.
|
|
37
|
+
score = node.status === 0 ? 50 : 70;
|
|
37
38
|
break;
|
|
38
39
|
case 'near':
|
|
39
40
|
score = node.inLinks <= 1 ? 70 : 60;
|
|
@@ -64,12 +65,17 @@ export function calculateOrphanSeverity(orphanType, node) {
|
|
|
64
65
|
negativeModifier = Math.min(20, negativeModifier);
|
|
65
66
|
score += positiveModifier;
|
|
66
67
|
score -= negativeModifier;
|
|
68
|
+
// Safety: unvisited nodes should not be flagged as high-severity orphans
|
|
69
|
+
// because we haven't confirmed they are real pages or fully explored their context.
|
|
70
|
+
if (node.status === 0) {
|
|
71
|
+
score = Math.min(score, 60);
|
|
72
|
+
}
|
|
67
73
|
return clampScore(score);
|
|
68
74
|
}
|
|
69
75
|
function consolidateInboundByCanonical(nodes) {
|
|
70
76
|
const canonicalInbound = new Map();
|
|
71
77
|
for (const node of nodes) {
|
|
72
|
-
const canonical = node.
|
|
78
|
+
const canonical = node.canonical || node.url;
|
|
73
79
|
canonicalInbound.set(canonical, (canonicalInbound.get(canonical) || 0) + node.inLinks);
|
|
74
80
|
}
|
|
75
81
|
return canonicalInbound;
|
|
@@ -85,7 +91,7 @@ export function annotateOrphans(nodes, edges, options) {
|
|
|
85
91
|
if (isHomepage || node.robotsExcluded) {
|
|
86
92
|
return { ...node, orphan: false };
|
|
87
93
|
}
|
|
88
|
-
const canonical = node.
|
|
94
|
+
const canonical = node.canonical || node.url;
|
|
89
95
|
const inbound = canonicalInbound.get(canonical) || 0;
|
|
90
96
|
let orphanType;
|
|
91
97
|
if (inbound === 0) {
|
package/dist/analysis/scoring.js
CHANGED
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
export function scorePageSeo(page) {
|
|
2
|
+
if (page.meta.crawlStatus === 'blocked_by_robots') {
|
|
3
|
+
return 0;
|
|
4
|
+
}
|
|
2
5
|
const titleMeta = (scoreTextStatus(page.title.status) + scoreTextStatus(page.metaDescription.status)) / 2;
|
|
3
|
-
|
|
6
|
+
let h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
|
|
7
|
+
if (page.headingScore !== undefined && page.headingScore !== null) {
|
|
8
|
+
h1 = page.headingScore;
|
|
9
|
+
}
|
|
4
10
|
const wordQuality = Math.min(100, (page.content.wordCount / 600) * 100) * 0.7 + Math.min(100, page.content.textHtmlRatio * 500) * 0.3;
|
|
5
11
|
const thin = 100 - page.thinScore;
|
|
6
12
|
const imageDen = Math.max(1, page.images.totalImages);
|
|
@@ -33,7 +39,10 @@ export function aggregateSiteScore(metrics, pages) {
|
|
|
33
39
|
const entropyScore = Math.max(0, 100 - Math.abs(metrics.structuralEntropy - 2) * 25);
|
|
34
40
|
const orphanPenalty = metrics.totalPages === 0 ? 0 : (metrics.orphanPages.length / metrics.totalPages) * 100;
|
|
35
41
|
const authorityEntropyOrphanScore = Math.max(0, Math.min(100, (avgAuthority * 100 * 0.4) + (entropyScore * 0.35) + ((100 - orphanPenalty) * 0.25)));
|
|
36
|
-
|
|
42
|
+
let overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
|
|
43
|
+
if (pages.some(p => p.meta.crawlStatus === 'blocked_by_robots')) {
|
|
44
|
+
overallScore = 0;
|
|
45
|
+
}
|
|
37
46
|
return {
|
|
38
47
|
seoHealthScore: Number(seoHealthScore.toFixed(2)),
|
|
39
48
|
authorityEntropyOrphanScore: Number(authorityEntropyOrphanScore.toFixed(2)),
|
package/dist/analysis/seo.d.ts
CHANGED
|
@@ -8,8 +8,12 @@ export interface H1Analysis {
|
|
|
8
8
|
count: number;
|
|
9
9
|
status: 'ok' | 'critical' | 'warning';
|
|
10
10
|
matchesTitle: boolean;
|
|
11
|
+
value: string | null;
|
|
11
12
|
}
|
|
12
|
-
export declare function analyzeTitle(
|
|
13
|
-
export declare function analyzeMetaDescription(
|
|
14
|
-
export declare function
|
|
15
|
-
export declare function
|
|
13
|
+
export declare function analyzeTitle($: any): TextFieldAnalysis;
|
|
14
|
+
export declare function analyzeMetaDescription($: any): TextFieldAnalysis;
|
|
15
|
+
export declare function analyzeH1($: any, titleValue: string | null): H1Analysis;
|
|
16
|
+
export declare function applyDuplicateStatuses<T extends {
|
|
17
|
+
value: string | null;
|
|
18
|
+
status: string;
|
|
19
|
+
}>(items: T[]): T[];
|