@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import fs from 'node:fs/promises';
|
|
3
|
+
import chalk from '../utils/chalk.js';
|
|
4
|
+
import { generateHtml, } from './html.js';
|
|
5
|
+
import { renderCrawlMarkdown, renderCrawlCsvNodes, renderCrawlCsvEdges } from './crawlExport.js';
|
|
6
|
+
import { renderAnalysisHtml, renderAnalysisMarkdown, renderAnalysisCsv } from '../analysis/analyze.js';
|
|
7
|
+
/**
 * Normalizes a raw export option into a list of lowercase format names.
 *
 * @param {string|string[]|boolean|null|undefined} exportOption - Raw option value.
 *   `undefined`, `null` and `false` yield no formats; `true` yields the default
 *   `json` format; a string is read as a comma-separated list; an array is
 *   flattened, and each entry may itself be a comma-separated list.
 * @returns {string[]} Trimmed, lowercased, non-empty format names in input order.
 */
export function parseExportFormats(exportOption) {
    // `== null` matches both null and undefined; null used to crash on `.split`.
    if (exportOption == null || exportOption === false) {
        return [];
    }
    if (exportOption === true) {
        return ['json'];
    }
    const rawEntries = Array.isArray(exportOption) ? exportOption : [exportOption];
    return rawEntries
        .flatMap((entry) => entry.split(','))
        .map((name) => name.trim().toLowerCase())
        // Drop empty names produced by stray commas or blank input.
        .filter(Boolean);
}
|
|
14
|
+
/**
 * Writes the crawl artifacts for every requested export format into `outputDir`.
 *
 * Recognized formats: `json`, `html`, `visualize`, `csv`, `markdown`. Does nothing
 * when `formats` is empty; otherwise the output directory is created first
 * (recursively) and each requested format is written in the order listed above.
 *
 * @param {string[]} formats - Requested format names.
 * @param {string} outputDir - Directory that receives the export files.
 * @param {string} url - Passed through to the markdown renderer.
 * @param {object} graphData - Graph payload handed to the JSON/HTML/CSV/markdown renderers.
 * @param {object} metrics - Metrics payload handed to the JSON/HTML/markdown renderers.
 * @param {object} graphObj - Passed through to the markdown renderer.
 * @param {object} [report] - Optional report; written as `report.json`, and its
 *   `plugins` entries are appended to `summary.md` as JSON code fences.
 * @returns {Promise<void>}
 */
export async function runCrawlExports(formats, outputDir, url, graphData, metrics, graphObj, report) {
    if (formats.length === 0) {
        return;
    }

    const wants = (format) => formats.includes(format);
    const inOutputDir = (fileName) => path.join(outputDir, fileName);
    const pretty = (value) => JSON.stringify(value, null, 2);
    const announce = (message) => console.log(chalk.green(message));

    await fs.mkdir(outputDir, { recursive: true });

    if (wants('json')) {
        await fs.writeFile(inOutputDir('graph.json'), pretty(graphData));
        await fs.writeFile(inOutputDir('metrics.json'), pretty(metrics));
        let reportSuffix = '';
        if (report) {
            await fs.writeFile(inOutputDir('report.json'), pretty(report));
            reportSuffix = ', report.json';
        }
        announce(`JSON exports saved to ${outputDir} (graph.json, metrics.json${reportSuffix})`);
    }

    if (wants('html')) {
        const graphHtml = generateHtml(graphData, metrics);
        const graphHtmlPath = inOutputDir('graph.html');
        await fs.writeFile(graphHtmlPath, graphHtml);
        announce(`HTML report saved to ${graphHtmlPath}`);
    }

    if (wants('visualize')) {
        // Same generator as the `html` format, written under a different file name.
        const visualization = generateHtml(graphData, metrics);
        const visualizationPath = inOutputDir('crawl.html');
        await fs.writeFile(visualizationPath, visualization);
        announce(`Visualization saved to ${visualizationPath}`);
    }

    if (wants('csv')) {
        await fs.writeFile(inOutputDir('nodes.csv'), renderCrawlCsvNodes(graphData));
        await fs.writeFile(inOutputDir('edges.csv'), renderCrawlCsvEdges(graphData));
        announce(`CSV exports saved to ${outputDir} (nodes.csv, edges.csv)`);
    }

    if (wants('markdown')) {
        const summary = renderCrawlMarkdown(url, graphData, metrics, graphObj);
        const summaryPath = inOutputDir('summary.md');
        await fs.writeFile(summaryPath, summary);
        announce(`Markdown summary saved to ${summaryPath}`);

        const pluginSections = report && report.plugins ? Object.entries(report.plugins) : [];
        for (const [pluginName, pluginData] of pluginSections) {
            // Plugin output is dumped generically as JSON; no plugin-specific branches here.
            const body = pretty(pluginData);
            const section = `\n## Plugin: ${pluginName}\n\n\`\`\`json\n${body}\n\`\`\`\n`;
            await fs.appendFile(inOutputDir('summary.md'), section);
        }
    }
}
|
|
55
|
+
/**
 * Writes the analysis result in every requested export format into `outputDir`.
 *
 * Recognized formats: `json`, `html`, `markdown`, `csv`. Does nothing when
 * `formats` is empty; otherwise the output directory is created (recursively)
 * before any file is written.
 *
 * @param {string[]} formats - Requested format names.
 * @param {string} outputDir - Directory that receives the export files.
 * @param {object} result - Analysis result handed to the renderers / serialized as JSON.
 * @param {boolean} isLive - When truthy the HTML report is named `page.html`
 *   instead of `analysis.html`; all other file names are unaffected.
 * @returns {Promise<void>}
 */
export async function runAnalysisExports(formats, outputDir, result, isLive) {
    if (formats.length === 0) {
        return;
    }
    await fs.mkdir(outputDir, { recursive: true });

    if (formats.includes('json')) {
        const jsonPath = path.join(outputDir, 'analysis.json');
        await fs.writeFile(jsonPath, JSON.stringify(result, null, 2));
        console.log(chalk.green(`JSON export saved to ${jsonPath}`));
    }

    if (formats.includes('html')) {
        const html = renderAnalysisHtml(result);
        // The HTML report is the only artifact whose name depends on `isLive`.
        const htmlPath = path.join(outputDir, isLive ? 'page.html' : 'analysis.html');
        await fs.writeFile(htmlPath, html, 'utf-8');
        console.log(chalk.green(`HTML report saved to ${htmlPath}`));
    }

    if (formats.includes('markdown')) {
        const markdown = renderAnalysisMarkdown(result);
        // Previously selected via a ternary whose branches were identical.
        const markdownPath = path.join(outputDir, 'analysis.md');
        await fs.writeFile(markdownPath, markdown, 'utf-8');
        console.log(chalk.green(`Markdown report saved to ${markdownPath}`));
    }

    if (formats.includes('csv')) {
        const csv = renderAnalysisCsv(result);
        // Previously selected via a ternary whose branches were identical.
        const csvPath = path.join(outputDir, 'analysis.csv');
        await fs.writeFile(csvPath, csv, 'utf-8');
        console.log(chalk.green(`CSV export saved to ${csvPath}`));
    }
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { Graph, Metrics } from '@crawlith/core';
|
|
2
|
+
/**
 * Condensed crawl summary produced by `buildCrawlInsightReport` and consumed by
 * `renderInsightOutput` / `hasCriticalIssues`.
 */
export interface CrawlInsightReport {
    /** Total page count (taken from `metrics.totalPages`). */
    pages: number;
    /** Pages fetched in this session (`metrics.sessionStats?.pagesFetched`), when known. */
    fetchedPages?: number;
    /** Structural counters shown in the "Structure" section of the rendered output. */
    summary: {
        /** Taken from `metrics.maxDepthFound`. */
        crawlDepth: number;
        /** Taken from `metrics.totalEdges`. */
        internalLinks: number;
        /** Taken from the health issues' `externalLinks` count, defaulting to 0. */
        externalLinks: number;
    };
    /** Health score data, present only when health data was supplied to the builder. */
    health?: {
        score: number;
        status: string;
        weightedPenalties: any;
    };
    /** Issue counters, present only when health data was supplied to the builder. */
    issues?: any;
    /** Authority ranking; each `score` is the page's `authority` value from the metrics. */
    topAuthorityPages: Array<{
        url: string;
        score: number;
    }>;
}
|
|
21
|
+
/**
 * Builds the condensed insight report from crawl metrics and optional health data.
 *
 * @param graph - Crawl graph (accepted by the signature).
 * @param metrics - Crawl metrics supplying page counts, depth, edges and authority pages.
 * @param healthData - Optional health score and issue counters to embed in the report.
 * @returns The assembled report.
 */
export declare function buildCrawlInsightReport(
    graph: Graph,
    metrics: Metrics,
    healthData?: {
        health: any;
        issues: any;
    },
): CrawlInsightReport;
|
|
25
|
+
/**
 * Renders the report as plain text for display.
 *
 * @param report - Report to render.
 * @param snapshotId - Snapshot identifier printed in the header of the output.
 * @returns The rendered text.
 */
export declare function renderInsightOutput(report: CrawlInsightReport, snapshotId: number): string;
|
|
26
|
+
/**
 * Renders the health score's weights and weighted penalties as text.
 *
 * @param health - Health data carrying `weights` and `weightedPenalties`.
 * @returns The rendered breakdown text.
 */
export declare function renderScoreBreakdown(health: any): string;
|
|
27
|
+
/**
 * Reports whether the report's issue counters contain any critical issue.
 *
 * @param report - Report to inspect; a report without `issues` is never critical.
 * @returns `true` when at least one critical counter is above zero.
 */
export declare function hasCriticalIssues(report: CrawlInsightReport): boolean;
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
 * Assembles the condensed insight report from crawl metrics and optional health data.
 *
 * @param {object} graph - Crawl graph; not read by this function.
 * @param {object} metrics - Supplies `totalPages`, `sessionStats`, `maxDepthFound`,
 *   `totalEdges` and `topAuthorityPages`.
 * @param {{health: object, issues: object}} [healthData] - Optional health score and
 *   issue counters; when absent, `health` and `issues` are `undefined` in the result.
 * @returns {object} The report object.
 */
export function buildCrawlInsightReport(graph, metrics, healthData) {
    const issueCounts = healthData?.issues;
    const summary = {
        crawlDepth: metrics.maxDepthFound,
        internalLinks: metrics.totalEdges,
        // Falls back to 0 when no health data (or no external-link count) is available.
        externalLinks: issueCounts?.externalLinks || 0
    };
    // Expose each page's `authority` under the report's `score` key.
    const authorityRanking = metrics.topAuthorityPages.map(({ url, authority }) => ({ url, score: authority }));
    return {
        pages: metrics.totalPages,
        fetchedPages: metrics.sessionStats?.pagesFetched,
        health: healthData?.health,
        issues: issueCounts,
        summary,
        topAuthorityPages: authorityRanking
    };
}
|
|
15
|
+
/**
 * Renders the insight report as plain text.
 *
 * Sections, in order: header with the snapshot id, page counts, health score
 * (if present), "Critical" and "Warnings" bullet lists (only counters above
 * zero, and only when `report.issues` exists), "Structure", and up to ten
 * "Top Authority" pages with scores formatted to three decimals.
 *
 * @param {object} report - Report as produced by `buildCrawlInsightReport`.
 * @param {number} snapshotId - Identifier printed after `# ` in the header.
 * @returns {string} The rendered text, terminated by a newline.
 */
export function renderInsightOutput(report, snapshotId) {
    const { issues, health, summary, topAuthorityPages } = report;

    // [issue counter key, label printed after the count]
    const criticalTable = [
        ['orphanPages', 'orphan pages'],
        ['redirectChains', 'redirect chains'],
        ['brokenInternalLinks', 'broken internal links'],
        ['duplicateClusters', 'near-duplicate clusters'],
        ['canonicalConflicts', 'canonical conflicts'],
        ['accidentalNoindex', 'pages accidentally noindexed'],
        ['blockedByRobots', 'pages blocked by robots.txt'],
    ];
    const warningTable = [
        ['missingH1', 'pages missing H1'],
        ['thinContent', 'thin content pages'],
        ['excessiveInternalLinkCount', 'pages with excessive links'],
        ['imageAltMissing', 'pages missing image alt'],
    ];

    // Builds one titled bullet section; yields nothing when no counter is above zero.
    const bulletSection = (title, table) => {
        const hits = table.filter(([key]) => issues[key] > 0);
        if (hits.length === 0) {
            return [];
        }
        return [title, ...hits.map(([key, label]) => ` • ${issues[key]} ${label}`), ''];
    };

    // The fetched/discovered split is shown only when both numbers are known and differ.
    const sameAsDiscovered = report.fetchedPages === undefined || report.fetchedPages === report.pages;
    const pageLine = sameAsDiscovered
        ? `${report.pages} pages crawled`
        : `${report.fetchedPages} pages fetched / ${report.pages} discovered`;

    const out = ['CRAWLITH — Crawl', '', `# ${snapshotId}`, '', pageLine, ''];

    if (health) {
        out.push(`Score: ${health.score} (${health.status})`, '');
    }

    if (issues) {
        out.push(...bulletSection('Critical', criticalTable));
        out.push(...bulletSection('Warnings', warningTable));
    }

    out.push(
        'Structure',
        ` Depth Reached ${summary.crawlDepth}`,
        ` Internal Links ${summary.internalLinks}`,
        ` External Links ${summary.externalLinks}`,
        '',
    );

    if (topAuthorityPages.length > 0) {
        out.push('Top Authority');
        topAuthorityPages.slice(0, 10).forEach((page) => {
            out.push(` ${page.url} ${page.score.toFixed(3)}`);
        });
        out.push('');
    }

    return `${out.join('\n')}\n`;
}
|
|
85
|
+
/**
 * Renders the health score's weights and weighted penalties as three lines of text.
 *
 * @param {object} health - Health data carrying `weights` and `weightedPenalties`.
 * @returns {string} Title line followed by the JSON-serialized weights and penalties.
 */
export function renderScoreBreakdown(health) {
    const { weights, weightedPenalties } = health;
    const weightsJson = JSON.stringify(weights);
    const penaltiesJson = JSON.stringify(weightedPenalties);
    return `Health Score Breakdown\nweights: ${weightsJson}\npenalties: ${penaltiesJson}`;
}
|
|
92
|
+
/**
 * Tells whether any critical issue counter in the report is above zero.
 *
 * @param {object} report - Report whose optional `issues` counters are inspected.
 * @returns {boolean} `false` when the report has no `issues`; otherwise `true`
 *   if at least one of the critical counters is greater than zero.
 */
export function hasCriticalIssues(report) {
    const counts = report.issues;
    if (!counts) {
        return false;
    }
    const criticalKeys = [
        'orphanPages',
        'brokenInternalLinks',
        'redirectChains',
        'duplicateClusters',
        'canonicalConflicts',
        'accidentalNoindex',
        'blockedByRobots',
    ];
    return criticalKeys.some((key) => counts[key] > 0);
}
|
package/dist/scoring/health.d.ts
CHANGED
|
@@ -1,10 +1,4 @@
|
|
|
1
1
|
import { Graph } from '../graph/graph.js';
|
|
2
|
-
import { Metrics } from '../graph/metrics.js';
|
|
3
|
-
export declare const THIN_CONTENT_THRESHOLD = 300;
|
|
4
|
-
export declare const LOW_INTERNAL_LINK_THRESHOLD = 2;
|
|
5
|
-
export declare const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
|
|
6
|
-
export declare const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
|
|
7
|
-
export declare const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
|
|
8
2
|
export interface HealthScoreWeights {
|
|
9
3
|
orphans: number;
|
|
10
4
|
brokenLinks: number;
|
|
@@ -17,8 +11,8 @@ export interface HealthScoreWeights {
|
|
|
17
11
|
lowInternalLinks: number;
|
|
18
12
|
excessiveLinks: number;
|
|
19
13
|
blockedByRobots: number;
|
|
14
|
+
crawlTraps: number;
|
|
20
15
|
}
|
|
21
|
-
export declare const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights;
|
|
22
16
|
export interface CrawlIssueCounts {
|
|
23
17
|
orphanPages: number;
|
|
24
18
|
brokenInternalLinks: number;
|
|
@@ -38,13 +32,25 @@ export interface CrawlIssueCounts {
|
|
|
38
32
|
underlinkedHighAuthorityPages: number;
|
|
39
33
|
externalLinks: number;
|
|
40
34
|
blockedByRobots: number;
|
|
35
|
+
crawlTraps: number;
|
|
41
36
|
}
|
|
42
37
|
export interface HealthScoreBreakdown {
|
|
43
38
|
score: number;
|
|
44
39
|
status: string;
|
|
45
|
-
weightedPenalties: Record<
|
|
40
|
+
weightedPenalties: Record<string, number>;
|
|
46
41
|
weights: HealthScoreWeights;
|
|
47
42
|
}
|
|
48
|
-
export declare
|
|
49
|
-
export declare
|
|
50
|
-
export declare
|
|
43
|
+
export declare const THIN_CONTENT_THRESHOLD = 200;
|
|
44
|
+
export declare const LOW_INTERNAL_LINK_THRESHOLD = 2;
|
|
45
|
+
export declare const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
|
|
46
|
+
export declare const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
|
|
47
|
+
export declare const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
|
|
48
|
+
export declare const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights;
|
|
49
|
+
export declare class HealthService {
|
|
50
|
+
calculateHealthScore(totalPages: number, issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots' | 'crawlTraps'>, weights?: HealthScoreWeights): HealthScoreBreakdown;
|
|
51
|
+
collectCrawlIssues(graph: Graph, metrics: any, rootOrigin?: string): CrawlIssueCounts;
|
|
52
|
+
private clamp;
|
|
53
|
+
private healthStatusLabel;
|
|
54
|
+
}
|
|
55
|
+
export declare const calculateHealthScore: (totalPages: number, issues: Pick<CrawlIssueCounts, "orphanPages" | "brokenInternalLinks" | "redirectChains" | "duplicateClusters" | "thinContent" | "missingH1" | "accidentalNoindex" | "canonicalConflicts" | "lowInternalLinkCount" | "excessiveInternalLinkCount" | "blockedByRobots" | "crawlTraps">, weights?: HealthScoreWeights) => HealthScoreBreakdown;
|
|
56
|
+
export declare const healthStatusLabel: (score: number, hasCritical?: boolean) => "Needs Attention" | "Excellent" | "Good" | "Critical";
|
package/dist/scoring/health.js
CHANGED
|
@@ -2,7 +2,8 @@ import { analyzeContent } from '../analysis/content.js';
|
|
|
2
2
|
import { analyzeH1 } from '../analysis/seo.js';
|
|
3
3
|
import { analyzeImageAlts } from '../analysis/images.js';
|
|
4
4
|
import { analyzeLinks } from '../analysis/links.js';
|
|
5
|
-
|
|
5
|
+
import { UrlUtil } from '../crawler/normalize.js';
|
|
6
|
+
export const THIN_CONTENT_THRESHOLD = 200;
|
|
6
7
|
export const LOW_INTERNAL_LINK_THRESHOLD = 2;
|
|
7
8
|
export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
|
|
8
9
|
export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
|
|
@@ -18,12 +19,188 @@ export const DEFAULT_HEALTH_WEIGHTS = {
|
|
|
18
19
|
canonicalConflicts: 10,
|
|
19
20
|
lowInternalLinks: 10,
|
|
20
21
|
excessiveLinks: 5,
|
|
21
|
-
blockedByRobots: 100
|
|
22
|
+
blockedByRobots: 100,
|
|
23
|
+
crawlTraps: 50
|
|
22
24
|
};
|
|
23
|
-
|
|
24
|
-
|
|
25
|
+
export class HealthService {
|
|
26
|
+
calculateHealthScore(totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) {
|
|
27
|
+
const safePages = Math.max(totalPages, 1);
|
|
28
|
+
const weightedPenalties = {
|
|
29
|
+
orphans: this.clamp(((issues.orphanPages || 0) / safePages) * weights.orphans, 0, weights.orphans),
|
|
30
|
+
brokenLinks: this.clamp(((issues.brokenInternalLinks || 0) / safePages) * weights.brokenLinks, 0, weights.brokenLinks),
|
|
31
|
+
redirectChains: this.clamp(((issues.redirectChains || 0) / safePages) * weights.redirectChains, 0, weights.redirectChains),
|
|
32
|
+
duplicateClusters: this.clamp(((issues.duplicateClusters || 0) / safePages) * weights.duplicateClusters, 0, weights.duplicateClusters),
|
|
33
|
+
thinContent: this.clamp(((issues.thinContent || 0) / safePages) * weights.thinContent, 0, weights.thinContent),
|
|
34
|
+
missingH1: this.clamp(((issues.missingH1 || 0) / safePages) * weights.missingH1, 0, weights.missingH1),
|
|
35
|
+
noindexMisuse: this.clamp(((issues.accidentalNoindex || 0) / safePages) * weights.noindexMisuse, 0, weights.noindexMisuse),
|
|
36
|
+
canonicalConflicts: this.clamp(((issues.canonicalConflicts || 0) / safePages) * weights.canonicalConflicts, 0, weights.canonicalConflicts),
|
|
37
|
+
lowInternalLinks: this.clamp(((issues.lowInternalLinkCount || 0) / safePages) * weights.lowInternalLinks, 0, weights.lowInternalLinks),
|
|
38
|
+
excessiveLinks: this.clamp(((issues.excessiveInternalLinkCount || 0) / safePages) * weights.excessiveLinks, 0, weights.excessiveLinks),
|
|
39
|
+
blockedByRobots: this.clamp(((issues.blockedByRobots || 0) / safePages) * weights.blockedByRobots, 0, weights.blockedByRobots),
|
|
40
|
+
crawlTraps: this.clamp(((issues.crawlTraps || 0) / safePages) * weights.crawlTraps, 0, weights.crawlTraps)
|
|
41
|
+
};
|
|
42
|
+
const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
|
|
43
|
+
const score = Number(this.clamp(100 - totalPenalty, 0, 100).toFixed(1));
|
|
44
|
+
const hasCritical = ((issues.orphanPages || 0) > 0 ||
|
|
45
|
+
(issues.brokenInternalLinks || 0) > 0 ||
|
|
46
|
+
(issues.redirectChains || 0) > 0 ||
|
|
47
|
+
(issues.duplicateClusters || 0) > 0 ||
|
|
48
|
+
(issues.canonicalConflicts || 0) > 0 ||
|
|
49
|
+
(issues.accidentalNoindex || 0) > 0 ||
|
|
50
|
+
(issues.blockedByRobots || 0) > 0);
|
|
51
|
+
return {
|
|
52
|
+
score,
|
|
53
|
+
status: this.healthStatusLabel(score, hasCritical),
|
|
54
|
+
weightedPenalties,
|
|
55
|
+
weights
|
|
56
|
+
};
|
|
57
|
+
}
|
|
58
|
+
collectCrawlIssues(graph, metrics, rootOrigin = '') {
|
|
59
|
+
const nodes = graph.getNodes();
|
|
60
|
+
let brokenInternalLinks = 0;
|
|
61
|
+
let redirectChains = 0;
|
|
62
|
+
let canonicalConflicts = 0;
|
|
63
|
+
let accidentalNoindex = 0;
|
|
64
|
+
let missingH1 = 0;
|
|
65
|
+
let thinContent = 0;
|
|
66
|
+
let highExternalLinkRatio = 0;
|
|
67
|
+
let imageAltMissing = 0;
|
|
68
|
+
let lowInternalLinkCount = 0;
|
|
69
|
+
let excessiveInternalLinkCount = 0;
|
|
70
|
+
let strongPagesUnderLinking = 0;
|
|
71
|
+
let nearAuthorityThreshold = 0;
|
|
72
|
+
let underlinkedHighAuthorityPages = 0;
|
|
73
|
+
let externalLinks = 0;
|
|
74
|
+
let blockedByRobots = 0;
|
|
75
|
+
let crawlTraps = 0;
|
|
76
|
+
for (const node of nodes) {
|
|
77
|
+
if (!node.isInternal) {
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
|
|
81
|
+
blockedByRobots += 1;
|
|
82
|
+
}
|
|
83
|
+
if (node.crawlTrapFlag) {
|
|
84
|
+
crawlTraps += 1;
|
|
85
|
+
}
|
|
86
|
+
const isConfirmedError = node.status >= 400 || (node.status === 0 && (node.crawlStatus === 'network_error' || node.crawlStatus === 'failed_after_retries' || node.securityError || node.crawlStatus === 'fetched_error'));
|
|
87
|
+
if (isConfirmedError) {
|
|
88
|
+
brokenInternalLinks += 1;
|
|
89
|
+
}
|
|
90
|
+
if (node.brokenLinks) {
|
|
91
|
+
const actualBreaks = node.brokenLinks.filter(url => {
|
|
92
|
+
const target = graph.nodes.get(url);
|
|
93
|
+
return target && (target.status >= 400 || (target.status === 0 && (target.crawlStatus === 'network_error' || target.crawlStatus === 'failed_after_retries' || target.securityError || target.crawlStatus === 'fetched_error')));
|
|
94
|
+
});
|
|
95
|
+
brokenInternalLinks += actualBreaks.length;
|
|
96
|
+
}
|
|
97
|
+
if ((node.redirectChain?.length || 0) > 1) {
|
|
98
|
+
redirectChains += 1;
|
|
99
|
+
}
|
|
100
|
+
const absoluteUrl = rootOrigin ? (node.url.startsWith('http') ? node.url : new URL(node.url, rootOrigin).toString()) : node.url;
|
|
101
|
+
if (node.canonical && node.canonical !== node.url && node.canonical !== absoluteUrl) {
|
|
102
|
+
// Final check: normalize both to ignore trailing slash differences or protocol mismatches if they are considered "same"
|
|
103
|
+
const normCanonical = node.canonical.replace(/\/$/, '');
|
|
104
|
+
const normAbsolute = absoluteUrl.replace(/\/$/, '');
|
|
105
|
+
if (normCanonical !== normAbsolute) {
|
|
106
|
+
canonicalConflicts += 1;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
if (node.noindex && node.status >= 200 && node.status < 300) {
|
|
110
|
+
accidentalNoindex += 1;
|
|
111
|
+
}
|
|
112
|
+
if (node.inLinks === 1 && node.depth > 0) {
|
|
113
|
+
lowInternalLinkCount += 1;
|
|
114
|
+
}
|
|
115
|
+
if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
|
|
116
|
+
excessiveInternalLinkCount += 1;
|
|
117
|
+
}
|
|
118
|
+
if (!node.html) {
|
|
119
|
+
continue;
|
|
120
|
+
}
|
|
121
|
+
const h1Res = analyzeH1(node.html, '');
|
|
122
|
+
if (h1Res.count === 0) {
|
|
123
|
+
missingH1 += 1;
|
|
124
|
+
}
|
|
125
|
+
if (node.wordCount != null) {
|
|
126
|
+
if (node.wordCount < THIN_CONTENT_THRESHOLD) {
|
|
127
|
+
thinContent += 1;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
else if (node.html) {
|
|
131
|
+
const content = analyzeContent(node.html);
|
|
132
|
+
if (content.wordCount < THIN_CONTENT_THRESHOLD) {
|
|
133
|
+
thinContent += 1;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
const pageAbsUrl = rootOrigin ? UrlUtil.toAbsolute(node.url, rootOrigin) : node.url;
|
|
137
|
+
const links = analyzeLinks(node.html || '', pageAbsUrl, rootOrigin || node.url);
|
|
138
|
+
externalLinks += links.externalLinks;
|
|
139
|
+
if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
|
|
140
|
+
highExternalLinkRatio += 1;
|
|
141
|
+
}
|
|
142
|
+
if (node.html) {
|
|
143
|
+
const imageAlt = analyzeImageAlts(node.html);
|
|
144
|
+
if (imageAlt.missingAlt > 0) {
|
|
145
|
+
imageAltMissing += 1;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
const clusters = graph.contentClusters || metrics.clusters || [];
|
|
150
|
+
const duplicateClusters = clusters.length;
|
|
151
|
+
const cannibalizationClusters = clusters.filter((cluster) => cluster.risk === 'high' || cluster.type === 'near').length;
|
|
152
|
+
for (const node of nodes) {
|
|
153
|
+
const authority = node.inLinks > 5 ? 0.8 : 0.2;
|
|
154
|
+
if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
|
|
155
|
+
strongPagesUnderLinking += 1;
|
|
156
|
+
}
|
|
157
|
+
if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
|
|
158
|
+
nearAuthorityThreshold += 1;
|
|
159
|
+
}
|
|
160
|
+
if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
|
|
161
|
+
underlinkedHighAuthorityPages += 1;
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
return {
|
|
165
|
+
orphanPages: metrics.orphanPages?.length || 0,
|
|
166
|
+
brokenInternalLinks,
|
|
167
|
+
redirectChains,
|
|
168
|
+
duplicateClusters,
|
|
169
|
+
canonicalConflicts,
|
|
170
|
+
accidentalNoindex,
|
|
171
|
+
missingH1,
|
|
172
|
+
thinContent,
|
|
173
|
+
lowInternalLinkCount,
|
|
174
|
+
excessiveInternalLinkCount,
|
|
175
|
+
highExternalLinkRatio,
|
|
176
|
+
imageAltMissing,
|
|
177
|
+
strongPagesUnderLinking,
|
|
178
|
+
cannibalizationClusters,
|
|
179
|
+
nearAuthorityThreshold,
|
|
180
|
+
underlinkedHighAuthorityPages,
|
|
181
|
+
externalLinks,
|
|
182
|
+
blockedByRobots,
|
|
183
|
+
crawlTraps
|
|
184
|
+
};
|
|
185
|
+
}
|
|
186
|
+
clamp(value, min, max) {
|
|
187
|
+
return Math.min(max, Math.max(min, value));
|
|
188
|
+
}
|
|
189
|
+
healthStatusLabel(score, hasCritical = false) {
|
|
190
|
+
if (hasCritical && score >= 75)
|
|
191
|
+
return 'Needs Attention';
|
|
192
|
+
if (score >= 90)
|
|
193
|
+
return 'Excellent';
|
|
194
|
+
if (score >= 75)
|
|
195
|
+
return 'Good';
|
|
196
|
+
if (score >= 50)
|
|
197
|
+
return 'Needs Attention';
|
|
198
|
+
return 'Critical';
|
|
199
|
+
}
|
|
25
200
|
}
|
|
26
|
-
|
|
201
|
+
// Single module-level HealthService instance that backs the function-style export below.
const service = new HealthService();

/**
 * Function-style entry point that forwards to `HealthService#calculateHealthScore`
 * on the shared module-level instance.
 *
 * @param {number} totalPages - Forwarded unchanged to the service.
 * @param {object} issues - Forwarded unchanged to the service.
 * @param {object} [weights=DEFAULT_HEALTH_WEIGHTS] - Forwarded to the service; defaults to the module's default weights.
 * @returns {*} Whatever the service method returns.
 */
export const calculateHealthScore = (totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) => {
    return service.calculateHealthScore(totalPages, issues, weights);
};
|
|
203
|
+
export const healthStatusLabel = (score, hasCritical = false) => {
|
|
27
204
|
if (hasCritical && score >= 75)
|
|
28
205
|
return 'Needs Attention';
|
|
29
206
|
if (score >= 90)
|
|
@@ -33,138 +210,4 @@ export function healthStatusLabel(score, hasCritical = false) {
|
|
|
33
210
|
if (score >= 50)
|
|
34
211
|
return 'Needs Attention';
|
|
35
212
|
return 'Critical';
|
|
36
|
-
}
|
|
37
|
-
export function calculateHealthScore(totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) {
|
|
38
|
-
const safePages = Math.max(totalPages, 1);
|
|
39
|
-
const weightedPenalties = {
|
|
40
|
-
orphans: clamp((issues.orphanPages / safePages) * weights.orphans, 0, weights.orphans),
|
|
41
|
-
brokenLinks: clamp((issues.brokenInternalLinks / safePages) * weights.brokenLinks, 0, weights.brokenLinks),
|
|
42
|
-
redirectChains: clamp((issues.redirectChains / safePages) * weights.redirectChains, 0, weights.redirectChains),
|
|
43
|
-
duplicateClusters: clamp((issues.duplicateClusters / safePages) * weights.duplicateClusters, 0, weights.duplicateClusters),
|
|
44
|
-
thinContent: clamp((issues.thinContent / safePages) * weights.thinContent, 0, weights.thinContent),
|
|
45
|
-
missingH1: clamp((issues.missingH1 / safePages) * weights.missingH1, 0, weights.missingH1),
|
|
46
|
-
noindexMisuse: clamp((issues.accidentalNoindex / safePages) * weights.noindexMisuse, 0, weights.noindexMisuse),
|
|
47
|
-
canonicalConflicts: clamp((issues.canonicalConflicts / safePages) * weights.canonicalConflicts, 0, weights.canonicalConflicts),
|
|
48
|
-
lowInternalLinks: clamp((issues.lowInternalLinkCount / safePages) * weights.lowInternalLinks, 0, weights.lowInternalLinks),
|
|
49
|
-
excessiveLinks: clamp((issues.excessiveInternalLinkCount / safePages) * weights.excessiveLinks, 0, weights.excessiveLinks),
|
|
50
|
-
blockedByRobots: clamp((issues.blockedByRobots / safePages) * weights.blockedByRobots, 0, weights.blockedByRobots)
|
|
51
|
-
};
|
|
52
|
-
const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
|
|
53
|
-
const score = Number(clamp(100 - totalPenalty, 0, 100).toFixed(1));
|
|
54
|
-
const hasCritical = (issues.orphanPages > 0 ||
|
|
55
|
-
issues.brokenInternalLinks > 0 ||
|
|
56
|
-
issues.redirectChains > 0 ||
|
|
57
|
-
issues.duplicateClusters > 0 ||
|
|
58
|
-
issues.canonicalConflicts > 0 ||
|
|
59
|
-
issues.accidentalNoindex > 0 ||
|
|
60
|
-
issues.blockedByRobots > 0);
|
|
61
|
-
return {
|
|
62
|
-
score,
|
|
63
|
-
status: healthStatusLabel(score, hasCritical),
|
|
64
|
-
weightedPenalties,
|
|
65
|
-
weights
|
|
66
|
-
};
|
|
67
|
-
}
|
|
68
|
-
export function collectCrawlIssues(graph, metrics) {
|
|
69
|
-
const nodes = graph.getNodes();
|
|
70
|
-
let brokenInternalLinks = 0;
|
|
71
|
-
let redirectChains = 0;
|
|
72
|
-
let canonicalConflicts = 0;
|
|
73
|
-
let accidentalNoindex = 0;
|
|
74
|
-
let missingH1 = 0;
|
|
75
|
-
let thinContent = 0;
|
|
76
|
-
let highExternalLinkRatio = 0;
|
|
77
|
-
let imageAltMissing = 0;
|
|
78
|
-
let lowInternalLinkCount = 0;
|
|
79
|
-
let excessiveInternalLinkCount = 0;
|
|
80
|
-
let strongPagesUnderLinking = 0;
|
|
81
|
-
let nearAuthorityThreshold = 0;
|
|
82
|
-
let underlinkedHighAuthorityPages = 0;
|
|
83
|
-
let externalLinks = 0;
|
|
84
|
-
let blockedByRobots = 0;
|
|
85
|
-
for (const node of nodes) {
|
|
86
|
-
if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
|
|
87
|
-
blockedByRobots += 1;
|
|
88
|
-
}
|
|
89
|
-
const isConfirmedError = node.status >= 400 || (node.status === 0 && (node.crawlStatus === 'network_error' || node.crawlStatus === 'failed_after_retries' || node.securityError || node.crawlStatus === 'fetched_error'));
|
|
90
|
-
if (isConfirmedError) {
|
|
91
|
-
brokenInternalLinks += 1;
|
|
92
|
-
}
|
|
93
|
-
if (node.brokenLinks) {
|
|
94
|
-
const actualBreaks = node.brokenLinks.filter(url => {
|
|
95
|
-
const target = graph.nodes.get(url);
|
|
96
|
-
return target && (target.status >= 400 || (target.status === 0 && (target.crawlStatus === 'network_error' || target.crawlStatus === 'failed_after_retries' || target.securityError || target.crawlStatus === 'fetched_error')));
|
|
97
|
-
});
|
|
98
|
-
brokenInternalLinks += actualBreaks.length;
|
|
99
|
-
}
|
|
100
|
-
if ((node.redirectChain?.length || 0) > 1) {
|
|
101
|
-
redirectChains += 1;
|
|
102
|
-
}
|
|
103
|
-
if (node.canonical && node.canonical !== node.url) {
|
|
104
|
-
canonicalConflicts += 1;
|
|
105
|
-
}
|
|
106
|
-
if (node.noindex && node.status >= 200 && node.status < 300) {
|
|
107
|
-
accidentalNoindex += 1;
|
|
108
|
-
}
|
|
109
|
-
if (node.inLinks < LOW_INTERNAL_LINK_THRESHOLD && node.depth > 0) {
|
|
110
|
-
lowInternalLinkCount += 1;
|
|
111
|
-
}
|
|
112
|
-
if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
|
|
113
|
-
excessiveInternalLinkCount += 1;
|
|
114
|
-
}
|
|
115
|
-
if (!node.html) {
|
|
116
|
-
continue;
|
|
117
|
-
}
|
|
118
|
-
const h1 = analyzeH1(node.html, '');
|
|
119
|
-
if (h1.count === 0) {
|
|
120
|
-
missingH1 += 1;
|
|
121
|
-
}
|
|
122
|
-
const content = analyzeContent(node.html);
|
|
123
|
-
if (content.wordCount < THIN_CONTENT_THRESHOLD) {
|
|
124
|
-
thinContent += 1;
|
|
125
|
-
}
|
|
126
|
-
const links = analyzeLinks(node.html, node.url, node.url);
|
|
127
|
-
externalLinks += links.externalLinks;
|
|
128
|
-
if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
|
|
129
|
-
highExternalLinkRatio += 1;
|
|
130
|
-
}
|
|
131
|
-
const imageAlt = analyzeImageAlts(node.html);
|
|
132
|
-
if (imageAlt.missingAlt > 0) {
|
|
133
|
-
imageAltMissing += 1;
|
|
134
|
-
}
|
|
135
|
-
}
|
|
136
|
-
const duplicateClusters = graph.duplicateClusters?.length || 0;
|
|
137
|
-
const cannibalizationClusters = graph.duplicateClusters?.filter((cluster) => cluster.type === 'near').length || 0;
|
|
138
|
-
for (const node of nodes) {
|
|
139
|
-
const authority = node.pageRank || 0;
|
|
140
|
-
if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
|
|
141
|
-
strongPagesUnderLinking += 1;
|
|
142
|
-
}
|
|
143
|
-
if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
|
|
144
|
-
nearAuthorityThreshold += 1;
|
|
145
|
-
}
|
|
146
|
-
if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
|
|
147
|
-
underlinkedHighAuthorityPages += 1;
|
|
148
|
-
}
|
|
149
|
-
}
|
|
150
|
-
return {
|
|
151
|
-
orphanPages: metrics.orphanPages.length,
|
|
152
|
-
brokenInternalLinks,
|
|
153
|
-
redirectChains,
|
|
154
|
-
duplicateClusters,
|
|
155
|
-
canonicalConflicts,
|
|
156
|
-
accidentalNoindex,
|
|
157
|
-
missingH1,
|
|
158
|
-
thinContent,
|
|
159
|
-
lowInternalLinkCount,
|
|
160
|
-
excessiveInternalLinkCount,
|
|
161
|
-
highExternalLinkRatio,
|
|
162
|
-
imageAltMissing,
|
|
163
|
-
strongPagesUnderLinking,
|
|
164
|
-
cannibalizationClusters,
|
|
165
|
-
nearAuthorityThreshold,
|
|
166
|
-
underlinkedHighAuthorityPages,
|
|
167
|
-
externalLinks,
|
|
168
|
-
blockedByRobots
|
|
169
|
-
};
|
|
170
|
-
}
|
|
213
|
+
};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { styleText } from 'node:util';
|
|
2
|
+
// Alternate spellings accepted as property names, mapped to the style name passed on to styleText.
const alias = { grey: 'gray' };

// Root formatter with no styles applied; each property access derives a styled formatter from it.
const chalk = createChalk([]);
|
|
6
|
+
/**
 * Builds a chainable formatter: calling it styles its first argument with the
 * accumulated styles, and reading any string-named property returns a new
 * formatter with that property name (after alias resolution) appended.
 *
 * @param {string[]} styles - Style names accumulated so far.
 * @returns {Function} Proxy-wrapped formatter function.
 */
function createChalk(styles) {
    const formatter = (text) => applyStyles(styles, text);
    return new Proxy(formatter, {
        apply(_target, _thisArg, args) {
            // Only the first argument is styled; extra arguments are ignored.
            return applyStyles(styles, args[0]);
        },
        get(_target, prop) {
            // Symbol-keyed lookups (e.g. from inspection) are not style names.
            if (typeof prop !== 'string') {
                return undefined;
            }
            // Own-property check: a plain `alias[prop]` lookup would also match
            // inherited Object.prototype members such as `toString` or
            // `constructor`, putting a function in the style list instead of a name.
            const style = Object.hasOwn(alias, prop) ? alias[prop] : prop;
            return createChalk([...styles, style]);
        }
    });
}
|
|
21
|
+
function applyStyles(styles, text) {
|
|
22
|
+
const value = String(text ?? '');
|
|
23
|
+
if (styles.length === 0 || !isColorEnabled()) {
|
|
24
|
+
return value;
|
|
25
|
+
}
|
|
26
|
+
return styleText(styles, value);
|
|
27
|
+
}
|
|
28
|
+
/**
 * Decides whether ANSI styling should be emitted.
 *
 * Checks are applied in this order:
 *  1. A non-empty NO_COLOR, or any NODE_DISABLE_COLORS, disables colour.
 *  2. FORCE_COLOR set to '0' or 'false' disables colour.
 *  3. Any other FORCE_COLOR value (including the empty string) enables colour.
 *  4. Otherwise colour is enabled only when stdout is a TTY.
 *
 * @returns {boolean} True when styled output should be produced.
 */
function isColorEnabled() {
    const { NO_COLOR, NODE_DISABLE_COLORS, FORCE_COLOR } = process.env;
    // The NO_COLOR convention only applies when the variable is non-empty.
    const noColorRequested = NO_COLOR !== undefined && NO_COLOR !== '';
    if (noColorRequested || NODE_DISABLE_COLORS !== undefined) {
        return false;
    }
    // 'false' is an explicit opt-out just like '0'; it must not force colour on.
    if (FORCE_COLOR === '0' || FORCE_COLOR === 'false') {
        return false;
    }
    if (FORCE_COLOR !== undefined) {
        return true;
    }
    return Boolean(process.stdout?.isTTY);
}
|
|
41
|
+
export default chalk;
|