@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -0,0 +1,81 @@
1
+ import path from 'node:path';
2
+ import fs from 'node:fs/promises';
3
+ import chalk from '../utils/chalk.js';
4
+ import { generateHtml, } from './html.js';
5
+ import { renderCrawlMarkdown, renderCrawlCsvNodes, renderCrawlCsvEdges } from './crawlExport.js';
6
+ import { renderAnalysisHtml, renderAnalysisMarkdown, renderAnalysisCsv } from '../analysis/analyze.js';
7
/**
 * Normalizes an export option into a list of lower-cased format names.
 *
 * Accepted inputs:
 *  - `undefined`, `null` or `false` -> no exports (`[]`)
 *  - `true`                          -> the default export (`['json']`)
 *  - a comma-separated string        -> each non-empty, trimmed, lower-cased entry
 *  - an array of such strings        -> every entry is split and normalized the same way
 *
 * @param exportOption Raw option value as supplied by the caller.
 * @returns Format names in the order they were given; empty entries are dropped.
 */
export function parseExportFormats(exportOption) {
    // `null` previously fell through to `.split` and threw a TypeError.
    if (exportOption === undefined || exportOption === null || exportOption === false)
        return [];
    if (exportOption === true)
        return ['json'];
    // An array value (e.g. a repeated flag) has no `.split`; flatten it instead of crashing.
    const rawValues = Array.isArray(exportOption) ? exportOption : [exportOption];
    return rawValues
        .flatMap(value => String(value).split(','))
        .map(entry => entry.trim().toLowerCase())
        .filter(Boolean);
}
14
/**
 * Writes the crawl artifacts for every requested export format into `outputDir`.
 *
 * Formats handled: `json` (graph.json, metrics.json and, when a report is given,
 * report.json), `html` (graph.html), `visualize` (crawl.html), `csv` (nodes.csv,
 * edges.csv) and `markdown` (summary.md, followed by one JSON section per entry
 * of `report.plugins`). Unknown format names are ignored.
 *
 * @param formats   Normalized format names (see `parseExportFormats`).
 * @param outputDir Directory to write into; created recursively if missing.
 * @param url       Passed through to `renderCrawlMarkdown`.
 * @param graphData Serialized graph passed to the renderers.
 * @param metrics   Crawl metrics passed to the renderers.
 * @param graphObj  Passed through to `renderCrawlMarkdown`.
 * @param report    Optional report; written as report.json and scanned for `plugins`.
 * @returns Resolves once all requested files are written; rejects on the first I/O error.
 */
export async function runCrawlExports(formats, outputDir, url, graphData, metrics, graphObj, report) {
    if (formats.length === 0)
        return;
    await fs.mkdir(outputDir, { recursive: true });
    const target = (fileName) => path.join(outputDir, fileName);
    if (formats.includes('json')) {
        await fs.writeFile(target('graph.json'), JSON.stringify(graphData, null, 2));
        await fs.writeFile(target('metrics.json'), JSON.stringify(metrics, null, 2));
        if (report) {
            await fs.writeFile(target('report.json'), JSON.stringify(report, null, 2));
        }
        console.log(chalk.green(`JSON exports saved to ${outputDir} (graph.json, metrics.json${report ? ', report.json' : ''})`));
    }
    const wantsHtml = formats.includes('html');
    const wantsVisualization = formats.includes('visualize');
    if (wantsHtml || wantsVisualization) {
        // Both formats are rendered by the same call with the same arguments, so the
        // document is generated once and reused.
        // NOTE(review): assumes generateHtml has no side effects — confirm in ./html.js.
        const html = generateHtml(graphData, metrics);
        if (wantsHtml) {
            await fs.writeFile(target('graph.html'), html);
            console.log(chalk.green(`HTML report saved to ${target('graph.html')}`));
        }
        if (wantsVisualization) {
            await fs.writeFile(target('crawl.html'), html);
            console.log(chalk.green(`Visualization saved to ${target('crawl.html')}`));
        }
    }
    if (formats.includes('csv')) {
        await fs.writeFile(target('nodes.csv'), renderCrawlCsvNodes(graphData));
        await fs.writeFile(target('edges.csv'), renderCrawlCsvEdges(graphData));
        console.log(chalk.green(`CSV exports saved to ${outputDir} (nodes.csv, edges.csv)`));
    }
    if (formats.includes('markdown')) {
        // Assemble the whole document first and write it once. Previously the summary
        // was written and reported as saved before the plugin sections were appended
        // one file operation at a time, so a failure midway left a truncated file.
        const sections = [renderCrawlMarkdown(url, graphData, metrics, graphObj)];
        if (report && report.plugins) {
            for (const [pluginName, pluginData] of Object.entries(report.plugins)) {
                // The exporter stays generic: plugin data is dumped as JSON, with no
                // plugin-specific branches.
                const serialized = JSON.stringify(pluginData, null, 2);
                sections.push(`\n## Plugin: ${pluginName}\n\n\`\`\`json\n${serialized}\n\`\`\`\n`);
            }
        }
        await fs.writeFile(target('summary.md'), sections.join(''));
        console.log(chalk.green(`Markdown summary saved to ${target('summary.md')}`));
    }
}
55
/**
 * Writes the analysis result in every requested export format into `outputDir`.
 *
 * Files produced: `analysis.json`, `analysis.md`, `analysis.csv`, and an HTML
 * report named `page.html` when `isLive` is truthy or `analysis.html` otherwise.
 * Unknown format names are ignored.
 *
 * @param formats   Normalized format names (see `parseExportFormats`).
 * @param outputDir Directory to write into; created recursively if missing.
 * @param result    Analysis result passed to the renderers / serialized as JSON.
 * @param isLive    Only affects the HTML file name.
 * @returns Resolves once all requested files are written; rejects on the first I/O error.
 */
export async function runAnalysisExports(formats, outputDir, result, isLive) {
    if (formats.length === 0)
        return;
    await fs.mkdir(outputDir, { recursive: true });
    if (formats.includes('json')) {
        const jsonPath = path.join(outputDir, 'analysis.json');
        await fs.writeFile(jsonPath, JSON.stringify(result, null, 2));
        console.log(chalk.green(`JSON export saved to ${jsonPath}`));
    }
    if (formats.includes('html')) {
        const htmlPath = path.join(outputDir, isLive ? 'page.html' : 'analysis.html');
        await fs.writeFile(htmlPath, renderAnalysisHtml(result), 'utf-8');
        console.log(chalk.green(`HTML report saved to ${htmlPath}`));
    }
    if (formats.includes('markdown')) {
        // The former `isLive ? 'analysis.md' : 'analysis.md'` had identical arms; the
        // file name never depended on `isLive`, so the dead conditional is removed.
        const markdownPath = path.join(outputDir, 'analysis.md');
        await fs.writeFile(markdownPath, renderAnalysisMarkdown(result), 'utf-8');
        console.log(chalk.green(`Markdown report saved to ${markdownPath}`));
    }
    if (formats.includes('csv')) {
        // Same dead conditional as for markdown: the name is always analysis.csv.
        const csvPath = path.join(outputDir, 'analysis.csv');
        await fs.writeFile(csvPath, renderAnalysisCsv(result), 'utf-8');
        console.log(chalk.green(`CSV export saved to ${csvPath}`));
    }
}
@@ -0,0 +1,27 @@
1
+ import { Graph, Metrics } from '@crawlith/core';
2
+ export interface CrawlInsightReport {
3
+ pages: number;
4
+ fetchedPages?: number;
5
+ summary: {
6
+ crawlDepth: number;
7
+ internalLinks: number;
8
+ externalLinks: number;
9
+ };
10
+ health?: {
11
+ score: number;
12
+ status: string;
13
+ weightedPenalties: any;
14
+ };
15
+ issues?: any;
16
+ topAuthorityPages: {
17
+ url: string;
18
+ score: number;
19
+ }[];
20
+ }
21
+ export declare function buildCrawlInsightReport(graph: Graph, metrics: Metrics, healthData?: {
22
+ health: any;
23
+ issues: any;
24
+ }): CrawlInsightReport;
25
+ export declare function renderInsightOutput(report: CrawlInsightReport, snapshotId: number): string;
26
+ export declare function renderScoreBreakdown(health: any): string;
27
+ export declare function hasCriticalIssues(report: CrawlInsightReport): boolean;
@@ -0,0 +1,103 @@
1
/**
 * Builds the condensed insight report from crawl metrics and optional health data.
 *
 * @param graph      Crawl graph; accepted for interface compatibility but not read here.
 * @param metrics    Source of `totalPages`, `maxDepthFound`, `totalEdges`, the optional
 *                   `sessionStats.pagesFetched` and the `topAuthorityPages` list.
 * @param healthData Optional `{ health, issues }`; both are copied through as-is.
 * @returns A report whose `summary.externalLinks` falls back to 0 and whose
 *          `topAuthorityPages` entries are reduced to `{ url, score }`.
 */
export function buildCrawlInsightReport(graph, metrics, healthData) {
    // Every other optional input is already guarded; a missing authority list used
    // to throw on `.map`, so it is treated as empty instead.
    const authorityPages = metrics.topAuthorityPages ?? [];
    return {
        pages: metrics.totalPages,
        fetchedPages: metrics.sessionStats?.pagesFetched,
        health: healthData?.health,
        issues: healthData?.issues,
        summary: {
            crawlDepth: metrics.maxDepthFound,
            internalLinks: metrics.totalEdges,
            externalLinks: healthData?.issues?.externalLinks || 0
        },
        topAuthorityPages: authorityPages.map(p => ({ url: p.url, score: p.authority }))
    };
}
15
/**
 * Renders the insight report as plain text for terminal output.
 *
 * Layout: header, snapshot id, page counts, optional health score, optional
 * "Critical" and "Warnings" lists (only issues with a count above zero, and only
 * when `report.issues` is present), a "Structure" block and up to ten
 * "Top Authority" entries.
 *
 * @param report     Report as produced by `buildCrawlInsightReport`.
 * @param snapshotId Identifier printed after `# ` in the header.
 * @returns The rendered text, terminated by a newline.
 */
export function renderInsightOutput(report, snapshotId) {
    const { issues, summary } = report;
    const out = ['CRAWLITH — Crawl', '', `# ${snapshotId}`, ''];
    // The "fetched / discovered" wording is used only when a fetched count is
    // known and differs from the total.
    const partialFetch = report.fetchedPages !== undefined && report.fetchedPages !== report.pages;
    out.push(partialFetch
        ? `${report.fetchedPages} pages fetched / ${report.pages} discovered`
        : `${report.pages} pages crawled`);
    out.push('');
    if (report.health) {
        out.push(`Score: ${report.health.score} (${report.health.status})`, '');
    }
    if (issues) {
        // Emits a titled bullet list for the rows whose count is positive;
        // emits nothing at all when no row qualifies.
        const pushSection = (title, rows) => {
            const active = rows.filter(([count]) => count > 0);
            if (active.length === 0)
                return;
            out.push(title);
            for (const [count, label] of active)
                out.push(` • ${count} ${label}`);
            out.push('');
        };
        pushSection('Critical', [
            [issues.orphanPages, 'orphan pages'],
            [issues.redirectChains, 'redirect chains'],
            [issues.brokenInternalLinks, 'broken internal links'],
            [issues.duplicateClusters, 'near-duplicate clusters'],
            [issues.canonicalConflicts, 'canonical conflicts'],
            [issues.accidentalNoindex, 'pages accidentally noindexed'],
            [issues.blockedByRobots, 'pages blocked by robots.txt']
        ]);
        pushSection('Warnings', [
            [issues.missingH1, 'pages missing H1'],
            [issues.thinContent, 'thin content pages'],
            [issues.excessiveInternalLinkCount, 'pages with excessive links'],
            [issues.imageAltMissing, 'pages missing image alt']
        ]);
    }
    out.push('Structure', ` Depth Reached ${summary.crawlDepth}`, ` Internal Links ${summary.internalLinks}`, ` External Links ${summary.externalLinks}`, '');
    if (report.topAuthorityPages.length > 0) {
        out.push('Top Authority');
        report.topAuthorityPages
            .slice(0, 10)
            .forEach(page => out.push(` ${page.url} ${page.score.toFixed(3)}`));
        out.push('');
    }
    return `${out.join('\n')}\n`;
}
85
/**
 * Renders a three-line, human-readable dump of the health score inputs.
 *
 * @param health Health breakdown; its `weights` and `weightedPenalties` are JSON-serialized.
 * @returns A title line followed by the `weights:` and `penalties:` lines, newline-separated.
 */
export function renderScoreBreakdown(health) {
    const weightsLine = `weights: ${JSON.stringify(health.weights)}`;
    const penaltiesLine = `penalties: ${JSON.stringify(health.weightedPenalties)}`;
    return `Health Score Breakdown\n${weightsLine}\n${penaltiesLine}`;
}
92
/**
 * Tells whether the report contains at least one issue of a critical category.
 *
 * @param report Insight report; a report without `issues` is never critical.
 * @returns `true` when any of the critical issue counters is greater than zero.
 */
export function hasCriticalIssues(report) {
    const { issues } = report;
    if (!issues)
        return false;
    const criticalCounters = [
        'orphanPages',
        'brokenInternalLinks',
        'redirectChains',
        'duplicateClusters',
        'canonicalConflicts',
        'accidentalNoindex',
        'blockedByRobots'
    ];
    return criticalCounters.some(counter => issues[counter] > 0);
}
@@ -1,10 +1,4 @@
1
1
  import { Graph } from '../graph/graph.js';
2
- import { Metrics } from '../graph/metrics.js';
3
- export declare const THIN_CONTENT_THRESHOLD = 300;
4
- export declare const LOW_INTERNAL_LINK_THRESHOLD = 2;
5
- export declare const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
6
- export declare const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
7
- export declare const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
8
2
  export interface HealthScoreWeights {
9
3
  orphans: number;
10
4
  brokenLinks: number;
@@ -17,8 +11,8 @@ export interface HealthScoreWeights {
17
11
  lowInternalLinks: number;
18
12
  excessiveLinks: number;
19
13
  blockedByRobots: number;
14
+ crawlTraps: number;
20
15
  }
21
- export declare const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights;
22
16
  export interface CrawlIssueCounts {
23
17
  orphanPages: number;
24
18
  brokenInternalLinks: number;
@@ -38,13 +32,25 @@ export interface CrawlIssueCounts {
38
32
  underlinkedHighAuthorityPages: number;
39
33
  externalLinks: number;
40
34
  blockedByRobots: number;
35
+ crawlTraps: number;
41
36
  }
42
37
  export interface HealthScoreBreakdown {
43
38
  score: number;
44
39
  status: string;
45
- weightedPenalties: Record<keyof HealthScoreWeights, number>;
40
+ weightedPenalties: Record<string, number>;
46
41
  weights: HealthScoreWeights;
47
42
  }
48
- export declare function healthStatusLabel(score: number, hasCritical?: boolean): string;
49
- export declare function calculateHealthScore(totalPages: number, issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots'>, weights?: HealthScoreWeights): HealthScoreBreakdown;
50
- export declare function collectCrawlIssues(graph: Graph, metrics: Metrics): CrawlIssueCounts;
43
+ export declare const THIN_CONTENT_THRESHOLD = 200;
44
+ export declare const LOW_INTERNAL_LINK_THRESHOLD = 2;
45
+ export declare const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
46
+ export declare const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
47
+ export declare const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
48
+ export declare const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights;
49
+ export declare class HealthService {
50
+ calculateHealthScore(totalPages: number, issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots' | 'crawlTraps'>, weights?: HealthScoreWeights): HealthScoreBreakdown;
51
+ collectCrawlIssues(graph: Graph, metrics: any, rootOrigin?: string): CrawlIssueCounts;
52
+ private clamp;
53
+ private healthStatusLabel;
54
+ }
55
+ export declare const calculateHealthScore: (totalPages: number, issues: Pick<CrawlIssueCounts, "orphanPages" | "brokenInternalLinks" | "redirectChains" | "duplicateClusters" | "thinContent" | "missingH1" | "accidentalNoindex" | "canonicalConflicts" | "lowInternalLinkCount" | "excessiveInternalLinkCount" | "blockedByRobots" | "crawlTraps">, weights?: HealthScoreWeights) => HealthScoreBreakdown;
56
+ export declare const healthStatusLabel: (score: number, hasCritical?: boolean) => "Needs Attention" | "Excellent" | "Good" | "Critical";
@@ -2,7 +2,8 @@ import { analyzeContent } from '../analysis/content.js';
2
2
  import { analyzeH1 } from '../analysis/seo.js';
3
3
  import { analyzeImageAlts } from '../analysis/images.js';
4
4
  import { analyzeLinks } from '../analysis/links.js';
5
- export const THIN_CONTENT_THRESHOLD = 300;
5
+ import { UrlUtil } from '../crawler/normalize.js';
6
+ export const THIN_CONTENT_THRESHOLD = 200;
6
7
  export const LOW_INTERNAL_LINK_THRESHOLD = 2;
7
8
  export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;
8
9
  export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;
@@ -18,12 +19,188 @@ export const DEFAULT_HEALTH_WEIGHTS = {
18
19
  canonicalConflicts: 10,
19
20
  lowInternalLinks: 10,
20
21
  excessiveLinks: 5,
21
- blockedByRobots: 100
22
+ blockedByRobots: 100,
23
+ crawlTraps: 50
22
24
  };
23
- function clamp(value, min, max) {
24
- return Math.min(max, Math.max(min, value));
25
/**
 * Computes crawl issue counts from a graph and turns them into a 0-100 health score.
 */
export class HealthService {
    /**
     * Converts issue counts into per-category penalties and an overall score.
     *
     * Each penalty is `(issueCount / pages) * weight`, clamped to `[0, weight]`;
     * the score is `100 - sum(penalties)`, clamped to `[0, 100]` and rounded to
     * one decimal place.
     *
     * @param totalPages Number of pages; values below 1 are treated as 1.
     * @param issues     Issue counters; missing counters count as 0.
     * @param weights    Maximum penalty per category. A missing or non-finite
     *                   weight contributes no penalty.
     * @returns `{ score, status, weightedPenalties, weights }`.
     */
    calculateHealthScore(totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) {
        const safePages = Math.max(totalPages, 1);
        // [weight / penalty key, issue counter key]. Order is preserved so the
        // penalty object serializes with the same key order as before.
        const penaltySources = [
            ['orphans', 'orphanPages'],
            ['brokenLinks', 'brokenInternalLinks'],
            ['redirectChains', 'redirectChains'],
            ['duplicateClusters', 'duplicateClusters'],
            ['thinContent', 'thinContent'],
            ['missingH1', 'missingH1'],
            ['noindexMisuse', 'accidentalNoindex'],
            ['canonicalConflicts', 'canonicalConflicts'],
            ['lowInternalLinks', 'lowInternalLinkCount'],
            ['excessiveLinks', 'excessiveInternalLinkCount'],
            ['blockedByRobots', 'blockedByRobots'],
            ['crawlTraps', 'crawlTraps']
        ];
        const weightedPenalties = Object.fromEntries(penaltySources.map(([weightKey, issueKey]) => {
            // A weights object without a given key (e.g. one lacking `crawlTraps`)
            // used to yield a NaN penalty and therefore a NaN score. Such a weight
            // now simply contributes nothing.
            const weight = Number.isFinite(weights[weightKey]) ? weights[weightKey] : 0;
            const penalty = ((issues[issueKey] || 0) / safePages) * weight;
            return [weightKey, this.clamp(penalty, 0, weight)];
        }));
        const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
        const score = Number(this.clamp(100 - totalPenalty, 0, 100).toFixed(1));
        const criticalCounters = [
            'orphanPages',
            'brokenInternalLinks',
            'redirectChains',
            'duplicateClusters',
            'canonicalConflicts',
            'accidentalNoindex',
            'blockedByRobots'
        ];
        const hasCritical = criticalCounters.some(counter => (issues[counter] || 0) > 0);
        return {
            score,
            status: this.healthStatusLabel(score, hasCritical),
            weightedPenalties,
            weights
        };
    }
    /**
     * Walks the graph and counts the issues used for scoring and reporting.
     *
     * Link, redirect, canonical, noindex and robots checks run for every internal
     * node; the HTML-based checks (H1, thin content, external links, image alts)
     * run only for internal nodes that carry `html`.
     *
     * @param graph      Graph exposing `getNodes()`, a `nodes` map keyed by URL and,
     *                   optionally, `contentClusters`.
     * @param metrics    Read for `orphanPages` and, as a fallback, `clusters`.
     * @param rootOrigin Optional origin used to absolutize node URLs.
     * @returns The complete set of issue counters.
     */
    collectCrawlIssues(graph, metrics, rootOrigin = '') {
        const nodes = graph.getNodes();
        let brokenInternalLinks = 0;
        let redirectChains = 0;
        let canonicalConflicts = 0;
        let accidentalNoindex = 0;
        let missingH1 = 0;
        let thinContent = 0;
        let highExternalLinkRatio = 0;
        let imageAltMissing = 0;
        let lowInternalLinkCount = 0;
        let excessiveInternalLinkCount = 0;
        let strongPagesUnderLinking = 0;
        let nearAuthorityThreshold = 0;
        let underlinkedHighAuthorityPages = 0;
        let externalLinks = 0;
        let blockedByRobots = 0;
        let crawlTraps = 0;
        for (const node of nodes) {
            if (!node.isInternal) {
                continue;
            }
            if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
                blockedByRobots += 1;
            }
            if (node.crawlTrapFlag) {
                crawlTraps += 1;
            }
            if (this.isConfirmedError(node)) {
                brokenInternalLinks += 1;
            }
            if (node.brokenLinks) {
                // Count only targets that exist in the graph and are confirmed errors.
                const actualBreaks = node.brokenLinks.filter(url => {
                    const target = graph.nodes.get(url);
                    return target && this.isConfirmedError(target);
                });
                brokenInternalLinks += actualBreaks.length;
            }
            if ((node.redirectChain?.length || 0) > 1) {
                redirectChains += 1;
            }
            const absoluteUrl = rootOrigin ? (node.url.startsWith('http') ? node.url : new URL(node.url, rootOrigin).toString()) : node.url;
            if (node.canonical && node.canonical !== node.url && node.canonical !== absoluteUrl) {
                // Final check: a single trailing slash is not treated as a difference.
                const normCanonical = node.canonical.replace(/\/$/, '');
                const normAbsolute = absoluteUrl.replace(/\/$/, '');
                if (normCanonical !== normAbsolute) {
                    canonicalConflicts += 1;
                }
            }
            if (node.noindex && node.status >= 200 && node.status < 300) {
                accidentalNoindex += 1;
            }
            if (node.inLinks === 1 && node.depth > 0) {
                lowInternalLinkCount += 1;
            }
            if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
                excessiveInternalLinkCount += 1;
            }
            // Everything below needs the page markup.
            if (!node.html) {
                continue;
            }
            const h1Res = analyzeH1(node.html, '');
            if (h1Res.count === 0) {
                missingH1 += 1;
            }
            // Prefer the stored word count; analyze the markup only when it is absent.
            const wordCount = node.wordCount != null ? node.wordCount : analyzeContent(node.html).wordCount;
            if (wordCount < THIN_CONTENT_THRESHOLD) {
                thinContent += 1;
            }
            const pageAbsUrl = rootOrigin ? UrlUtil.toAbsolute(node.url, rootOrigin) : node.url;
            const links = analyzeLinks(node.html, pageAbsUrl, rootOrigin || node.url);
            externalLinks += links.externalLinks;
            if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
                highExternalLinkRatio += 1;
            }
            const imageAlt = analyzeImageAlts(node.html);
            if (imageAlt.missingAlt > 0) {
                imageAltMissing += 1;
            }
        }
        const clusters = graph.contentClusters || metrics.clusters || [];
        const duplicateClusters = clusters.length;
        const cannibalizationClusters = clusters.filter((cluster) => cluster.risk === 'high' || cluster.type === 'near').length;
        // This pass covers all nodes, internal or not.
        for (const node of nodes) {
            // Authority is a two-level proxy derived from inbound links only.
            // NOTE(review): since it is either 0.8 or 0.2, the 0.65 band below is
            // never hit, and `inLinks > 5` looks incompatible with
            // `inLinks < LOW_INTERNAL_LINK_THRESHOLD` — confirm the intended source.
            const authority = node.inLinks > 5 ? 0.8 : 0.2;
            if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
                strongPagesUnderLinking += 1;
            }
            if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
                nearAuthorityThreshold += 1;
            }
            if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
                underlinkedHighAuthorityPages += 1;
            }
        }
        return {
            orphanPages: metrics.orphanPages?.length || 0,
            brokenInternalLinks,
            redirectChains,
            duplicateClusters,
            canonicalConflicts,
            accidentalNoindex,
            missingH1,
            thinContent,
            lowInternalLinkCount,
            excessiveInternalLinkCount,
            highExternalLinkRatio,
            imageAltMissing,
            strongPagesUnderLinking,
            cannibalizationClusters,
            nearAuthorityThreshold,
            underlinkedHighAuthorityPages,
            externalLinks,
            blockedByRobots,
            crawlTraps
        };
    }
    /**
     * True for a node with an HTTP status of 400 or above, or with status 0
     * together with a failure crawl status or a security error.
     */
    isConfirmedError(node) {
        if (node.status >= 400)
            return true;
        if (node.status !== 0)
            return false;
        return (node.crawlStatus === 'network_error' ||
            node.crawlStatus === 'failed_after_retries' ||
            Boolean(node.securityError) ||
            node.crawlStatus === 'fetched_error');
    }
    /** Restricts `value` to the inclusive range `[min, max]`. */
    clamp(value, min, max) {
        return Math.min(max, Math.max(min, value));
    }
    /**
     * Maps a score to its label. A critical issue caps an otherwise good or
     * excellent score (75 and above) at 'Needs Attention'.
     */
    healthStatusLabel(score, hasCritical = false) {
        if (hasCritical && score >= 75)
            return 'Needs Attention';
        if (score >= 90)
            return 'Excellent';
        if (score >= 75)
            return 'Good';
        if (score >= 50)
            return 'Needs Attention';
        return 'Critical';
    }
}
26
- export function healthStatusLabel(score, hasCritical = false) {
201
+ const service = new HealthService();
202
+ export const calculateHealthScore = (totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) => service.calculateHealthScore(totalPages, issues, weights);
203
+ export const healthStatusLabel = (score, hasCritical = false) => {
27
204
  if (hasCritical && score >= 75)
28
205
  return 'Needs Attention';
29
206
  if (score >= 90)
@@ -33,138 +210,4 @@ export function healthStatusLabel(score, hasCritical = false) {
33
210
  if (score >= 50)
34
211
  return 'Needs Attention';
35
212
  return 'Critical';
36
- }
37
- export function calculateHealthScore(totalPages, issues, weights = DEFAULT_HEALTH_WEIGHTS) {
38
- const safePages = Math.max(totalPages, 1);
39
- const weightedPenalties = {
40
- orphans: clamp((issues.orphanPages / safePages) * weights.orphans, 0, weights.orphans),
41
- brokenLinks: clamp((issues.brokenInternalLinks / safePages) * weights.brokenLinks, 0, weights.brokenLinks),
42
- redirectChains: clamp((issues.redirectChains / safePages) * weights.redirectChains, 0, weights.redirectChains),
43
- duplicateClusters: clamp((issues.duplicateClusters / safePages) * weights.duplicateClusters, 0, weights.duplicateClusters),
44
- thinContent: clamp((issues.thinContent / safePages) * weights.thinContent, 0, weights.thinContent),
45
- missingH1: clamp((issues.missingH1 / safePages) * weights.missingH1, 0, weights.missingH1),
46
- noindexMisuse: clamp((issues.accidentalNoindex / safePages) * weights.noindexMisuse, 0, weights.noindexMisuse),
47
- canonicalConflicts: clamp((issues.canonicalConflicts / safePages) * weights.canonicalConflicts, 0, weights.canonicalConflicts),
48
- lowInternalLinks: clamp((issues.lowInternalLinkCount / safePages) * weights.lowInternalLinks, 0, weights.lowInternalLinks),
49
- excessiveLinks: clamp((issues.excessiveInternalLinkCount / safePages) * weights.excessiveLinks, 0, weights.excessiveLinks),
50
- blockedByRobots: clamp((issues.blockedByRobots / safePages) * weights.blockedByRobots, 0, weights.blockedByRobots)
51
- };
52
- const totalPenalty = Object.values(weightedPenalties).reduce((sum, value) => sum + value, 0);
53
- const score = Number(clamp(100 - totalPenalty, 0, 100).toFixed(1));
54
- const hasCritical = (issues.orphanPages > 0 ||
55
- issues.brokenInternalLinks > 0 ||
56
- issues.redirectChains > 0 ||
57
- issues.duplicateClusters > 0 ||
58
- issues.canonicalConflicts > 0 ||
59
- issues.accidentalNoindex > 0 ||
60
- issues.blockedByRobots > 0);
61
- return {
62
- score,
63
- status: healthStatusLabel(score, hasCritical),
64
- weightedPenalties,
65
- weights
66
- };
67
- }
68
- export function collectCrawlIssues(graph, metrics) {
69
- const nodes = graph.getNodes();
70
- let brokenInternalLinks = 0;
71
- let redirectChains = 0;
72
- let canonicalConflicts = 0;
73
- let accidentalNoindex = 0;
74
- let missingH1 = 0;
75
- let thinContent = 0;
76
- let highExternalLinkRatio = 0;
77
- let imageAltMissing = 0;
78
- let lowInternalLinkCount = 0;
79
- let excessiveInternalLinkCount = 0;
80
- let strongPagesUnderLinking = 0;
81
- let nearAuthorityThreshold = 0;
82
- let underlinkedHighAuthorityPages = 0;
83
- let externalLinks = 0;
84
- let blockedByRobots = 0;
85
- for (const node of nodes) {
86
- if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
87
- blockedByRobots += 1;
88
- }
89
- const isConfirmedError = node.status >= 400 || (node.status === 0 && (node.crawlStatus === 'network_error' || node.crawlStatus === 'failed_after_retries' || node.securityError || node.crawlStatus === 'fetched_error'));
90
- if (isConfirmedError) {
91
- brokenInternalLinks += 1;
92
- }
93
- if (node.brokenLinks) {
94
- const actualBreaks = node.brokenLinks.filter(url => {
95
- const target = graph.nodes.get(url);
96
- return target && (target.status >= 400 || (target.status === 0 && (target.crawlStatus === 'network_error' || target.crawlStatus === 'failed_after_retries' || target.securityError || target.crawlStatus === 'fetched_error')));
97
- });
98
- brokenInternalLinks += actualBreaks.length;
99
- }
100
- if ((node.redirectChain?.length || 0) > 1) {
101
- redirectChains += 1;
102
- }
103
- if (node.canonical && node.canonical !== node.url) {
104
- canonicalConflicts += 1;
105
- }
106
- if (node.noindex && node.status >= 200 && node.status < 300) {
107
- accidentalNoindex += 1;
108
- }
109
- if (node.inLinks < LOW_INTERNAL_LINK_THRESHOLD && node.depth > 0) {
110
- lowInternalLinkCount += 1;
111
- }
112
- if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
113
- excessiveInternalLinkCount += 1;
114
- }
115
- if (!node.html) {
116
- continue;
117
- }
118
- const h1 = analyzeH1(node.html, '');
119
- if (h1.count === 0) {
120
- missingH1 += 1;
121
- }
122
- const content = analyzeContent(node.html);
123
- if (content.wordCount < THIN_CONTENT_THRESHOLD) {
124
- thinContent += 1;
125
- }
126
- const links = analyzeLinks(node.html, node.url, node.url);
127
- externalLinks += links.externalLinks;
128
- if (links.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
129
- highExternalLinkRatio += 1;
130
- }
131
- const imageAlt = analyzeImageAlts(node.html);
132
- if (imageAlt.missingAlt > 0) {
133
- imageAltMissing += 1;
134
- }
135
- }
136
- const duplicateClusters = graph.duplicateClusters?.length || 0;
137
- const cannibalizationClusters = graph.duplicateClusters?.filter((cluster) => cluster.type === 'near').length || 0;
138
- for (const node of nodes) {
139
- const authority = node.pageRank || 0;
140
- if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.outLinks < 3) {
141
- strongPagesUnderLinking += 1;
142
- }
143
- if (authority >= 0.65 && authority < OPPORTUNITY_AUTHORITY_THRESHOLD) {
144
- nearAuthorityThreshold += 1;
145
- }
146
- if (authority >= OPPORTUNITY_AUTHORITY_THRESHOLD && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
147
- underlinkedHighAuthorityPages += 1;
148
- }
149
- }
150
- return {
151
- orphanPages: metrics.orphanPages.length,
152
- brokenInternalLinks,
153
- redirectChains,
154
- duplicateClusters,
155
- canonicalConflicts,
156
- accidentalNoindex,
157
- missingH1,
158
- thinContent,
159
- lowInternalLinkCount,
160
- excessiveInternalLinkCount,
161
- highExternalLinkRatio,
162
- imageAltMissing,
163
- strongPagesUnderLinking,
164
- cannibalizationClusters,
165
- nearAuthorityThreshold,
166
- underlinkedHighAuthorityPages,
167
- externalLinks,
168
- blockedByRobots
169
- };
170
- }
213
+ };
@@ -0,0 +1,6 @@
1
/**
 * Shape of the default export: a value that can be called with some text
 * and that also exposes further `Chalk` values under any string key, so
 * property accesses can be chained before the final call.
 */
+ interface Chalk {
2
/** Call signature: accepts any value and returns a string. */
+ (text: unknown): string;
3
/** Every string-keyed property is itself a `Chalk`. */
+ [key: string]: Chalk;
4
+ }
5
/** The `Chalk` instance that this module exports as its default. */
+ declare const chalk: Chalk;
6
+ export default chalk;
@@ -0,0 +1,41 @@
1
+ import { styleText } from 'node:util';
2
// Alternate style spellings, mapped to the name that is substituted for them
// when a property is read from a chalk instance.
+ const alias = {
3
+ grey: 'gray'
4
+ };
5
// Root instance, created with an empty style list.
+ const chalk = createChalk([]);
6
/**
 * Builds a callable, chainable formatter bound to a list of style names.
 *
 * Calling the result formats the first argument with the accumulated styles.
 * Reading any string-keyed property returns a new formatter with that name
 * (after alias substitution) appended to the style list. Symbol-keyed
 * property reads yield `undefined`.
 *
 * @param styles - Style names accumulated so far.
 * @returns A Proxy-wrapped function implementing the behaviour above.
 */
function createChalk(styles) {
    const render = (text) => applyStyles(styles, text);
    const handler = {
        // Invocation goes through the trap, so only the first argument is used.
        apply: (_target, _thisArg, [text]) => applyStyles(styles, text),
        get: (_target, prop) => {
            if (typeof prop === 'string') {
                const resolved = alias[prop] ?? prop;
                return createChalk([...styles, resolved]);
            }
            return undefined;
        }
    };
    return new Proxy(render, handler);
}
21
/**
 * Converts `text` to a string and, when appropriate, wraps it with the given styles.
 *
 * `null` and `undefined` become the empty string. The text is returned
 * unstyled when no styles were requested or when colour output is disabled;
 * otherwise it is passed to `styleText` together with the style list.
 *
 * @param styles - Style names to apply, in order.
 * @param text - Value to format.
 * @returns The plain or styled string.
 */
function applyStyles(styles, text) {
    const value = String(text ?? '');
    // Short-circuit keeps the colour check from running when there is nothing to apply.
    const shouldStyle = styles.length > 0 && isColorEnabled();
    return shouldStyle ? styleText(styles, value) : value;
}
28
/**
 * Decides whether styled (ANSI) output should be produced.
 *
 * Precedence, highest first:
 *  1. `NO_COLOR` or `NODE_DISABLE_COLORS` set to a non-empty value: disabled.
 *  2. `FORCE_COLOR` equal to '0' or 'false': disabled.
 *  3. `FORCE_COLOR` set to anything else: enabled.
 *  4. Otherwise: enabled only when `process.stdout` is a TTY.
 *
 * @returns `true` when colour output should be used.
 */
function isColorEnabled() {
    const { NO_COLOR, NODE_DISABLE_COLORS, FORCE_COLOR } = process.env;
    // no-color.org: the variable only counts when "present and not an empty
    // string", so an empty value is treated as unset. The same rule is applied
    // to NODE_DISABLE_COLORS for consistency.
    const isNonEmpty = (value) => value !== undefined && value !== '';
    if (isNonEmpty(NO_COLOR) || isNonEmpty(NODE_DISABLE_COLORS)) {
        return false;
    }
    // 'false' is the conventional textual spelling of level 0 and must not
    // be interpreted as "force colours on".
    if (FORCE_COLOR === '0' || FORCE_COLOR === 'false') {
        return false;
    }
    if (FORCE_COLOR !== undefined) {
        return true;
    }
    return Boolean(process.stdout?.isTTY);
}
41
+ export default chalk;