@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,58 +0,0 @@
1
/**
 * Renders one CSV cell.
 *
 * `null`/`undefined` become an empty cell (the same result `Array.prototype.join`
 * produced before). Any value containing a comma, double quote or line break is
 * wrapped in double quotes with embedded quotes doubled, so it cannot spill into
 * neighbouring columns or rows.
 */
function csvField(value: unknown): string {
  if (value === null || value === undefined) return '';
  const text = String(value);
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
}

/**
 * Serialises the graph nodes as CSV, one row per node.
 *
 * @param graphData Object with a `nodes` array and an `edges` array; link counts
 *   are derived from `edges[].source` / `edges[].target` matching `nodes[].url`.
 * @returns Header line followed by one line per node, joined with `\n`.
 */
export function renderCrawlCsvNodes(graphData: any): string {
  const nodeHeaders = ['URL', 'Depth', 'Status', 'InboundLinks', 'OutboundLinks', 'PageRankScore'];

  // Count links in a single pass instead of re-scanning every edge for every node.
  const inboundByUrl = new Map<unknown, number>();
  const outboundByUrl = new Map<unknown, number>();
  for (const e of graphData.edges ?? []) {
    outboundByUrl.set(e.source, (outboundByUrl.get(e.source) ?? 0) + 1);
    inboundByUrl.set(e.target, (inboundByUrl.get(e.target) ?? 0) + 1);
  }

  const nodeRows = graphData.nodes.map((n: any) => {
    // Status 0 is reported with a label rather than a bare number.
    const statusStr = n.status === 0 ? 'Pending/Limit' : n.status;
    return [
      n.url,
      n.depth,
      statusStr,
      inboundByUrl.get(n.url) ?? 0,
      outboundByUrl.get(n.url) ?? 0,
      (n.pageRankScore || 0).toFixed(3)
    ].map(csvField).join(',');
  });

  return [nodeHeaders.join(','), ...nodeRows].join('\n');
}

/**
 * Serialises the graph edges as CSV, one row per edge.
 *
 * @param graphData Object with an `edges` array of `{ source, target, weight }`.
 * @returns Header line followed by one line per edge, joined with `\n`.
 */
export function renderCrawlCsvEdges(graphData: any): string {
  const edgeHeaders = ['Source', 'Target', 'Weight'];
  const edgeRows = graphData.edges.map((e: any) => [e.source, e.target, e.weight].map(csvField).join(','));
  return [edgeHeaders.join(','), ...edgeRows].join('\n');
}
17
-
18
/**
 * Builds a Markdown summary of a crawl: headline metrics, the ten pages with the
 * most inbound links, and (when `metrics.topPageRankPages` is non-empty) the top
 * ten PageRank pages.
 *
 * @param url Value shown in the report title.
 * @param graphData Object with `nodes` (`url`, `status`) and `edges` (`target`).
 * @param metrics Object providing `totalPages`, `totalEdges`, `averageDepth`,
 *   `maxDepthFound`, `crawlEfficiencyScore` and optionally `topPageRankPages`.
 * @param graph Graph object; `sessionStats.pagesFetched` is read for the session
 *   count, and `nodes.get(url)` or `getNodes()` to look up `pageRankScore`.
 * @returns The Markdown document, lines joined with `\n`.
 */
export function renderCrawlMarkdown(url: string, graphData: any, metrics: any, graph: any): string {
  // A literal "|" inside a cell would otherwise start a new table column.
  const tableCell = (value: unknown): string => String(value).replace(/\|/g, '\\|');

  const md = [
    `# Crawlith Crawl Summary - ${url}`,
    '',
    `## 📊 Metrics`,
    `- Total Pages Discovered: ${metrics.totalPages}`,
    `- Session Pages Crawled: ${graph.sessionStats?.pagesFetched ?? 0}`,
    `- Total Edges: ${metrics.totalEdges}`,
    `- Avg Depth: ${metrics.averageDepth.toFixed(2)}`,
    `- Max Depth: ${metrics.maxDepthFound}`,
    `- Crawl Efficiency: ${(metrics.crawlEfficiencyScore * 100).toFixed(1)}%`,
    '',
    `## 📄 Top Pages (by In-degree)`,
  ];

  // Single pass over the edges instead of one full scan per node.
  const inboundByUrl = new Map<unknown, number>();
  for (const e of graphData.edges ?? []) {
    inboundByUrl.set(e.target, (inboundByUrl.get(e.target) ?? 0) + 1);
  }

  // Only the fields needed for the table are carried along (no page HTML copies).
  const topPages = [...graphData.nodes]
    .map((n: any) => ({ url: n.url, status: n.status, inLinks: inboundByUrl.get(n.url) ?? 0 }))
    .sort((a, b) => b.inLinks - a.inLinks)
    .slice(0, 10);

  md.push('| URL | Inbound | Status |');
  md.push('| :--- | :--- | :--- |');
  topPages.forEach(p => {
    const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
    md.push(`| ${tableCell(p.url)} | ${p.inLinks} | ${statusStr} |`);
  });

  if (metrics.topPageRankPages?.length > 0) {
    md.push('');
    md.push('## 🏆 Top PageRank Pages');
    md.push('| URL | Score |');
    md.push('| :--- | :--- |');
    metrics.topPageRankPages.slice(0, 10).forEach((p: any) => {
      // Supports both a Map-like `graph.nodes` and a `getNodes()` accessor.
      const node = graph.nodes?.get ? graph.nodes.get(p.url) : graph.getNodes?.().find((x: any) => x.url === p.url);
      const score = node?.pageRankScore ?? 0;
      md.push(`| ${tableCell(p.url)} | ${score.toFixed(3)}/100 |`);
    });
  }

  return md.join('\n');
}
@@ -1,9 +0,0 @@
1
- import fs from 'node:fs';
2
- import path from 'node:path';
3
- import { fileURLToPath } from 'node:url';
4
-
5
// Location of the report template: `crawl.html` in the same directory as this module.
const templatePath = path.join(path.dirname(fileURLToPath(import.meta.url)), 'crawl.html');

/** Contents of `crawl.html`, read synchronously as UTF-8 when this module is loaded. */
export const Crawl_HTML = fs.readFileSync(templatePath, 'utf-8');
@@ -1,27 +0,0 @@
1
- import { Metrics } from '../graph/metrics.js';
2
- import { Crawl_HTML } from './crawl_template.js';
3
-
4
/**
 * Serialises `data` as JSON that is safe to inline inside a `<script>` element.
 *
 * - `<` is emitted as `\u003c` so the payload can never contain `</script>` or `<!--`.
 * - U+2028 / U+2029 are emitted as escapes; they are legal in JSON but are line
 *   terminators in pre-ES2019 JavaScript engines.
 * - Values `JSON.stringify` cannot represent (e.g. `undefined`) yield `null`
 *   instead of throwing on the missing string.
 *
 * The escaped text parses back to the same value as the unescaped JSON.
 */
function safeJson(data: any): string {
  const json = JSON.stringify(data) ?? 'null';
  return json
    .replace(/</g, '\\u003c')
    .replace(/\u2028/g, '\\u2028')
    .replace(/\u2029/g, '\\u2029');
}
7
-
8
/**
 * Produces the HTML crawl report by injecting the graph and metrics as
 * `window.GRAPH_DATA` / `window.METRICS_DATA` in a `<script>` placed before the
 * template's first `</body>`.
 *
 * @param graphData Graph export; each node's `html` property is dropped before
 *   embedding. A missing `nodes` array is embedded as `[]`.
 * @param metrics Metrics object embedded as-is.
 * @returns The template with the data script inserted.
 */
export function generateHtml(graphData: any, metrics: Metrics): string {
  // Strip heavy HTML content from nodes to keep the report lightweight
  const vizGraphData = {
    ...graphData,
    nodes: graphData.nodes ? graphData.nodes.map((n: any) => {
      // eslint-disable-next-line @typescript-eslint/no-unused-vars
      const { html, ...rest } = n;
      return rest;
    }) : []
  };

  const graphJson = safeJson(vizGraphData);
  const metricsJson = safeJson(metrics);

  const injected = `<script>
    window.GRAPH_DATA = ${graphJson};
    window.METRICS_DATA = ${metricsJson};
  </script>
</body>`;

  // A replacer function is used because a replacement *string* would expand
  // "$&", "$'", "$`" and "$$" sequences that may occur in the crawled data.
  return Crawl_HTML.replace('</body>', () => injected);
}
@@ -1,241 +0,0 @@
1
- import { Graph } from '../graph/graph.js';
2
- import { Metrics } from '../graph/metrics.js';
3
- import { analyzeContent } from '../analysis/content.js';
4
- import { analyzeH1 } from '../analysis/seo.js';
5
- import { analyzeImageAlts } from '../analysis/images.js';
6
- import { analyzeLinks } from '../analysis/links.js';
7
-
8
/** Pages whose analysed word count is below this value are counted as thin content. */
export const THIN_CONTENT_THRESHOLD = 300;

/** Inbound-link count below which a page is counted as under-linked. */
export const LOW_INTERNAL_LINK_THRESHOLD = 2;

/** Outbound-link count above which a page is counted as linking excessively. */
export const EXCESSIVE_INTERNAL_LINK_THRESHOLD = 150;

/** External-link ratio above which a page is counted as having a high external ratio. */
export const HIGH_EXTERNAL_LINK_RATIO_THRESHOLD = 0.6;

/** PageRank value at or above which a page is treated as high-authority for opportunity counters. */
export const OPPORTUNITY_AUTHORITY_THRESHOLD = 0.8;
13
-
14
/**
 * Maximum number of points each issue category may subtract from the
 * 100-point health score.
 */
export interface HealthScoreWeights {
  orphans: number;
  brokenLinks: number;
  redirectChains: number;
  duplicateClusters: number;
  thinContent: number;
  missingH1: number;
  noindexMisuse: number;
  canonicalConflicts: number;
  lowInternalLinks: number;
  excessiveLinks: number;
  blockedByRobots: number;
}

/** Weights applied by `calculateHealthScore` when the caller supplies none. */
export const DEFAULT_HEALTH_WEIGHTS: HealthScoreWeights = {
  orphans: 50,
  brokenLinks: 100,
  redirectChains: 20,
  duplicateClusters: 25,
  thinContent: 15,
  missingH1: 10,
  noindexMisuse: 20,
  canonicalConflicts: 10,
  lowInternalLinks: 10,
  excessiveLinks: 5,
  blockedByRobots: 100
};
41
-
42
/** Per-category issue tallies for one crawl, as produced by `collectCrawlIssues`. */
export interface CrawlIssueCounts {
  // Structural and indexability issues
  orphanPages: number;
  brokenInternalLinks: number;
  redirectChains: number;
  duplicateClusters: number;
  canonicalConflicts: number;
  accidentalNoindex: number;

  // On-page content issues
  missingH1: number;
  thinContent: number;

  // Linking issues
  lowInternalLinkCount: number;
  excessiveInternalLinkCount: number;
  highExternalLinkRatio: number;
  imageAltMissing: number;

  // Opportunity counters
  strongPagesUnderLinking: number;
  cannibalizationClusters: number;
  nearAuthorityThreshold: number;
  underlinkedHighAuthorityPages: number;

  // Totals
  externalLinks: number;
  blockedByRobots: number;
}
62
-
63
/** Result of `calculateHealthScore`. */
export interface HealthScoreBreakdown {
  /** Final score in the range 0-100, rounded to one decimal place. */
  score: number;
  /** Label derived from the score by `healthStatusLabel`. */
  status: string;
  /** Points deducted per category, keyed like the weights. */
  weightedPenalties: Record<keyof HealthScoreWeights, number>;
  /** The weights that were used for this calculation. */
  weights: HealthScoreWeights;
}
69
-
70
/** Restricts `value` to the inclusive range [`min`, `max`]. */
function clamp(value: number, min: number, max: number): number {
  const atLeastMin = Math.max(min, value);
  return Math.min(max, atLeastMin);
}
73
-
74
/**
 * Maps a health score to a status label.
 *
 * @param score Health score (expected 0-100).
 * @param hasCritical When true, a score of 75 or more is capped at
 *   'Needs Attention' instead of 'Good' / 'Excellent'.
 * @returns 'Excellent' (>= 90), 'Good' (>= 75), 'Needs Attention' (>= 50) or 'Critical'.
 */
export function healthStatusLabel(score: number, hasCritical: boolean = false): string {
  if (score >= 75) {
    if (hasCritical) {
      return 'Needs Attention';
    }
    return score >= 90 ? 'Excellent' : 'Good';
  }
  return score >= 50 ? 'Needs Attention' : 'Critical';
}
81
-
82
/**
 * Computes the crawl health score.
 *
 * Each category's penalty is `(issueCount / pages) * weight`, limited to the
 * range [0, weight], where `pages` is `totalPages` but never less than 1. The
 * score is 100 minus the sum of penalties, limited to [0, 100] and rounded to
 * one decimal place.
 *
 * @param totalPages Number of pages the issue counts relate to.
 * @param issues Issue counts for the penalised categories.
 * @param weights Maximum penalty per category; defaults to `DEFAULT_HEALTH_WEIGHTS`.
 * @returns Score, status label, per-category penalties and the weights used.
 */
export function calculateHealthScore(
  totalPages: number,
  issues: Pick<CrawlIssueCounts, 'orphanPages' | 'brokenInternalLinks' | 'redirectChains' | 'duplicateClusters' | 'thinContent' | 'missingH1' | 'accidentalNoindex' | 'canonicalConflicts' | 'lowInternalLinkCount' | 'excessiveInternalLinkCount' | 'blockedByRobots'>,
  weights: HealthScoreWeights = DEFAULT_HEALTH_WEIGHTS
): HealthScoreBreakdown {
  // Guard against division by zero for empty crawls.
  const pageCount = Math.max(totalPages, 1);

  const penaltyFor = (issueCount: number, weight: number): number =>
    clamp((issueCount / pageCount) * weight, 0, weight);

  const weightedPenalties = {
    orphans: penaltyFor(issues.orphanPages, weights.orphans),
    brokenLinks: penaltyFor(issues.brokenInternalLinks, weights.brokenLinks),
    redirectChains: penaltyFor(issues.redirectChains, weights.redirectChains),
    duplicateClusters: penaltyFor(issues.duplicateClusters, weights.duplicateClusters),
    thinContent: penaltyFor(issues.thinContent, weights.thinContent),
    missingH1: penaltyFor(issues.missingH1, weights.missingH1),
    noindexMisuse: penaltyFor(issues.accidentalNoindex, weights.noindexMisuse),
    canonicalConflicts: penaltyFor(issues.canonicalConflicts, weights.canonicalConflicts),
    lowInternalLinks: penaltyFor(issues.lowInternalLinkCount, weights.lowInternalLinks),
    excessiveLinks: penaltyFor(issues.excessiveInternalLinkCount, weights.excessiveLinks),
    blockedByRobots: penaltyFor(issues.blockedByRobots, weights.blockedByRobots)
  };

  let totalPenalty = 0;
  for (const penalty of Object.values(weightedPenalties)) {
    totalPenalty += penalty;
  }
  const score = Number(clamp(100 - totalPenalty, 0, 100).toFixed(1));

  // Any occurrence in these categories caps the status label at 'Needs Attention'.
  const hasCritical = [
    issues.orphanPages,
    issues.brokenInternalLinks,
    issues.redirectChains,
    issues.duplicateClusters,
    issues.canonicalConflicts,
    issues.accidentalNoindex,
    issues.blockedByRobots
  ].some((count) => count > 0);

  return {
    score,
    status: healthStatusLabel(score, hasCritical),
    weightedPenalties,
    weights
  };
}
123
-
124
/**
 * Walks every node of the graph and tallies crawl issues by category.
 *
 * Graph-level checks (status, redirects, canonical, noindex, link counts) run
 * for every node; HTML-based checks (H1, word count, external links, image alt)
 * run only for nodes that carry `html`.
 *
 * @param graph Crawl graph; `getNodes()`, `nodes.get(url)` and `duplicateClusters` are read.
 * @param metrics Graph metrics; only `orphanPages.length` is used.
 * @returns Counts for every issue category.
 */
export function collectCrawlIssues(graph: Graph, metrics: Metrics): CrawlIssueCounts {
  const nodes = graph.getNodes();

  // A page counts as a confirmed failure when it returned HTTP >= 400, or has
  // status 0 together with a network/retry/fetch error state or a security error.
  const isConfirmedError = (page: (typeof nodes)[number]): boolean => {
    if (page.status >= 400) return true;
    if (page.status !== 0) return false;
    return (
      page.crawlStatus === 'network_error' ||
      page.crawlStatus === 'failed_after_retries' ||
      Boolean(page.securityError) ||
      page.crawlStatus === 'fetched_error'
    );
  };

  const counts: CrawlIssueCounts = {
    orphanPages: 0,
    brokenInternalLinks: 0,
    redirectChains: 0,
    duplicateClusters: 0,
    canonicalConflicts: 0,
    accidentalNoindex: 0,
    missingH1: 0,
    thinContent: 0,
    lowInternalLinkCount: 0,
    excessiveInternalLinkCount: 0,
    highExternalLinkRatio: 0,
    imageAltMissing: 0,
    strongPagesUnderLinking: 0,
    cannibalizationClusters: 0,
    nearAuthorityThreshold: 0,
    underlinkedHighAuthorityPages: 0,
    externalLinks: 0,
    blockedByRobots: 0
  };

  for (const node of nodes) {
    if (node.crawlStatus === 'blocked' || node.crawlStatus === 'blocked_by_robots') {
      counts.blockedByRobots++;
    }

    // The failing page itself is counted once...
    if (isConfirmedError(node)) {
      counts.brokenInternalLinks++;
    }

    // ...and so is every recorded broken link whose target is a confirmed failure.
    if (node.brokenLinks) {
      for (const linkedUrl of node.brokenLinks) {
        const target = graph.nodes.get(linkedUrl);
        if (target && isConfirmedError(target)) {
          counts.brokenInternalLinks++;
        }
      }
    }

    if ((node.redirectChain?.length || 0) > 1) {
      counts.redirectChains++;
    }
    if (node.canonical && node.canonical !== node.url) {
      counts.canonicalConflicts++;
    }
    if (node.noindex && node.status >= 200 && node.status < 300) {
      counts.accidentalNoindex++;
    }

    // The root (depth 0) is exempt from the low-inbound check.
    if (node.inLinks < LOW_INTERNAL_LINK_THRESHOLD && node.depth > 0) {
      counts.lowInternalLinkCount++;
    }
    if (node.outLinks > EXCESSIVE_INTERNAL_LINK_THRESHOLD) {
      counts.excessiveInternalLinkCount++;
    }

    // Everything below needs the page markup.
    if (!node.html) {
      continue;
    }

    if (analyzeH1(node.html, '').count === 0) {
      counts.missingH1++;
    }

    if (analyzeContent(node.html).wordCount < THIN_CONTENT_THRESHOLD) {
      counts.thinContent++;
    }

    const linkStats = analyzeLinks(node.html, node.url, node.url);
    counts.externalLinks += linkStats.externalLinks;
    if (linkStats.externalRatio > HIGH_EXTERNAL_LINK_RATIO_THRESHOLD) {
      counts.highExternalLinkRatio++;
    }

    if (analyzeImageAlts(node.html).missingAlt > 0) {
      counts.imageAltMissing++;
    }
  }

  counts.duplicateClusters = graph.duplicateClusters?.length || 0;
  // Clusters of type 'near' are reported as cannibalization clusters.
  counts.cannibalizationClusters = graph.duplicateClusters?.filter((cluster) => cluster.type === 'near').length || 0;

  for (const node of nodes) {
    const authority = node.pageRank || 0;
    const isHighAuthority = authority >= OPPORTUNITY_AUTHORITY_THRESHOLD;

    if (isHighAuthority && node.outLinks < 3) {
      counts.strongPagesUnderLinking++;
    }
    if (authority >= 0.65 && !isHighAuthority) {
      counts.nearAuthorityThreshold++;
    }
    if (isHighAuthority && node.inLinks < LOW_INTERNAL_LINK_THRESHOLD) {
      counts.underlinkedHighAuthorityPages++;
    }
  }

  counts.orphanPages = metrics.orphanPages.length;
  return counts;
}
@@ -1,153 +0,0 @@
1
- import { Graph, GraphNode } from '../graph/graph.js';
2
-
3
/** Options for `computeHITS`. */
export interface HITSOptions {
  /** Number of update rounds; a missing or falsy value (including 0) means 20. */
  iterations?: number;
}

/**
 * Runs the HITS algorithm over the link graph and stores the results on the
 * nodes (`authorityScore`, `hubScore`, and `linkRole` via `classifyLinkRoles`).
 *
 * Only nodes with status 200 or 0, without a redirect chain and without
 * `noindex`, take part; edges touching any other node, and self-links, are
 * ignored. Scores are L2-normalised after every update. Nodes that do not take
 * part are left untouched.
 */
export function computeHITS(graph: Graph, options: HITSOptions = {}): void {
  const rounds = options.iterations || 20;

  // 1. Select the participating nodes.
  const participants = graph.getNodes().filter((node) => {
    const fetchedOrDiscovered = node.status === 200 || node.status === 0;
    const hasRedirects = Boolean(node.redirectChain && node.redirectChain.length !== 0);
    return fetchedOrDiscovered && !hasRedirects && !node.noindex;
  });

  const size = participants.length;
  if (size === 0) return;

  // URL -> position in `participants`, for constant-time edge resolution.
  const positionByUrl = new Map<string, number>();
  participants.forEach((node, position) => {
    positionByUrl.set(node.url, position);
  });

  // Index-based adjacency lists: incoming[i] holds the links pointing at node i,
  // outgoing[i] the links leaving node i.
  const incoming: { sourceIndex: number, weight: number }[][] = Array.from({ length: size }, () => []);
  const outgoing: { targetIndex: number, weight: number }[][] = Array.from({ length: size }, () => []);

  for (const edge of graph.getEdges()) {
    if (edge.source === edge.target) continue;

    const sourceIndex = positionByUrl.get(edge.source);
    const targetIndex = positionByUrl.get(edge.target);
    if (sourceIndex === undefined || targetIndex === undefined) continue;

    incoming[targetIndex].push({ sourceIndex, weight: edge.weight });
    outgoing[sourceIndex].push({ targetIndex, weight: edge.weight });
  }

  const authScores = new Float64Array(size).fill(1.0);
  const hubScores = new Float64Array(size).fill(1.0);

  // 2. Alternate authority and hub updates.
  for (let round = 0; round < rounds; round++) {
    // Authority of a node = weighted sum of the hub scores linking to it.
    let authSquares = 0;
    for (let i = 0; i < size; i++) {
      let total = 0;
      for (const link of incoming[i]) {
        total += hubScores[link.sourceIndex] * link.weight;
      }
      authScores[i] = total;
      authSquares += total * total;
    }

    const authNorm = Math.sqrt(authSquares);
    if (authNorm > 0) {
      for (let i = 0; i < size; i++) {
        authScores[i] /= authNorm;
      }
    }

    // Hub score of a node = weighted sum of the (fresh) authorities it links to.
    let hubSquares = 0;
    for (let i = 0; i < size; i++) {
      let total = 0;
      for (const link of outgoing[i]) {
        total += authScores[link.targetIndex] * link.weight;
      }
      hubScores[i] = total;
      hubSquares += total * total;
    }

    const hubNorm = Math.sqrt(hubSquares);
    if (hubNorm > 0) {
      for (let i = 0; i < size; i++) {
        hubScores[i] /= hubNorm;
      }
    }
  }

  // 3. Write the scores back onto the nodes.
  participants.forEach((node, position) => {
    node.authorityScore = authScores[position];
    node.hubScore = hubScores[position];
  });

  // 4. Derive the link role of every participating node.
  classifyLinkRoles(participants);
}
111
-
112
/**
 * Sets `linkRole` on every node from its authority and hub scores.
 *
 * A score is "high" when it exceeds 0.00001 and is either above the median of
 * the given nodes or equal to their maximum (so that uniform, non-zero
 * distributions still yield high nodes). The median is the element at index
 * `floor(n / 2)` of the ascending scores.
 *
 * Roles: 'power' (both high), 'authority', 'hub', 'balanced' (neither high but
 * both above 0.00001), otherwise 'peripheral'.
 */
function classifyLinkRoles(nodes: GraphNode[]): void {
  if (nodes.length === 0) return;

  const NEGLIGIBLE = 0.00001;
  const ascending = (a: number, b: number): number => a - b;

  const sortedAuth = nodes.map((n) => n.authorityScore || 0).sort(ascending);
  const sortedHub = nodes.map((n) => n.hubScore || 0).sort(ascending);

  const medianAuth = sortedAuth[Math.floor(sortedAuth.length / 2)];
  const medianHub = sortedHub[Math.floor(sortedHub.length / 2)];
  const maxAuth = sortedAuth[sortedAuth.length - 1];
  const maxHub = sortedHub[sortedHub.length - 1];

  const isHigh = (score: number, median: number, max: number): boolean =>
    (score > median || (score === max && score > 0)) && score > NEGLIGIBLE;

  for (const node of nodes) {
    const auth = node.authorityScore || 0;
    const hub = node.hubScore || 0;

    const highAuth = isHigh(auth, medianAuth, maxAuth);
    const highHub = isHigh(hub, medianHub, maxHub);

    if (highAuth) {
      node.linkRole = highHub ? 'power' : 'authority';
      continue;
    }
    if (highHub) {
      node.linkRole = 'hub';
      continue;
    }
    node.linkRole = auth > NEGLIGIBLE && hub > NEGLIGIBLE ? 'balanced' : 'peripheral';
  }
}
@@ -1,176 +0,0 @@
1
/** Orphan classification assigned by `annotateOrphans`. */
export type OrphanType =
  | 'hard'
  | 'near'
  | 'soft'
  | 'crawl-only';

/** Severity band derived from an orphan severity score by `mapImpactLevel`. */
export type ImpactLevel =
  | 'low'
  | 'medium'
  | 'high'
  | 'critical';

/** Page record consumed by the orphan analysis. */
export interface CrawlNode {
  url: string;
  depth: number;
  inLinks: number;
  outLinks: number;
  status: number;
  /** Pages with no inbound links are typed 'crawl-only' instead of 'hard' when set. */
  discoveredViaSitemap?: boolean;
  /** Pages with this flag are never reported as orphans. */
  robotsExcluded?: boolean;
  /** Inbound links are pooled per canonical URL (falls back to `url`). */
  canonicalUrl?: string;
  /** The homepage is never reported as an orphan. */
  isHomepage?: boolean;
  wordCount?: number;
  hasStructuredData?: boolean;
  pageType?: string;
  noindex?: boolean;
  duplicateContent?: boolean;
  isProductOrCommercial?: boolean;
}

/** Directed link between two page URLs. */
export interface CrawlEdge {
  source: string;
  target: string;
}

/** Switches and thresholds for `annotateOrphans`. */
export interface OrphanScoringOptions {
  /** When false, every node is returned with `orphan: false`. */
  enabled: boolean;
  /** When false, orphans are typed but receive no severity or impact level. */
  severityEnabled: boolean;
  /** When true, pages linked only from low-value pages are typed 'soft'. */
  includeSoftOrphans: boolean;
  /** Pages with at least one but at most this many inbound links are typed 'near'. */
  minInbound: number;
  /** URL treated as the homepage; without it, depth 0 identifies the homepage. */
  rootUrl?: string;
}

/** A `CrawlNode` together with the outcome of the orphan analysis. */
export type AnnotatedNode = CrawlNode & {
  orphan: boolean;
  orphanType?: OrphanType;
  orphanSeverity?: number;
  impactLevel?: ImpactLevel;
};
41
-
42
// URL shapes treated as low-value link sources: paginated listings, tag/category
// paths, query-driven search/filter/sort views and /search endpoints.
const LOW_VALUE_PATTERNS = [
  /[?&](page|p)=\d+/i,
  /\/(page|tag|tags|category|categories)\//i,
  /[?&](q|query|search|filter|sort)=/i,
  /\/search(\/|\?|$)/i
];

/**
 * Tells whether a page is a low-value link source: its `pageType`
 * (case-insensitive) is a listing/utility type, it is `noindex`, or its URL
 * matches one of `LOW_VALUE_PATTERNS`.
 */
function isLowValuePage(node: CrawlNode): boolean {
  const lowValueTypes = ['pagination', 'tag', 'category', 'filter', 'search', 'archive'];
  const normalizedType = (node.pageType || '').toLowerCase();

  if (lowValueTypes.indexOf(normalizedType) !== -1 || node.noindex) {
    return true;
  }

  for (const pattern of LOW_VALUE_PATTERNS) {
    if (pattern.test(node.url)) {
      return true;
    }
  }
  return false;
}
59
-
60
/** Rounds `score` to the nearest integer and limits it to the range 0-100. */
function clampScore(score: number): number {
  const rounded = Math.round(score);
  const capped = Math.min(100, rounded);
  return Math.max(0, capped);
}
63
-
64
/**
 * Converts an orphan severity score into an impact band:
 * up to 39 'low', up to 69 'medium', up to 89 'high', anything else 'critical'.
 */
export function mapImpactLevel(score: number): ImpactLevel {
  const bands: [number, ImpactLevel][] = [
    [39, 'low'],
    [69, 'medium'],
    [89, 'high']
  ];
  for (const [upperBound, level] of bands) {
    if (score <= upperBound) {
      return level;
    }
  }
  return 'critical';
}
70
-
71
/**
 * Scores how severe an orphan finding is, from 0 to 100.
 *
 * Base score by type: hard 90, crawl-only 80, near 70 (at most one inbound
 * link) or 60, soft 50. Value signals add 10 points each, capped at +20 in
 * total: more than 800 words, structured data, depth of 2 or less, commercial
 * page. Any low-value signal subtracts a flat 20: 1-299 words, noindex,
 * duplicate content, or an 'archive' / 'pagination' page type.
 *
 * @param orphanType Classification of the orphan.
 * @param node Page being scored; `inLinks` is only consulted for 'near'.
 * @returns Rounded score limited to 0-100.
 */
export function calculateOrphanSeverity(orphanType: OrphanType, node: CrawlNode): number {
  let base = 0;
  if (orphanType === 'hard') {
    base = 90;
  } else if (orphanType === 'crawl-only') {
    base = 80;
  } else if (orphanType === 'near') {
    base = node.inLinks <= 1 ? 70 : 60;
  } else if (orphanType === 'soft') {
    base = 50;
  }

  const words = node.wordCount || 0;
  const pageType = (node.pageType || '').toLowerCase();

  const valueSignals = [
    words > 800,
    Boolean(node.hasStructuredData),
    node.depth <= 2,
    Boolean(node.isProductOrCommercial)
  ].filter(Boolean).length;
  const bonus = Math.min(20, valueSignals * 10);

  // Each signal is worth 20 and the total is capped at 20, so one is enough.
  const hasLowValueSignal =
    (words > 0 && words < 300) ||
    Boolean(node.noindex) ||
    Boolean(node.duplicateContent) ||
    pageType === 'archive' ||
    pageType === 'pagination';
  const penalty = hasLowValueSignal ? 20 : 0;

  return clampScore(base + bonus - penalty);
}
108
-
109
/**
 * Sums `inLinks` per canonical URL, using a node's own `url` when it declares
 * no `canonicalUrl`, so that duplicates share one inbound total.
 */
function consolidateInboundByCanonical(nodes: CrawlNode[]): Map<string, number> {
  return nodes.reduce((totals, { url, canonicalUrl, inLinks }) => {
    const key = canonicalUrl || url;
    return totals.set(key, (totals.get(key) || 0) + inLinks);
  }, new Map<string, number>());
}
117
-
118
/**
 * Marks every node as orphan or not and, for orphans, attaches the type and
 * (optionally) a severity score with its impact level.
 *
 * Classification uses the inbound total pooled per canonical URL:
 * - 0 inbound: 'crawl-only' when discovered via sitemap, otherwise 'hard';
 * - at most `options.minInbound` inbound: 'near';
 * - otherwise, when `options.includeSoftOrphans` is set and every known page
 *   linking to the node is a low-value page: 'soft'.
 *
 * The homepage (`isHomepage`, or `options.rootUrl`, or depth 0 when no root URL
 * is given) and robots-excluded pages are never orphans.
 *
 * @param nodes Pages to annotate; the input objects are not modified.
 * @param edges Links between pages, used for the soft-orphan check only.
 * @param options Feature switches and thresholds.
 * @returns One annotated copy per input node, in the same order.
 */
export function annotateOrphans(nodes: CrawlNode[], edges: CrawlEdge[], options: OrphanScoringOptions): AnnotatedNode[] {
  if (!options.enabled) {
    return nodes.map((node) => ({ ...node, orphan: false }));
  }

  const canonicalInbound = consolidateInboundByCanonical(nodes);

  const nodeByUrl = new Map<string, CrawlNode>();
  for (const node of nodes) {
    nodeByUrl.set(node.url, node);
  }

  // Target URL -> known pages linking to it. Built once, on first use, so the
  // soft-orphan check does not rescan the full edge list for every node.
  let sourcesByTarget: Map<string, CrawlNode[]> | undefined;
  const inboundSourcesOf = (targetUrl: string): CrawlNode[] => {
    let index = sourcesByTarget;
    if (index === undefined) {
      index = new Map<string, CrawlNode[]>();
      for (const edge of edges) {
        const source = nodeByUrl.get(edge.source);
        if (!source) continue; // links from pages outside `nodes` are ignored
        const bucket = index.get(edge.target);
        if (bucket) {
          bucket.push(source);
        } else {
          index.set(edge.target, [source]);
        }
      }
      sourcesByTarget = index;
    }
    return index.get(targetUrl) ?? [];
  };

  return nodes.map((node) => {
    const isHomepage = node.isHomepage || (options.rootUrl ? node.url === options.rootUrl : node.depth === 0);
    if (isHomepage || node.robotsExcluded) {
      return { ...node, orphan: false };
    }

    const canonical = node.canonicalUrl || node.url;
    const inbound = canonicalInbound.get(canonical) || 0;

    let orphanType: OrphanType | undefined;

    if (inbound === 0) {
      orphanType = node.discoveredViaSitemap ? 'crawl-only' : 'hard';
    } else if (inbound <= options.minInbound) {
      orphanType = 'near';
    }

    if (!orphanType && options.includeSoftOrphans && inbound > 0) {
      const inboundSources = inboundSourcesOf(node.url);
      if (inboundSources.length > 0 && inboundSources.every((source) => isLowValuePage(source))) {
        orphanType = 'soft';
      }
    }

    if (!orphanType) {
      return { ...node, orphan: false };
    }

    if (!options.severityEnabled) {
      return {
        ...node,
        orphan: true,
        orphanType
      };
    }

    // Severity is scored against the pooled (canonical) inbound total.
    const orphanSeverity = calculateOrphanSeverity(orphanType, { ...node, inLinks: inbound });

    return {
      ...node,
      orphan: true,
      orphanType,
      orphanSeverity,
      impactLevel: mapImpactLevel(orphanSeverity)
    };
  });
}
@@ -1,18 +0,0 @@
1
- import { readFileSync } from 'node:fs';
2
- import { fileURLToPath } from 'node:url';
3
- import { dirname, join } from 'node:path';
4
-
5
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Default reported when package.json cannot be read or declares no usable version.
let version = '0.0.1';

try {
  // package.json is expected two directories above this module.
  const pkgPath = join(__dirname, '../../package.json');
  const pkg: unknown = JSON.parse(readFileSync(pkgPath, 'utf-8'));
  const declared =
    typeof pkg === 'object' && pkg !== null ? (pkg as { version?: unknown }).version : undefined;

  // Only a non-empty string may replace the default; otherwise a missing or
  // malformed "version" field would export `undefined`.
  if (typeof declared === 'string' && declared.length > 0) {
    version = declared;
  }
} catch {
  // Fallback to internal default
}

/** Package version read from package.json, or '0.0.1' when it is unavailable. */
export { version };