@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,111 +0,0 @@
1
/**
 * Runs the HITS (hub/authority) iteration over the link graph and stores the
 * results on the graph's node objects.
 *
 * Only nodes with status 200, an empty or missing `redirectChain`, and a falsy
 * `noindex` take part. Edges are used only when both endpoints are such nodes
 * and the edge is not a self-link. Each participating node gets
 * `authorityScore`, `hubScore` and (through `classifyLinkRoles`) `linkRole`
 * written onto it; nothing is returned.
 *
 * @param graph   Object exposing `getNodes()` and `getEdges()`.
 * @param options Optional settings; `iterations` defaults to 20 (a falsy value,
 *                including 0, also yields 20).
 */
export function computeHITS(graph, options = {}) {
    const rounds = options.iterations || 20;
    const candidates = graph.getNodes().filter((n) => {
        const wasRedirected = Boolean(n.redirectChain) && n.redirectChain.length !== 0;
        return n.status === 200 && !wasRedirected && !n.noindex;
    });
    if (candidates.length === 0) {
        return;
    }
    // Index the participating nodes by URL and seed both scores with 1.
    const byUrl = new Map();
    for (const n of candidates) {
        byUrl.set(n.url, n);
        n.authorityScore = 1.0;
        n.hubScore = 1.0;
    }
    // Adjacency lists restricted to links between participating nodes.
    const inbound = new Map();
    const outbound = new Map();
    const append = (index, key, entry) => {
        const bucket = index.get(key);
        if (bucket) {
            bucket.push(entry);
        }
        else {
            index.set(key, [entry]);
        }
    };
    for (const edge of graph.getEdges()) {
        const usable = edge.source !== edge.target &&
            byUrl.has(edge.source) &&
            byUrl.has(edge.target);
        if (!usable) {
            continue;
        }
        append(inbound, edge.target, { source: edge.source, weight: edge.weight });
        append(outbound, edge.source, { target: edge.target, weight: edge.weight });
    }
    for (let round = 0; round < rounds; round++) {
        // Authority = weighted sum of the hub scores of pages linking in.
        let authSquares = 0;
        for (const n of candidates) {
            let total = 0;
            for (const link of inbound.get(n.url) || []) {
                total += (byUrl.get(link.source).hubScore || 0) * link.weight;
            }
            n.authorityScore = total;
            authSquares += total * total;
        }
        // L2-normalise; skipped when every authority score is zero.
        const authNorm = Math.sqrt(authSquares);
        if (authNorm > 0) {
            for (const n of candidates) {
                n.authorityScore = (n.authorityScore || 0) / authNorm;
            }
        }
        // Hub = weighted sum of the (already normalised) authority scores linked to.
        let hubSquares = 0;
        for (const n of candidates) {
            let total = 0;
            for (const link of outbound.get(n.url) || []) {
                total += (byUrl.get(link.target).authorityScore || 0) * link.weight;
            }
            n.hubScore = total;
            hubSquares += total * total;
        }
        const hubNorm = Math.sqrt(hubSquares);
        if (hubNorm > 0) {
            for (const n of candidates) {
                n.hubScore = (n.hubScore || 0) / hubNorm;
            }
        }
    }
    classifyLinkRoles(candidates);
}
82
- function classifyLinkRoles(nodes) {
83
- if (nodes.length === 0)
84
- return;
85
- const authScores = nodes.map(n => n.authorityScore || 0).sort((a, b) => a - b);
86
- const hubScores = nodes.map(n => n.hubScore || 0).sort((a, b) => a - b);
87
- // Use 75th percentile as "high" threshold
88
- const medianAuth = authScores[Math.floor(authScores.length / 2)];
89
- const medianHub = hubScores[Math.floor(hubScores.length / 2)];
90
- for (const node of nodes) {
91
- const auth = node.authorityScore || 0;
92
- const hub = node.hubScore || 0;
93
- const isHighAuth = auth > medianAuth && auth > 0.0001;
94
- const isHighHub = hub > medianHub && hub > 0.0001;
95
- if (isHighAuth && isHighHub) {
96
- node.linkRole = 'power';
97
- }
98
- else if (isHighAuth) {
99
- node.linkRole = 'authority';
100
- }
101
- else if (isHighHub) {
102
- node.linkRole = 'hub';
103
- }
104
- else if (auth > 0.0001 && hub > 0.0001) {
105
- node.linkRole = 'balanced';
106
- }
107
- else {
108
- node.linkRole = 'peripheral';
109
- }
110
- }
111
- }
@@ -1,548 +0,0 @@
1
- import fs from 'node:fs/promises';
2
- import { crawl } from '../crawler/crawl.js';
3
- import { loadGraphFromSnapshot } from '../db/graphLoader.js';
4
- import { normalizeUrl } from '../crawler/normalize.js';
5
- import { calculateMetrics, Metrics } from '../graph/metrics.js';
6
- import { Graph, ClusterInfo } from '../graph/graph.js';
7
- import { analyzeContent, calculateThinContentScore } from './content.js';
8
- import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses, H1Analysis, TextFieldAnalysis } from './seo.js';
9
- import { analyzeImageAlts, ImageAltAnalysis } from './images.js';
10
- import { analyzeLinks, LinkRatioAnalysis } from './links.js';
11
- import { analyzeStructuredData, StructuredDataResult } from './structuredData.js';
12
- import { aggregateSiteScore, scorePageSeo } from './scoring.js';
13
- import { detectContentClusters } from '../graph/cluster.js';
14
- import { getDb } from '../db/index.js';
15
- import { SiteRepository } from '../db/repositories/SiteRepository.js';
16
- import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
17
- import { PageRepository } from '../db/repositories/PageRepository.js';
18
-
19
/**
 * One crawled page as fed into the analysis functions of this module.
 * Only `url` is mandatory.
 */
export interface CrawlPage {
  url: string;
  /** HTTP status; the analysis code substitutes 0 when it is absent. */
  status?: number;
  /** Raw markup; the analysis code substitutes an empty string when it is absent. */
  html?: string;
  depth?: number;
  canonical?: string;
  noindex?: boolean;
  nofollow?: boolean;
}
28
-
29
/** Options accepted by `analyzeSite`. Every field is optional. */
export interface AnalyzeOptions {
  /** Path of a crawl JSON file to read instead of querying the database. */
  fromCrawl?: string;
  /** When true, a fresh single-page crawl is run instead of loading stored data. */
  live?: boolean;
  html?: boolean;

  // Module switches: when at least one is set, page results are filtered so
  // that sections belonging to unselected modules are blanked out.
  seo?: boolean;
  content?: boolean;
  accessibility?: boolean;

  // Forwarded unchanged to the crawler for live crawls.
  rate?: number;
  proxyUrl?: string;
  userAgent?: string;
  maxRedirects?: number;
  debug?: boolean;

  // Forwarded unchanged to content-cluster detection.
  clusterThreshold?: number;
  minClusterSize?: number;
}
44
-
45
/** Complete per-page analysis record produced by this module. */
export interface PageAnalysis {
  url: string;
  /** HTTP status; 0 is used when the crawl recorded none. */
  status: number;

  // Individual analyser outputs.
  title: TextFieldAnalysis;
  metaDescription: TextFieldAnalysis;
  h1: H1Analysis;
  content: ReturnType<typeof analyzeContent>;
  images: ImageAltAnalysis;
  links: LinkRatioAnalysis;
  structuredData: StructuredDataResult;

  // Derived scores.
  thinScore: number;
  seoScore: number;

  /** Indexing directives copied from the crawled page. */
  meta: {
    canonical?: string;
    noindex?: boolean;
    nofollow?: boolean;
  };
}
63
-
64
/** Value returned by `analyzeSite` and consumed by the `renderAnalysis*` functions. */
export interface AnalysisResult {
  /** Headline numbers computed over all analysed pages. */
  site_summary: {
    pages_analyzed: number;
    avg_seo_score: number;
    thin_pages: number;
    duplicate_titles: number;
    site_score: number;
  };
  site_scores: ReturnType<typeof aggregateSiteScore>;
  /** Pages included in the report (may be a subset of the analysed pages). */
  pages: PageAnalysis[];
  /** Which module switches were set on the request. */
  active_modules: {
    seo: boolean;
    content: boolean;
    accessibility: boolean;
  };
  clusters?: ClusterInfo[];
}
81
-
82
/** Bundle of crawl artefacts that the analysis step works from. */
interface CrawlData {
  /** Pages together with their HTML. */
  pages: CrawlPage[];
  /** Graph metrics computed from `graph`. */
  metrics: Metrics;
  graph: Graph;
}
87
-
88
/**
 * Analyses a URL and returns per-page results plus site-level scores.
 *
 * Data source: a live crawl when `options.live` is set; otherwise stored crawl
 * data (file given by `options.fromCrawl`, else the database). When stored data
 * is missing and no `fromCrawl` was given, it falls back to a live crawl.
 *
 * @param url     URL to analyse; it is normalised with the query string kept.
 * @param options Data-source, crawler, module-filter and clustering options.
 * @returns Summary, site scores, clusters and the page results. `pages` holds
 *          only the page matching the normalised URL when one exists; without
 *          a match it is the first page (live mode) or all pages.
 * @throws Error when the URL cannot be normalised, or when loading crawl data
 *         fails for any reason other than "not found".
 */
export async function analyzeSite(url: string, options: AnalyzeOptions): Promise<AnalysisResult> {
  const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
  if (!normalizedRoot) {
    throw new Error('Invalid URL for analysis');
  }

  let crawlData: CrawlData;
  if (options.live) {
    crawlData = await runLiveCrawl(normalizedRoot, options);
  } else {
    try {
      crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
    } catch (error: unknown) {
      // An explicit --from-crawl request must not silently turn into a live crawl.
      if (options.fromCrawl || !isMissingCrawlDataError(error)) {
        throw error;
      }
      console.log('No local crawl data found. Switching to live analysis mode...');
      crawlData = await runLiveCrawl(normalizedRoot, options);
    }
  }

  // Clustering always runs; threshold and size are passed through as given.
  detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);

  const pages = analyzePages(normalizedRoot, crawlData.pages);

  const activeModules = {
    seo: !!options.seo,
    content: !!options.content,
    accessibility: !!options.accessibility
  };
  const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
  const filteredPages = hasFilters
    ? pages.map((page) => filterPageModules(page, activeModules))
    : pages;

  // Report only the requested URL when it is among the analysed pages.
  const targetPage = filteredPages.find((p) => p.url === normalizedRoot);
  const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);

  // Site-level numbers are computed over every page, not just the reported ones.
  const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
  const thinPages = pages.filter((page) => page.thinScore >= 70).length;
  const siteScores = aggregateSiteScore(crawlData.metrics, pages);

  return {
    site_summary: {
      pages_analyzed: pages.length,
      avg_seo_score: siteScores.seoHealthScore,
      thin_pages: thinPages,
      duplicate_titles: duplicateTitles,
      site_score: siteScores.overallScore
    },
    site_scores: siteScores,
    pages: resultPages,
    active_modules: activeModules,
    clusters: crawlData.graph.contentClusters
  };
}

/**
 * Tells whether a value thrown while loading crawl data means "no data stored".
 * Safe for any thrown value, including non-Error values and errors without a message.
 */
function isMissingCrawlDataError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) {
    return false;
  }
  const { code, message } = error as { code?: unknown; message?: unknown };
  if (code === 'ENOENT') {
    return true;
  }
  if (typeof message !== 'string') {
    return false;
  }
  const markers = ['Crawl data not found', 'No completed snapshot found', 'not found in database'];
  return markers.some((marker) => message.includes(marker));
}
154
-
155
/**
 * Renders an analysis result as an HTML document.
 *
 * A result with exactly one page is rendered with the detailed single-page
 * layout; otherwise a summary table with one row per page is produced.
 *
 * @param result Analysis result to render.
 * @returns A complete HTML document as a string.
 */
export function renderAnalysisHtml(result: AnalysisResult): string {
  if (result.pages.length === 1) {
    return renderSinglePageHtml(result.pages[0]);
  }

  // Tags are written without inner spaces: "< tr >" is parsed as text, not as a row.
  const rows = result.pages
    .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
    .join('');

  return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
}
165
-
166
/**
 * Renders the detailed HTML report for a single analysed page.
 *
 * Everything that originates from the crawled page (URL, title, description,
 * canonical, structured-data type names) is HTML-escaped before interpolation.
 *
 * @param page Analysis of the page to render.
 * @returns A complete HTML document as a string.
 */
function renderSinglePageHtml(page: PageAnalysis): string {
  // The URL is also placed inside a double-quoted attribute, so quotes are
  // neutralised here regardless of what escapeHtml does with them.
  const safeUrl = escapeHtml(page.url).replace(/"/g, '&quot;');
  const statusLabel = page.status === 0 ? 'Pending/Limit' : page.status;
  const canonicalCell = page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>';

  let structuredStatus = 'Not detected';
  if (page.structuredData.present) {
    structuredStatus = page.structuredData.valid
      ? '<span class="status-ok">Valid</span>'
      : '<span class="status-critical">Invalid JSON</span>';
  }
  const structuredTypesRow = page.structuredData.present
    ? `
    <tr>
      <th>Types Found</th>
      <td>${page.structuredData.types.map((t) => `<code>${escapeHtml(String(t))}</code>`).join(', ')}</td>
    </tr>
    `
    : '';

  return `<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Analysis for ${escapeHtml(page.url)}</title>
  <style>
    body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
    h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
    h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
    .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
    .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
    .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
    .status-ok { color: green; font-weight: bold; }
    .status-warning { color: orange; font-weight: bold; }
    .status-critical { color: red; font-weight: bold; }
    .status-missing { color: red; font-weight: bold; }
    .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
    .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
    .data-table th { width: 150px; color: #666; }
    code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
  </style>
</head>
<body>
  <h1>Page Analysis</h1>
  <p><strong>URL:</strong> <a href="${safeUrl}" target="_blank">${safeUrl}</a></p>

  <div class="score-card">
    <div class="score-box">
      <div class="score-val">${page.seoScore}</div>
      <div>SEO Score</div>
    </div>
    <div class="score-box">
      <div class="score-val">${page.thinScore}</div>
      <div>Thin Content Score</div>
    </div>
    <div class="score-box">
      <div class="score-val">${statusLabel}</div>
      <div>HTTP Status</div>
    </div>
  </div>

  <h2>Meta Tags</h2>
  <table class="data-table">
    <tr>
      <th>Title</th>
      <td>
        <div>${escapeHtml(page.title.value || '(missing)')}</div>
        <small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
      </td>
    </tr>
    <tr>
      <th>Description</th>
      <td>
        <div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
        <small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
      </td>
    </tr>
    <tr>
      <th>Canonical</th>
      <td>${canonicalCell}</td>
    </tr>
    <tr>
      <th>Robots</th>
      <td>
        Index: ${!page.meta.noindex},
        Follow: ${!page.meta.nofollow}
      </td>
    </tr>
  </table>

  <h2>Content &amp; Heading</h2>
  <table class="data-table">
    <tr>
      <th>H1 Tag</th>
      <td>
        Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
        (${page.h1.count} detected)
        ${page.h1.matchesTitle ? ' | Matches Title' : ''}
      </td>
    </tr>
    <tr>
      <th>Word Count</th>
      <td>${page.content.wordCount} words</td>
    </tr>
    <tr>
      <th>Unique Sentences</th>
      <td>${page.content.uniqueSentenceCount}</td>
    </tr>
    <tr>
      <th>Text / HTML Ratio</th>
      <td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
    </tr>
  </table>

  <h2>Links &amp; Images</h2>
  <table class="data-table">
    <tr>
      <th>Internal Links</th>
      <td>${page.links.internalLinks}</td>
    </tr>
    <tr>
      <th>External Links</th>
      <td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
    </tr>
    <tr>
      <th>Images</th>
      <td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
    </tr>
  </table>

  <h2>Structured Data</h2>
  <table class="data-table">
    <tr>
      <th>Status</th>
      <td>
        ${structuredStatus}
      </td>
    </tr>
    ${structuredTypesRow}
  </table>
</body>
</html>`;
}
299
-
300
/**
 * Renders an analysis result as a Markdown report: a summary bullet list
 * followed by a table with one row per page in `result.pages`.
 *
 * @param result Analysis result to render.
 * @returns Markdown text with lines joined by "\n" (no trailing newline).
 */
export function renderAnalysisMarkdown(result: AnalysisResult): string {
  const totals = result.site_summary;

  const lines: string[] = [
    '# Crawlith SEO Analysis Report',
    '',
    '## 📊 Summary',
    `- Pages Analyzed: ${totals.pages_analyzed}`,
    `- Overall Site Score: ${totals.site_score.toFixed(1)}`,
    `- Avg SEO Score: ${totals.avg_seo_score.toFixed(1)}`,
    `- Thin Pages Found: ${totals.thin_pages}`,
    `- Duplicate Titles: ${totals.duplicate_titles}`,
    '',
    '## 📄 Page Details',
    '',
    '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
    '| :--- | :--- | :--- | :--- | :--- |',
  ];

  // Append one table row per page after the fixed header lines.
  return result.pages
    .reduce((acc, page) => {
      const cells = [page.url, page.seoScore, page.thinScore, page.title.status, page.metaDescription.status];
      acc.push(`| ${cells.join(' | ')} |`);
      return acc;
    }, lines)
    .join('\n');
}
323
-
324
/**
 * Renders an analysis result as CSV: a header line followed by one line per
 * page in `result.pages`.
 *
 * Title and meta description are always double-quoted with embedded quotes
 * doubled. The URL is quoted the same way only when it contains a comma, a
 * double quote or a line break, so ordinary URLs are emitted unchanged while
 * unusual ones can no longer shift the columns.
 *
 * @param result Analysis result to render.
 * @returns CSV text with lines joined by "\n" (no trailing newline).
 */
export function renderAnalysisCsv(result: AnalysisResult): string {
  const quote = (text: string): string => `"${text.replace(/"/g, '""')}"`;
  const quoteIfNeeded = (text: string): string => (/[",\r\n]/.test(text) ? quote(text) : text);

  const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
  const rows = result.pages.map((p) => {
    // Status 0 means the page was never fetched; label it rather than print 0.
    const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
    return [
      quoteIfNeeded(p.url),
      p.seoScore,
      p.thinScore,
      statusStr,
      quote(p.title.value || ''),
      p.title.length,
      quote(p.metaDescription.value || ''),
      p.metaDescription.length,
      p.content.wordCount,
      p.links.internalLinks,
      p.links.externalLinks
    ].join(',');
  });

  return [headers.join(','), ...rows].join('\n');
}
345
-
346
/**
 * Escapes a string for safe insertion into HTML text or a quoted attribute.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with character references. The ampersand
 * is handled first so the references produced for the other characters are not
 * escaped a second time.
 *
 * @param value Raw text.
 * @returns The escaped text.
 */
function escapeHtml(value: string): string {
  return value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
349
-
350
/**
 * Runs every per-page analyser over the crawled pages.
 *
 * Titles and meta descriptions are analysed for all pages first so duplicate
 * statuses can be applied across the set. A page's duplication score is 100
 * when at least one other page has the same unique-sentence count, else 0.
 *
 * @param rootUrl Root URL handed to the link analyser.
 * @param pages   Crawled pages; missing HTML is treated as an empty string.
 * @returns One analysis per input page, in input order.
 */
function analyzePages(rootUrl: string, pages: CrawlPage[]): PageAnalysis[] {
  const markupOf = (page: CrawlPage): string => page.html || '';

  const rawTitles = pages.map((page) => analyzeTitle(markupOf(page)));
  const rawMetas = pages.map((page) => analyzeMetaDescription(markupOf(page)));
  const titles = applyDuplicateStatuses(rawTitles);
  const metas = applyDuplicateStatuses(rawMetas);

  // Count how many pages share each unique-sentence count.
  const contents = pages.map((page) => analyzeContent(markupOf(page)));
  const pagesPerSentenceCount = new Map<number, number>();
  for (const { uniqueSentenceCount } of contents) {
    const seen = pagesPerSentenceCount.get(uniqueSentenceCount) || 0;
    pagesPerSentenceCount.set(uniqueSentenceCount, seen + 1);
  }

  const analyses: PageAnalysis[] = [];
  for (let i = 0; i < pages.length; i++) {
    const page = pages[i];
    const html = markupOf(page);
    const title = titles[i];
    const metaDescription = metas[i];
    const h1 = analyzeH1(html, title.value);
    const content = contents[i];

    const sharedBy = pagesPerSentenceCount.get(content.uniqueSentenceCount) || 0;
    const thinScore = calculateThinContentScore(content, sharedBy > 1 ? 100 : 0);
    const images = analyzeImageAlts(html);
    const links = analyzeLinks(html, page.url, rootUrl);
    const structuredData = analyzeStructuredData(html);

    // The SEO score is derived from the assembled record, so it starts at 0
    // and is filled in once the record is complete.
    const record: PageAnalysis = {
      url: page.url,
      status: page.status || 0,
      title,
      metaDescription,
      h1,
      content,
      thinScore,
      images,
      links,
      structuredData,
      seoScore: 0,
      meta: {
        canonical: page.canonical,
        noindex: page.noindex,
        nofollow: page.nofollow
      }
    };
    record.seoScore = scorePageSeo(record);
    analyses.push(record);
  }
  return analyses;
}
397
-
398
/**
 * Returns a copy of a page analysis in which the sections of unselected
 * modules are replaced by empty placeholder values.
 *
 * - seo: title, metaDescription, links, structuredData
 * - content: content, thinScore
 * - accessibility: images
 * - h1 is kept when either seo or content is selected.
 *
 * @param page    Analysis to filter; it is not modified.
 * @param modules Which modules to keep.
 * @returns A new analysis object.
 */
function filterPageModules(
  page: PageAnalysis,
  modules: { seo: boolean; content: boolean; accessibility: boolean }
): PageAnalysis {
  const { seo, content, accessibility } = modules;
  const filtered: PageAnalysis = { ...page };

  if (!seo) {
    filtered.title = { value: null, length: 0, status: 'missing' };
    filtered.metaDescription = { value: null, length: 0, status: 'missing' };
    filtered.links = { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 };
    filtered.structuredData = { present: false, valid: false, types: [] };
  }
  if (!seo && !content) {
    filtered.h1 = { count: 0, status: 'critical', matchesTitle: false };
  }
  if (!content) {
    filtered.content = { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 };
    filtered.thinScore = 0;
  }
  if (!accessibility) {
    filtered.images = { totalImages: 0, missingAlt: 0, emptyAlt: 0 };
  }
  return filtered;
}
418
-
419
/**
 * Loads previously stored crawl data for a site.
 *
 * When `fromCrawl` is given, the JSON file at that path is tried first; if
 * anything in that step throws, the function continues with the database.
 * The database path takes the latest completed snapshot of the site, loads its
 * graph, and reads the pages (with HTML) from the page repository.
 *
 * @param rootUrl   Normalised root URL of the site.
 * @param fromCrawl Optional path of a crawl JSON file.
 * @returns Pages, graph metrics and the graph.
 * @throws Error when the site has no completed snapshot in the database.
 */
async function loadCrawlData(rootUrl: string, fromCrawl?: string): Promise<CrawlData> {
  if (fromCrawl) {
    try {
      const fileText = await fs.readFile(fromCrawl, 'utf-8');
      const parsed = JSON.parse(fileText) as Record<string, unknown>;
      const filePages = parsePages(parsed);
      const fileGraph = graphFromPages(rootUrl, filePages, parsed);
      return { pages: filePages, metrics: calculateMetrics(fileGraph, 5), graph: fileGraph };
    } catch (_e) {
      // Fall through to the database.
      // NOTE(review): this also hides malformed JSON and other failures, not
      // only a missing file — confirm that is intended.
    }
  }

  const db = getDb();
  const siteRepo = new SiteRepository(db);
  const snapshotRepo = new SnapshotRepository(db);
  const pageRepo = new PageRepository(db);

  // NOTE(review): replace('www.', '') removes the first "www." anywhere in the
  // host, not just a leading one. It is left as-is because the key must match
  // whatever the code that stores sites uses — verify before tightening.
  const domain = new URL(rootUrl).hostname.replace('www.', '');
  const site = siteRepo.firstOrCreateSite(domain);

  const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
  if (!snapshot) {
    throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
  }

  const graph = loadGraphFromSnapshot(snapshot.id);
  const metrics = calculateMetrics(graph, 5);

  // The analysers need raw HTML, which is read here from the page repository
  // rather than from the graph nodes.
  const pages: CrawlPage[] = [];
  for (const row of pageRepo.getPagesBySnapshot(snapshot.id) as any[]) {
    pages.push({
      url: row.normalized_url,
      status: row.http_status || 0,
      html: row.html || '',
      depth: row.depth || 0
    });
  }

  return { pages, metrics, graph };
}
469
-
470
/**
 * Extracts page records from parsed crawl JSON.
 *
 * Reads `raw.pages` when it is an array, otherwise `raw.nodes` when that is an
 * array, otherwise yields nothing. Each entry is coerced to
 * `{ url, status, html, depth }`; entries whose URL is empty are dropped.
 *
 * @param raw Parsed crawl JSON object.
 * @returns The coerced pages, in input order.
 */
function parsePages(raw: Record<string, unknown>): CrawlPage[] {
  let entries: unknown[];
  if (Array.isArray(raw.pages)) {
    entries = raw.pages;
  } else if (Array.isArray(raw.nodes)) {
    entries = raw.nodes;
  } else {
    return [];
  }

  const toPage = (entry: unknown) => {
    const record = entry as Record<string, unknown>;
    return {
      url: String(record.url || ''),
      status: Number(record.status || 0),
      // Anything that is not a string is treated as "no HTML".
      html: typeof record.html === 'string' ? record.html : '',
      depth: Number(record.depth || 0)
    };
  };

  return entries.map(toPage).filter((page) => page.url !== '');
}
497
-
498
/**
 * Builds a graph from pages loaded out of a crawl JSON file.
 *
 * Every page becomes a node. When `raw.edges` is an array, each entry with
 * string `source` and `target` adds both endpoints as nodes and an edge
 * between them; other entries are skipped. Without `raw.edges` the graph has
 * nodes only.
 *
 * @param rootUrl Root URL of the site. Currently unused: edges cannot be
 *                reconstructed from page HTML here, so no per-page link
 *                analysis is performed.
 * @param pages   Pages to add as nodes.
 * @param raw     Parsed crawl JSON, consulted for `edges`.
 * @returns The populated graph.
 */
function graphFromPages(rootUrl: string, pages: CrawlPage[], raw: Record<string, unknown>): Graph {
  // Kept in the signature for existing callers.
  void rootUrl;

  const graph = new Graph();
  for (const page of pages) {
    graph.addNode(page.url, page.depth || 0, page.status || 0);
  }

  if (Array.isArray(raw.edges)) {
    for (const edge of raw.edges) {
      // Skip entries that are not objects (e.g. null) instead of throwing.
      if (typeof edge !== 'object' || edge === null) continue;
      const { source, target } = edge as Record<string, unknown>;
      if (typeof source === 'string' && typeof target === 'string') {
        graph.addNode(source, 0, 0);
        graph.addNode(target, 0, 0);
        graph.addEdge(source, target);
      }
    }
  }

  return graph;
}
525
-
526
/**
 * Crawls exactly one page (limit 1, depth 0) and packages the outcome for
 * analysis.
 *
 * @param url     URL to crawl.
 * @param options Supplies rate, proxy, user agent, redirect limit and debug
 *                flag, which are forwarded to the crawler.
 * @returns Pages built from the snapshot's graph nodes, metrics and the graph.
 */
async function runLiveCrawl(url: string, options: AnalyzeOptions): Promise<CrawlData> {
  const { rate, proxyUrl, userAgent, maxRedirects, debug } = options;
  const snapshotId = await crawl(url, {
    limit: 1,
    depth: 0,
    rate,
    proxyUrl,
    userAgent,
    maxRedirects,
    debug
  });

  const graph = loadGraphFromSnapshot(snapshotId);

  // NOTE(review): pages take their HTML from the graph nodes here; if nodes
  // loaded from a snapshot carry no HTML, analysis runs on empty markup — confirm.
  const pages = graph.getNodes().map((node) => {
    const html = node.html || '';
    return { url: node.url, status: node.status, html, depth: node.depth };
  });
  const metrics = calculateMetrics(graph, 1);

  return { pages, metrics, graph };
}