@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,6 +1,9 @@
1
+ import { load } from 'cheerio';
1
2
  import { crawl } from '../crawler/crawl.js';
3
+ import { UrlResolver } from '../crawler/resolver.js';
4
+ import { Fetcher } from '../crawler/fetcher.js';
2
5
  import { loadGraphFromSnapshot } from '../db/graphLoader.js';
3
- import { normalizeUrl } from '../crawler/normalize.js';
6
+ import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
4
7
  import { calculateMetrics } from '../graph/metrics.js';
5
8
  import { analyzeContent, calculateThinContentScore } from './content.js';
6
9
  import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
@@ -8,95 +11,135 @@ import { analyzeImageAlts } from './images.js';
8
11
  import { analyzeLinks } from './links.js';
9
12
  import { analyzeStructuredData } from './structuredData.js';
10
13
  import { aggregateSiteScore, scorePageSeo } from './scoring.js';
11
- import { detectContentClusters } from '../graph/cluster.js';
14
+ import { ClusteringService } from './clustering.js';
15
+ import { DuplicateService } from './duplicate.js';
16
+ import { Soft404Service } from './soft404.js';
12
17
  import { getDb } from '../db/index.js';
13
18
  import { SiteRepository } from '../db/repositories/SiteRepository.js';
14
19
  import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
15
20
  import { PageRepository } from '../db/repositories/PageRepository.js';
21
+ import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
16
22
  import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
23
+ import { DEFAULTS } from '../constants.js';
24
+ import { PageRankService } from '../graph/pagerank.js';
25
+ import { HITSService } from '../graph/hits.js';
26
+ import { HeadingHealthService } from './heading.js';
27
+ import { annotateOrphans } from './orphan.js';
28
+ import { HealthService } from '../scoring/health.js';
17
29
  /**
18
30
  * Analyzes a site for SEO, content, and accessibility.
19
31
  * Supports live crawling or loading from a database snapshot.
20
- * Note: File-based data loading is not supported.
21
- *
22
- * @param url The root URL to analyze
23
- * @param options Analysis options
24
- * @param context Engine context for event emission
25
32
  */
26
33
  export async function analyzeSite(url, options, context) {
27
- const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
28
- if (!normalizedRoot) {
34
+ // 1. Parse siteOrigin (e.g. https://example.com) and targetPath (e.g. /stats) from the URL.
35
+ // We resolve the *origin* — not the full page URL — so rootOrigin is always just the
36
+ // scheme+host and normalizedPath is always the pathname.
37
+ let parsedUrl = null;
38
+ try {
39
+ parsedUrl = new URL(url);
40
+ }
41
+ catch { /* bare domain fallback below */ }
42
+ const inputFullUrl = parsedUrl ? parsedUrl.toString() : (url.startsWith('http') ? url : `https://${url}`);
43
+ const inputOrigin = parsedUrl ? `${parsedUrl.protocol}//${parsedUrl.host}` : url;
44
+ let rootOrigin = inputOrigin;
45
+ if (options.live !== false) {
46
+ const resolver = new UrlResolver();
47
+ const fetcher = new Fetcher({ rate: options.rate, proxyUrl: options.proxyUrl, userAgent: options.userAgent });
48
+ try {
49
+ const resolved = await resolver.resolve(inputOrigin, fetcher);
50
+ rootOrigin = resolved.url;
51
+ }
52
+ catch {
53
+ // Fallback to basic normalization if resolution fails
54
+ }
55
+ }
56
+ // Normalize origin and target URL independently.
57
+ const normalizedOrigin = normalizeUrl(rootOrigin, '', { stripQuery: false });
58
+ if (!normalizedOrigin) {
29
59
  throw new Error('Invalid URL for analysis');
30
60
  }
61
+ const normalizedTargetAbs = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false }) || inputFullUrl;
62
+ const normalizedPath = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false, toPath: true })
63
+ || UrlUtil.toPath(normalizedTargetAbs, rootOrigin);
64
+ const start = Date.now();
31
65
  let crawlData;
32
66
  let robots = null;
33
- // Always try to fetch robots.txt for the analysis session
34
- // to ensure we have the latest rules for visibility reporting.
35
- try {
36
- const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
37
- const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
38
- const status = robotsRes.status;
39
- if (typeof status === 'number' && status >= 200 && status < 300) {
40
- const robotsParserModule = await import('robots-parser');
41
- const robotsParser = robotsParserModule.default || robotsParserModule;
42
- robots = robotsParser(robotsUrl, robotsRes.body);
67
+ // 1. Robots fetch (live-mode only to keep snapshot analysis deterministic and fast)
68
+ if (options.live) {
69
+ try {
70
+ const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
71
+ const { Fetcher } = await import('../crawler/fetcher.js');
72
+ const fetcher = new Fetcher({
73
+ rate: DEFAULTS.RATE_LIMIT,
74
+ proxyUrl: options.proxyUrl,
75
+ userAgent: options.userAgent
76
+ });
77
+ const robotsRes = await fetcher.fetch(robotsUrl, { maxBytes: 500000 });
78
+ if (typeof robotsRes.status === 'number' && robotsRes.status >= 200 && robotsRes.status < 300) {
79
+ const robotsParserModule = await import('robots-parser');
80
+ const robotsParser = robotsParserModule.default || robotsParserModule;
81
+ robots = robotsParser(robotsUrl, robotsRes.body);
82
+ if (context)
83
+ context.emit({ type: 'info', message: `[analyze] Robots fetch took ${Date.now() - start}ms` });
84
+ }
85
+ }
86
+ catch {
87
+ // Fallback
43
88
  }
44
89
  }
45
- catch {
46
- // Silence robots fetch errors, fallback to existing or none
47
- }
90
+ // Data Acquisition
48
91
  if (options.live) {
49
- crawlData = await runLiveCrawl(normalizedRoot, options, context);
92
+ const crawlStart = Date.now();
93
+ crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
94
+ if (context)
95
+ context.emit({ type: 'info', message: `[analyze] runLiveCrawl took ${Date.now() - crawlStart}ms` });
50
96
  }
51
97
  else {
52
98
  try {
53
- crawlData = await loadCrawlData(normalizedRoot);
54
- // Convert generator to array so it can be reused multiple times
99
+ const loadStart = Date.now();
100
+ crawlData = await loadCrawlData(normalizedOrigin, options.snapshotId);
101
+ if (context)
102
+ context.emit({ type: 'debug', message: `[analyze] loadCrawlData took ${Date.now() - loadStart}ms` });
55
103
  const allPages = Array.from(crawlData.pages);
56
104
  crawlData.pages = allPages;
57
- // Check if the requested URL actually exists in this snapshot
58
- const exists = allPages.some(p => p.url === normalizedRoot);
105
+ const exists = allPages.some(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
59
106
  if (!exists) {
60
- options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
61
- if (context) {
62
- context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
63
- }
64
- crawlData = await runLiveCrawl(normalizedRoot, options, context);
107
+ if (context)
108
+ context.emit({ type: 'info', message: `URL ${normalizedTargetAbs} not found. Fetching live...` });
109
+ crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
65
110
  }
66
111
  }
67
- catch (error) {
68
- const isNotFound = error.code === 'ENOENT' ||
69
- error.message.includes('Crawl data not found') ||
70
- error.message.includes('No completed snapshot found') ||
71
- error.message.includes('not found in database');
72
- if (isNotFound) {
73
- options.live = true; // Force live mode
74
- if (context) {
75
- context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
76
- }
77
- crawlData = await runLiveCrawl(normalizedRoot, options, context);
78
- }
79
- else {
80
- throw error;
81
- }
112
+ catch (_error) {
113
+ if (context)
114
+ context.emit({ type: 'info', message: 'No local crawl data found. Switching to live...' });
115
+ crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
82
116
  }
83
117
  }
84
118
  const snapshotId = crawlData.snapshotId;
85
119
  const crawledAt = crawlData.crawledAt;
86
- // Run clustering if requested or as default
87
- detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
88
- const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
120
+ const pagesStart = Date.now();
121
+ const pages = analyzePages(normalizedTargetAbs, rootOrigin, crawlData.pages, robots, options);
122
+ if (context)
123
+ context.emit({ type: 'debug', message: `[analyze] analyzePages took ${Date.now() - pagesStart}ms` });
124
+ // Sync basic page analysis results back to graph nodes for persistence
125
+ for (const pageAnalysis of pages) {
126
+ const node = crawlData.graph.nodes.get(pageAnalysis.url);
127
+ if (node) {
128
+ node.soft404Score = pageAnalysis.soft404?.score;
129
+ node.wordCount = pageAnalysis.content.wordCount;
130
+ node.externalLinkRatio = pageAnalysis.links.externalRatio;
131
+ node.thinContentScore = pageAnalysis.thinScore;
132
+ node.title = pageAnalysis.title.value || undefined;
133
+ }
134
+ }
89
135
  const activeModules = {
90
136
  seo: !!options.seo,
91
137
  content: !!options.content,
92
138
  accessibility: !!options.accessibility
93
139
  };
94
140
  const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
95
- const filteredPages = hasFilters
96
- ? pages.map((page) => filterPageModules(page, activeModules))
97
- : pages;
98
- // Filter to only the requested URL
99
- const targetPage = filteredPages.find(p => p.url === normalizedRoot);
141
+ const filteredPages = hasFilters ? pages.map((page) => filterPageModules(page, activeModules)) : pages;
142
+ const targetPage = filteredPages.find(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
100
143
  let resultPages;
101
144
  if (options.allPages) {
102
145
  resultPages = filteredPages;
@@ -104,215 +147,247 @@ export async function analyzeSite(url, options, context) {
104
147
  else {
105
148
  resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
106
149
  }
150
+ let clusters = [];
151
+ let duplicates = [];
152
+ let prResults = new Map();
153
+ let hitsResults = new Map();
154
+ let headingPayloads = {};
155
+ if (options.clustering) {
156
+ const clustering = new ClusteringService();
157
+ clusters = clustering.detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
158
+ }
159
+ if (options.allPages) {
160
+ const duplication = new DuplicateService();
161
+ duplicates = duplication.detectDuplicates(crawlData.graph, { collapse: false });
162
+ }
163
+ if (options.computePagerank) {
164
+ const prService = new PageRankService();
165
+ prResults = prService.evaluate(crawlData.graph);
166
+ }
167
+ if (options.computeHits) {
168
+ const hitsService = new HITSService();
169
+ hitsResults = hitsService.evaluate(crawlData.graph);
170
+ }
171
+ if (options.heading) {
172
+ const headingService = new HeadingHealthService();
173
+ const { payloadsByUrl } = headingService.evaluateNodes(crawlData.graph.getNodes());
174
+ headingPayloads = payloadsByUrl;
175
+ }
176
+ if (options.orphans) {
177
+ const edges = crawlData.graph.getEdges();
178
+ annotateOrphans(crawlData.graph.getNodes(), edges, {
179
+ enabled: true,
180
+ severityEnabled: !!options.orphanSeverity,
181
+ includeSoftOrphans: !!options.includeSoftOrphans,
182
+ minInbound: options.minInbound || 2,
183
+ rootUrl: normalizedOrigin
184
+ });
185
+ }
186
+ // Run HealthService when --health is enabled
187
+ let healthBreakdown;
188
+ if (options.health) {
189
+ const healthService = new HealthService();
190
+ const issues = healthService.collectCrawlIssues(crawlData.graph, crawlData.metrics, rootOrigin);
191
+ healthBreakdown = healthService.calculateHealthScore(crawlData.graph.nodes.size, issues);
192
+ }
193
+ // Update nodes in graph with results
194
+ for (const node of crawlData.graph.getNodes()) {
195
+ const pr = prResults.get(node.url);
196
+ if (pr)
197
+ node.pagerankScore = pr.score;
198
+ const hits = hitsResults.get(node.url);
199
+ if (hits) {
200
+ node.hubScore = hits.hub_score;
201
+ node.authScore = hits.authority_score;
202
+ node.linkRole = hits.link_role;
203
+ }
204
+ const heading = headingPayloads[node.url];
205
+ if (heading) {
206
+ node.headingScore = heading.score;
207
+ node.headingData = JSON.stringify(heading);
208
+ }
209
+ }
210
+ // Synchronize graph-level final scores back to PageAnalysis models
211
+ for (const page of pages) {
212
+ const node = crawlData.graph.nodes.get(page.url);
213
+ if (node) {
214
+ if (node.headingScore !== undefined)
215
+ page.headingScore = node.headingScore;
216
+ page.seoScore = scorePageSeo(page);
217
+ }
218
+ }
107
219
  const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
108
220
  const thinPages = pages.filter((page) => page.thinScore >= 70).length;
109
221
  const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
110
- return {
222
+ if (context)
223
+ context.emit({ type: 'debug', message: `[analyze] Total analysis completed in ${Date.now() - start}ms` });
224
+ // Persist to Database
225
+ const db = getDb();
226
+ const metricsRepo = new MetricsRepository(db);
227
+ const pageRepo = new PageRepository(db);
228
+ // Efficiently map URLs to IDs for this snapshot
229
+ const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
230
+ const urlToIdMap = new Map(pagesIdentity.map(p => [p.normalized_url, p.id]));
231
+ const metricsToSave = crawlData.graph.getNodes().map(node => {
232
+ const pageId = urlToIdMap.get(node.url);
233
+ if (!pageId)
234
+ return null;
235
+ return {
236
+ snapshot_id: snapshotId,
237
+ page_id: pageId,
238
+ crawl_status: node.crawlStatus || null,
239
+ word_count: node.wordCount || null,
240
+ thin_content_score: node.thinContentScore || null,
241
+ external_link_ratio: node.externalLinkRatio || null,
242
+ pagerank_score: node.pagerankScore || null,
243
+ hub_score: node.hubScore || null,
244
+ auth_score: node.authScore || null,
245
+ link_role: node.linkRole || null,
246
+ duplicate_cluster_id: node.duplicateClusterId || null,
247
+ duplicate_type: node.duplicateType || null,
248
+ cluster_id: node.clusterId || null,
249
+ soft404_score: node.soft404Score || null,
250
+ heading_score: node.headingScore || null,
251
+ orphan_score: node.orphanScore || null,
252
+ orphan_type: node.orphanType || null,
253
+ impact_level: node.impactLevel || null,
254
+ heading_data: node.headingData || null,
255
+ is_cluster_primary: node.isClusterPrimary ? 1 : 0
256
+ };
257
+ }).filter(m => m !== null);
258
+ // Persist health score to snapshot if computed
259
+ if (healthBreakdown && snapshotId) {
260
+ const db2 = getDb();
261
+ const snapshotRepo = new SnapshotRepository(db2);
262
+ snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
263
+ health_score: healthBreakdown.score
264
+ });
265
+ }
266
+ metricsRepo.insertMany(metricsToSave);
267
+ const result = {
111
268
  site_summary: {
112
269
  pages_analyzed: resultPages.length,
113
270
  avg_seo_score: siteScores.seoHealthScore,
114
271
  thin_pages: thinPages,
115
272
  duplicate_titles: duplicateTitles,
116
- site_score: siteScores.overallScore
273
+ site_score: siteScores.overallScore,
274
+ site_score_breakdown: siteScores.breakdown
117
275
  },
118
276
  site_scores: siteScores,
119
277
  pages: resultPages,
120
278
  active_modules: activeModules,
121
- clusters: crawlData.graph.contentClusters,
122
279
  snapshotId,
123
- crawledAt
280
+ crawledAt,
281
+ clusters,
282
+ duplicates
124
283
  };
284
+ return result;
125
285
  }
126
- export function renderAnalysisHtml(result) {
127
- if (result.pages.length === 1) {
128
- return renderSinglePageHtml(result.pages[0]);
129
- }
130
- const rows = result.pages
131
- .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
132
- .join('');
133
- return ANALYSIS_LIST_TEMPLATE
134
- .replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
135
- .replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
136
- .replace('{{ROWS}}', rows);
137
- }
138
- function renderSinglePageHtml(page) {
139
- const structuredDataStatus = page.structuredData.present
140
- ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
141
- : 'Not detected';
142
- const structuredDataTypesRow = page.structuredData.present ? `
143
- <tr>
144
- <th>Types Found</th>
145
- <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
146
- </tr>
147
- ` : '';
148
- return ANALYSIS_PAGE_TEMPLATE
149
- .replaceAll('{{URL}}', escapeHtml(page.url))
150
- .replace('{{SEO_SCORE}}', page.seoScore.toString())
151
- .replace('{{THIN_SCORE}}', page.thinScore.toString())
152
- .replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
153
- .replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
154
- .replace('{{TITLE_LENGTH}}', page.title.length.toString())
155
- .replaceAll('{{TITLE_STATUS}}', page.title.status)
156
- .replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
157
- .replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
158
- .replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
159
- .replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
160
- .replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
161
- .replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
162
- .replaceAll('{{H1_STATUS}}', page.h1.status)
163
- .replace('{{H1_COUNT}}', page.h1.count.toString())
164
- .replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
165
- .replace('{{WORD_COUNT}}', page.content.wordCount.toString())
166
- .replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
167
- .replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
168
- .replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
169
- .replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
170
- .replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
171
- .replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
172
- .replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
173
- .replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
174
- .replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
175
- }
176
- export function renderAnalysisMarkdown(result) {
177
- const summary = [
178
- '# Crawlith SEO Analysis Report',
179
- '',
180
- '## 📊 Summary',
181
- `- Pages Analyzed: ${result.site_summary.pages_analyzed}`,
182
- `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`,
183
- `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`,
184
- `- Thin Pages Found: ${result.site_summary.thin_pages}`,
185
- `- Duplicate Titles: ${result.site_summary.duplicate_titles}`,
186
- '',
187
- '## 📄 Page Details',
188
- '',
189
- '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
190
- '| :--- | :--- | :--- | :--- | :--- |',
191
- ];
192
- result.pages.forEach((page) => {
193
- summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
194
- });
195
- return summary.join('\n');
196
- }
197
- export function renderAnalysisCsv(result) {
198
- const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
199
- const rows = result.pages.map((p) => {
200
- const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
201
- return [
202
- p.url,
203
- p.seoScore,
204
- p.thinScore,
205
- statusStr,
206
- `"${(p.title.value || '').replace(/"/g, '""')}"`,
207
- p.title.length,
208
- `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`,
209
- p.metaDescription.length,
210
- p.content.wordCount,
211
- p.links.internalLinks,
212
- p.links.externalLinks
213
- ].join(',');
214
- });
215
- return [headers.join(','), ...rows].join('\n');
216
- }
217
- function escapeHtml(value) {
218
- return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
219
- }
220
- export function analyzePages(rootUrl, pages, robots) {
286
+ export function analyzePages(targetUrl, rootOrigin, pages, robots, options = {}) {
221
287
  const titleCounts = new Map();
222
288
  const metaCounts = new Map();
223
289
  const sentenceCountFrequency = new Map();
224
290
  const results = [];
291
+ const targetPath = UrlUtil.toPath(targetUrl, rootOrigin);
292
+ const targetAbs = UrlUtil.toAbsolute(targetUrl, rootOrigin);
225
293
  for (const page of pages) {
294
+ const pagePath = UrlUtil.toPath(page.url, rootOrigin);
295
+ const pageAbs = UrlUtil.toAbsolute(page.url, rootOrigin);
296
+ const isTarget = page.url === targetUrl || pagePath === targetPath || pageAbs === targetAbs;
297
+ // In single-page mode, if it's not the target, we skip it entirely for speed.
298
+ if (!options.allPages && !isTarget)
299
+ continue;
226
300
  const html = page.html || '';
227
- // 0. Update crawl status based on current robots rules
301
+ const $ = load(html || '<html></html>');
302
+ // Reconstruct absolute URL from stored path for robots & link resolution
303
+ const pageAbsUrl = UrlUtil.toAbsolute(page.url, rootOrigin);
228
304
  let crawlStatus = page.crawlStatus;
229
305
  if (robots) {
230
- const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
231
- (!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
232
- if (isBlocked) {
306
+ const isBlocked = !robots.isAllowed(pageAbsUrl, 'crawlith') ||
307
+ (!pageAbsUrl.endsWith('/') && !robots.isAllowed(pageAbsUrl + '/', 'crawlith'));
308
+ if (isBlocked)
233
309
  crawlStatus = 'blocked_by_robots';
234
- }
235
310
  }
236
- // 1. Analyze Individual Components
237
- const title = analyzeTitle(html);
238
- const metaDescription = analyzeMetaDescription(html);
239
- const h1 = analyzeH1(html, title.value);
240
- const content = analyzeContent(html);
241
- const images = analyzeImageAlts(html);
242
- const links = analyzeLinks(html, page.url, rootUrl);
243
- const structuredData = analyzeStructuredData(html);
244
- // 2. Accumulate Frequencies for Duplicates
311
+ // Shared DOM Analysis
312
+ const title = analyzeTitle($);
313
+ const metaDescription = analyzeMetaDescription($);
314
+ const h1 = analyzeH1($, title.value);
315
+ const content = analyzeContent($);
316
+ const images = analyzeImageAlts($);
317
+ const links = analyzeLinks($, pageAbsUrl, rootOrigin);
318
+ const structuredData = analyzeStructuredData($);
245
319
  if (title.value) {
246
- const key = (title.value || '').trim().toLowerCase();
320
+ const key = title.value.trim().toLowerCase();
247
321
  titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
248
322
  }
249
323
  if (metaDescription.value) {
250
- const key = (metaDescription.value || '').trim().toLowerCase();
324
+ const key = metaDescription.value.trim().toLowerCase();
251
325
  metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
252
326
  }
253
327
  sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
254
- // 3. Store Preliminary Result
255
- results.push({
328
+ const soft404Service = new Soft404Service();
329
+ const soft404 = soft404Service.analyze(html, links.externalLinks + links.internalLinks);
330
+ const isCanonicalConflict = !!(page.canonical && page.canonical !== page.url && page.canonical !== pageAbsUrl &&
331
+ page.canonical.replace(/\/$/, '') !== pageAbsUrl.replace(/\/$/, ''));
332
+ const resultPage = {
256
333
  url: page.url,
257
334
  status: page.status || 0,
258
335
  title,
259
336
  metaDescription,
260
337
  h1,
261
338
  content,
262
- thinScore: 0, // Calculated in pass 2
339
+ thinScore: 0,
263
340
  images,
264
341
  links,
265
342
  structuredData,
266
- seoScore: 0, // Calculated in pass 2
343
+ seoScore: 0,
267
344
  meta: {
268
345
  canonical: page.canonical,
269
346
  noindex: page.noindex,
270
347
  nofollow: page.nofollow,
271
- crawlStatus
272
- }
273
- });
348
+ crawlStatus,
349
+ canonicalConflict: isCanonicalConflict
350
+ },
351
+ soft404
352
+ };
353
+ Object.defineProperty(resultPage, 'html', { value: html, enumerable: false });
354
+ results.push(resultPage);
274
355
  }
275
- // 4. Finalize Statuses and Scores (Pass 2)
276
356
  for (const analysis of results) {
277
- // Check Title Duplicates
278
357
  if (analysis.title.value) {
279
- const key = (analysis.title.value || '').trim().toLowerCase();
280
- if ((titleCounts.get(key) || 0) > 1) {
358
+ const key = analysis.title.value.trim().toLowerCase();
359
+ if ((titleCounts.get(key) || 0) > 1)
281
360
  analysis.title.status = 'duplicate';
282
- }
283
361
  }
284
- // Check Meta Duplicates
285
362
  if (analysis.metaDescription.value) {
286
- const key = (analysis.metaDescription.value || '').trim().toLowerCase();
287
- if ((metaCounts.get(key) || 0) > 1) {
363
+ const key = analysis.metaDescription.value.trim().toLowerCase();
364
+ if ((metaCounts.get(key) || 0) > 1)
288
365
  analysis.metaDescription.status = 'duplicate';
289
- }
290
366
  }
291
- // Check Content Duplication
292
367
  const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
293
368
  analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
294
- // Calculate Final SEO Score
295
369
  analysis.seoScore = scorePageSeo(analysis);
296
370
  }
297
371
  return results;
298
372
  }
299
373
  function filterPageModules(page, modules) {
300
- const keepSeo = modules.seo;
301
- const keepContent = modules.content;
302
- const keepAccessibility = modules.accessibility;
303
- return {
374
+ const filtered = {
304
375
  ...page,
305
- title: keepSeo ? page.title : { value: null, length: 0, status: 'missing' },
306
- metaDescription: keepSeo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
307
- h1: (keepSeo || keepContent) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false },
308
- links: keepSeo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
309
- structuredData: keepSeo ? page.structuredData : { present: false, valid: false, types: [] },
310
- content: keepContent ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
311
- thinScore: keepContent ? page.thinScore : 0,
312
- images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
376
+ title: modules.seo ? page.title : { value: null, length: 0, status: 'missing' },
377
+ metaDescription: modules.seo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
378
+ h1: (modules.seo || modules.content) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false, value: null },
379
+ links: modules.seo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
380
+ structuredData: modules.seo ? page.structuredData : { present: false, valid: false, types: [] },
381
+ content: modules.content ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
382
+ thinScore: modules.content ? page.thinScore : 0,
383
+ images: modules.accessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
313
384
  };
385
+ if (page.html) {
386
+ Object.defineProperty(filtered, 'html', { value: page.html, enumerable: false });
387
+ }
388
+ return filtered;
314
389
  }
315
- async function loadCrawlData(rootUrl) {
390
+ async function loadCrawlData(rootUrl, snapshotId) {
316
391
  const db = getDb();
317
392
  const siteRepo = new SiteRepository(db);
318
393
  const snapshotRepo = new SnapshotRepository(db);
@@ -320,22 +395,26 @@ async function loadCrawlData(rootUrl) {
320
395
  const urlObj = new URL(rootUrl);
321
396
  const domain = urlObj.hostname.replace('www.', '');
322
397
  const site = siteRepo.firstOrCreateSite(domain);
323
- let snapshot;
324
- const page = pageRepo.getPage(site.id, rootUrl);
325
- if (page && page.last_seen_snapshot_id) {
326
- snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
327
- }
328
- if (!snapshot) {
329
- snapshot = snapshotRepo.getLatestSnapshot(site.id);
398
+ let snapshot = null;
399
+ if (snapshotId) {
400
+ snapshot = snapshotRepo.getSnapshot(snapshotId);
330
401
  }
331
402
  if (!snapshot) {
332
- throw new Error(`No crawl data found for ${rootUrl} in database.`);
403
+ for (const candidate of UrlUtil.toLookupCandidates(rootUrl, `${urlObj.protocol}//${urlObj.host}`)) {
404
+ const page = pageRepo.getPage(site.id, candidate);
405
+ if (page?.last_seen_snapshot_id) {
406
+ snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
407
+ break;
408
+ }
409
+ }
333
410
  }
411
+ if (!snapshot)
412
+ snapshot = snapshotRepo.getLatestSnapshot(site.id);
413
+ if (!snapshot)
414
+ throw new Error(`No crawl data found for ${rootUrl}`);
334
415
  const graph = loadGraphFromSnapshot(snapshot.id);
335
416
  const metrics = calculateMetrics(graph, 5);
336
- // Use iterator to save memory
337
417
  const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
338
- // We need to map the DB pages to CrawlPage format lazily
339
418
  const pagesGenerator = function* () {
340
419
  for (const p of dbPagesIterator) {
341
420
  yield {
@@ -352,29 +431,54 @@ async function loadCrawlData(rootUrl) {
352
431
  };
353
432
  return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
354
433
  }
355
- async function runLiveCrawl(url, options, context) {
434
+ async function runLiveCrawl(url, origin, options, context, robots) {
356
435
  const snapshotId = await crawl(url, {
357
- limit: 1, // Always limit to 1 for single page live analysis
436
+ limit: 1,
358
437
  depth: 0,
359
438
  rate: options.rate,
360
439
  proxyUrl: options.proxyUrl,
361
440
  userAgent: options.userAgent,
362
441
  maxRedirects: options.maxRedirects,
363
442
  debug: options.debug,
364
- snapshotType: 'partial'
443
+ snapshotRunType: 'single',
444
+ robots,
445
+ sitemap: options.sitemap,
446
+ plugins: options.plugins
365
447
  }, context);
366
448
  const graph = loadGraphFromSnapshot(snapshotId);
367
449
  const pages = graph.getNodes().map((node) => ({
368
450
  url: node.url,
369
451
  status: node.status,
370
- html: node.html || '', // Include HTML
452
+ html: node.html || '',
371
453
  depth: node.depth,
372
454
  crawlStatus: node.crawlStatus
373
455
  }));
374
- return {
375
- pages,
376
- metrics: calculateMetrics(graph, 1),
377
- graph,
378
- snapshotId
379
- };
456
+ return { pages, metrics: calculateMetrics(graph, 1), graph, snapshotId };
457
+ }
458
+ export function escapeHtml(value) {
459
+ return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
460
+ }
461
+ export function renderAnalysisHtml(result) {
462
+ if (result.pages.length === 1)
463
+ return renderSinglePageHtml(result.pages[0]);
464
+ const rows = result.pages.map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`).join('');
465
+ return ANALYSIS_LIST_TEMPLATE.replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString()).replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString()).replace('{{ROWS}}', rows);
466
+ }
467
+ function renderSinglePageHtml(page) {
468
+ const structuredDataStatus = page.structuredData.present ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>') : 'Not detected';
469
+ const structuredDataTypesRow = page.structuredData.present ? `<tr><th>Types Found</th><td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td></tr>` : '';
470
+ return ANALYSIS_PAGE_TEMPLATE.replaceAll('{{URL}}', escapeHtml(page.url)).replace('{{SEO_SCORE}}', page.seoScore.toString()).replace('{{THIN_SCORE}}', page.thinScore.toString()).replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString()).replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)')).replace('{{TITLE_LENGTH}}', page.title.length.toString()).replaceAll('{{TITLE_STATUS}}', page.title.status).replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)')).replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString()).replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status).replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>').replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString()).replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString()).replaceAll('{{H1_STATUS}}', page.h1.status).replace('{{H1_COUNT}}', page.h1.count.toString()).replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '').replace('{{WORD_COUNT}}', page.content.wordCount.toString()).replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString()).replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2)).replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString()).replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString()).replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1)).replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString()).replace('{{MISSING_ALT}}', page.images.missingAlt.toString()).replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus).replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
471
+ }
472
+ export function renderAnalysisMarkdown(result) {
473
+ const summary = ['# Crawlith SEO Analysis Report', '', '## 📊 Summary', `- Pages Analyzed: ${result.site_summary.pages_analyzed}`, `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`, `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`, `- Thin Pages Found: ${result.site_summary.thin_pages}`, `- Duplicate Titles: ${result.site_summary.duplicate_titles}`, '', '## 📄 Page Details', '', '| URL | SEO Score | Thin Score | Title Status | Meta Status | Canonical |', '| :--- | :--- | :--- | :--- | :--- | :--- |'];
474
+ result.pages.forEach((page) => summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} | ${page.meta.canonical || '-'} |`));
475
+ return summary.join('\n');
476
+ }
477
+ export function renderAnalysisCsv(result) {
478
+ const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links', 'Canonical'];
479
+ const rows = result.pages.map((p) => {
480
+ const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
481
+ return [p.url, p.seoScore, p.thinScore, statusStr, `"${(p.title.value || '').replace(/"/g, '""')}"`, p.title.length, `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`, p.metaDescription.length, p.content.wordCount, p.links.internalLinks, p.links.externalLinks, p.meta.canonical || ''].join(',');
482
+ });
483
+ return [headers.join(','), ...rows].join('\n');
380
484
  }