@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,293 +1,342 @@
1
- import fs from 'node:fs/promises';
1
+ import { load } from 'cheerio';
2
2
  import { crawl } from '../crawler/crawl.js';
3
+ import { UrlResolver } from '../crawler/resolver.js';
4
+ import { Fetcher } from '../crawler/fetcher.js';
3
5
  import { loadGraphFromSnapshot } from '../db/graphLoader.js';
4
- import { normalizeUrl } from '../crawler/normalize.js';
6
+ import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
5
7
  import { calculateMetrics } from '../graph/metrics.js';
6
- import { Graph } from '../graph/graph.js';
7
8
  import { analyzeContent, calculateThinContentScore } from './content.js';
8
- import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses } from './seo.js';
9
+ import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
9
10
  import { analyzeImageAlts } from './images.js';
10
11
  import { analyzeLinks } from './links.js';
11
12
  import { analyzeStructuredData } from './structuredData.js';
12
13
  import { aggregateSiteScore, scorePageSeo } from './scoring.js';
13
- import { detectContentClusters } from '../graph/cluster.js';
14
+ import { ClusteringService } from './clustering.js';
15
+ import { DuplicateService } from './duplicate.js';
16
+ import { Soft404Service } from './soft404.js';
14
17
  import { getDb } from '../db/index.js';
15
18
  import { SiteRepository } from '../db/repositories/SiteRepository.js';
16
19
  import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
17
20
  import { PageRepository } from '../db/repositories/PageRepository.js';
18
- export async function analyzeSite(url, options) {
19
- const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
20
- if (!normalizedRoot) {
21
+ import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
22
+ import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
23
+ import { DEFAULTS } from '../constants.js';
24
+ import { PageRankService } from '../graph/pagerank.js';
25
+ import { HITSService } from '../graph/hits.js';
26
+ import { HeadingHealthService } from './heading.js';
27
+ import { annotateOrphans } from './orphan.js';
28
+ import { HealthService } from '../scoring/health.js';
29
+ /**
30
+ * Analyzes a site for SEO, content, and accessibility.
31
+ * Supports live crawling or loading from a database snapshot.
32
+ */
33
+ export async function analyzeSite(url, options, context) {
34
+ // 1. Parse siteOrigin (e.g. https://example.com) and targetPath (e.g. /stats) from the URL.
35
+ // We resolve the *origin* — not the full page URL — so rootOrigin is always just the
36
+ // scheme+host and normalizedPath is always the pathname.
37
+ let parsedUrl = null;
38
+ try {
39
+ parsedUrl = new URL(url);
40
+ }
41
+ catch { /* bare domain fallback below */ }
42
+ const inputFullUrl = parsedUrl ? parsedUrl.toString() : (url.startsWith('http') ? url : `https://${url}`);
43
+ const inputOrigin = parsedUrl ? `${parsedUrl.protocol}//${parsedUrl.host}` : url;
44
+ let rootOrigin = inputOrigin;
45
+ if (options.live !== false) {
46
+ const resolver = new UrlResolver();
47
+ const fetcher = new Fetcher({ rate: options.rate, proxyUrl: options.proxyUrl, userAgent: options.userAgent });
48
+ try {
49
+ const resolved = await resolver.resolve(inputOrigin, fetcher);
50
+ rootOrigin = resolved.url;
51
+ }
52
+ catch {
53
+ // Fallback to basic normalization if resolution fails
54
+ }
55
+ }
56
+ // Normalize origin and target URL independently.
57
+ const normalizedOrigin = normalizeUrl(rootOrigin, '', { stripQuery: false });
58
+ if (!normalizedOrigin) {
21
59
  throw new Error('Invalid URL for analysis');
22
60
  }
61
+ const normalizedTargetAbs = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false }) || inputFullUrl;
62
+ const normalizedPath = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false, toPath: true })
63
+ || UrlUtil.toPath(normalizedTargetAbs, rootOrigin);
64
+ const start = Date.now();
23
65
  let crawlData;
66
+ let robots = null;
67
+ // 1. Robots fetch (live-mode only to keep snapshot analysis deterministic and fast)
68
+ if (options.live) {
69
+ try {
70
+ const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
71
+ const { Fetcher } = await import('../crawler/fetcher.js');
72
+ const fetcher = new Fetcher({
73
+ rate: DEFAULTS.RATE_LIMIT,
74
+ proxyUrl: options.proxyUrl,
75
+ userAgent: options.userAgent
76
+ });
77
+ const robotsRes = await fetcher.fetch(robotsUrl, { maxBytes: 500000 });
78
+ if (typeof robotsRes.status === 'number' && robotsRes.status >= 200 && robotsRes.status < 300) {
79
+ const robotsParserModule = await import('robots-parser');
80
+ const robotsParser = robotsParserModule.default || robotsParserModule;
81
+ robots = robotsParser(robotsUrl, robotsRes.body);
82
+ if (context)
83
+ context.emit({ type: 'info', message: `[analyze] Robots fetch took ${Date.now() - start}ms` });
84
+ }
85
+ }
86
+ catch {
87
+ // Fallback
88
+ }
89
+ }
90
+ // Data Acquisition
24
91
  if (options.live) {
25
- crawlData = await runLiveCrawl(normalizedRoot, options);
92
+ const crawlStart = Date.now();
93
+ crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
94
+ if (context)
95
+ context.emit({ type: 'info', message: `[analyze] runLiveCrawl took ${Date.now() - crawlStart}ms` });
26
96
  }
27
97
  else {
28
98
  try {
29
- crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
30
- }
31
- catch (error) {
32
- const isNotFound = error.code === 'ENOENT' ||
33
- error.message.includes('Crawl data not found') ||
34
- error.message.includes('No completed snapshot found') ||
35
- error.message.includes('not found in database');
36
- if (isNotFound && !options.fromCrawl) {
37
- console.log('No local crawl data found. Switching to live analysis mode...');
38
- crawlData = await runLiveCrawl(normalizedRoot, options);
39
- }
40
- else {
41
- throw error;
99
+ const loadStart = Date.now();
100
+ crawlData = await loadCrawlData(normalizedOrigin, options.snapshotId);
101
+ if (context)
102
+ context.emit({ type: 'debug', message: `[analyze] loadCrawlData took ${Date.now() - loadStart}ms` });
103
+ const allPages = Array.from(crawlData.pages);
104
+ crawlData.pages = allPages;
105
+ const exists = allPages.some(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
106
+ if (!exists) {
107
+ if (context)
108
+ context.emit({ type: 'info', message: `URL ${normalizedTargetAbs} not found. Fetching live...` });
109
+ crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
42
110
  }
43
111
  }
112
+ catch (_error) {
113
+ if (context)
114
+ context.emit({ type: 'info', message: 'No local crawl data found. Switching to live...' });
115
+ crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
116
+ }
117
+ }
118
+ const snapshotId = crawlData.snapshotId;
119
+ const crawledAt = crawlData.crawledAt;
120
+ const pagesStart = Date.now();
121
+ const pages = analyzePages(normalizedTargetAbs, rootOrigin, crawlData.pages, robots, options);
122
+ if (context)
123
+ context.emit({ type: 'debug', message: `[analyze] analyzePages took ${Date.now() - pagesStart}ms` });
124
+ // Sync basic page analysis results back to graph nodes for persistence
125
+ for (const pageAnalysis of pages) {
126
+ const node = crawlData.graph.nodes.get(pageAnalysis.url);
127
+ if (node) {
128
+ node.soft404Score = pageAnalysis.soft404?.score;
129
+ node.wordCount = pageAnalysis.content.wordCount;
130
+ node.externalLinkRatio = pageAnalysis.links.externalRatio;
131
+ node.thinContentScore = pageAnalysis.thinScore;
132
+ node.title = pageAnalysis.title.value || undefined;
133
+ }
44
134
  }
45
- // Run clustering if requested or as default
46
- detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
47
- const pages = analyzePages(normalizedRoot, crawlData.pages);
48
135
  const activeModules = {
49
136
  seo: !!options.seo,
50
137
  content: !!options.content,
51
138
  accessibility: !!options.accessibility
52
139
  };
53
140
  const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
54
- const filteredPages = hasFilters
55
- ? pages.map((page) => filterPageModules(page, activeModules))
56
- : pages;
57
- // Filter to only the requested URL
58
- const targetPage = filteredPages.find(p => p.url === normalizedRoot);
59
- const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);
141
+ const filteredPages = hasFilters ? pages.map((page) => filterPageModules(page, activeModules)) : pages;
142
+ const targetPage = filteredPages.find(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
143
+ let resultPages;
144
+ if (options.allPages) {
145
+ resultPages = filteredPages;
146
+ }
147
+ else {
148
+ resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
149
+ }
150
+ let clusters = [];
151
+ let duplicates = [];
152
+ let prResults = new Map();
153
+ let hitsResults = new Map();
154
+ let headingPayloads = {};
155
+ if (options.clustering) {
156
+ const clustering = new ClusteringService();
157
+ clusters = clustering.detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
158
+ }
159
+ if (options.allPages) {
160
+ const duplication = new DuplicateService();
161
+ duplicates = duplication.detectDuplicates(crawlData.graph, { collapse: false });
162
+ }
163
+ if (options.computePagerank) {
164
+ const prService = new PageRankService();
165
+ prResults = prService.evaluate(crawlData.graph);
166
+ }
167
+ if (options.computeHits) {
168
+ const hitsService = new HITSService();
169
+ hitsResults = hitsService.evaluate(crawlData.graph);
170
+ }
171
+ if (options.heading) {
172
+ const headingService = new HeadingHealthService();
173
+ const { payloadsByUrl } = headingService.evaluateNodes(crawlData.graph.getNodes());
174
+ headingPayloads = payloadsByUrl;
175
+ }
176
+ if (options.orphans) {
177
+ const edges = crawlData.graph.getEdges();
178
+ annotateOrphans(crawlData.graph.getNodes(), edges, {
179
+ enabled: true,
180
+ severityEnabled: !!options.orphanSeverity,
181
+ includeSoftOrphans: !!options.includeSoftOrphans,
182
+ minInbound: options.minInbound || 2,
183
+ rootUrl: normalizedOrigin
184
+ });
185
+ }
186
+ // Run HealthService when --health is enabled
187
+ let healthBreakdown;
188
+ if (options.health) {
189
+ const healthService = new HealthService();
190
+ const issues = healthService.collectCrawlIssues(crawlData.graph, crawlData.metrics, rootOrigin);
191
+ healthBreakdown = healthService.calculateHealthScore(crawlData.graph.nodes.size, issues);
192
+ }
193
+ // Update nodes in graph with results
194
+ for (const node of crawlData.graph.getNodes()) {
195
+ const pr = prResults.get(node.url);
196
+ if (pr)
197
+ node.pagerankScore = pr.score;
198
+ const hits = hitsResults.get(node.url);
199
+ if (hits) {
200
+ node.hubScore = hits.hub_score;
201
+ node.authScore = hits.authority_score;
202
+ node.linkRole = hits.link_role;
203
+ }
204
+ const heading = headingPayloads[node.url];
205
+ if (heading) {
206
+ node.headingScore = heading.score;
207
+ node.headingData = JSON.stringify(heading);
208
+ }
209
+ }
210
+ // Synchronize graph-level final scores back to PageAnalysis models
211
+ for (const page of pages) {
212
+ const node = crawlData.graph.nodes.get(page.url);
213
+ if (node) {
214
+ if (node.headingScore !== undefined)
215
+ page.headingScore = node.headingScore;
216
+ page.seoScore = scorePageSeo(page);
217
+ }
218
+ }
60
219
  const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
61
220
  const thinPages = pages.filter((page) => page.thinScore >= 70).length;
62
- const siteScores = aggregateSiteScore(crawlData.metrics, pages);
63
- return {
221
+ const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
222
+ if (context)
223
+ context.emit({ type: 'debug', message: `[analyze] Total analysis completed in ${Date.now() - start}ms` });
224
+ // Persist to Database
225
+ const db = getDb();
226
+ const metricsRepo = new MetricsRepository(db);
227
+ const pageRepo = new PageRepository(db);
228
+ // Efficiently map URLs to IDs for this snapshot
229
+ const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
230
+ const urlToIdMap = new Map(pagesIdentity.map(p => [p.normalized_url, p.id]));
231
+ const metricsToSave = crawlData.graph.getNodes().map(node => {
232
+ const pageId = urlToIdMap.get(node.url);
233
+ if (!pageId)
234
+ return null;
235
+ return {
236
+ snapshot_id: snapshotId,
237
+ page_id: pageId,
238
+ crawl_status: node.crawlStatus || null,
239
+ word_count: node.wordCount || null,
240
+ thin_content_score: node.thinContentScore || null,
241
+ external_link_ratio: node.externalLinkRatio || null,
242
+ pagerank_score: node.pagerankScore || null,
243
+ hub_score: node.hubScore || null,
244
+ auth_score: node.authScore || null,
245
+ link_role: node.linkRole || null,
246
+ duplicate_cluster_id: node.duplicateClusterId || null,
247
+ duplicate_type: node.duplicateType || null,
248
+ cluster_id: node.clusterId || null,
249
+ soft404_score: node.soft404Score || null,
250
+ heading_score: node.headingScore || null,
251
+ orphan_score: node.orphanScore || null,
252
+ orphan_type: node.orphanType || null,
253
+ impact_level: node.impactLevel || null,
254
+ heading_data: node.headingData || null,
255
+ is_cluster_primary: node.isClusterPrimary ? 1 : 0
256
+ };
257
+ }).filter(m => m !== null);
258
+ // Persist health score to snapshot if computed
259
+ if (healthBreakdown && snapshotId) {
260
+ const db2 = getDb();
261
+ const snapshotRepo = new SnapshotRepository(db2);
262
+ snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
263
+ health_score: healthBreakdown.score
264
+ });
265
+ }
266
+ metricsRepo.insertMany(metricsToSave);
267
+ const result = {
64
268
  site_summary: {
65
- pages_analyzed: pages.length,
269
+ pages_analyzed: resultPages.length,
66
270
  avg_seo_score: siteScores.seoHealthScore,
67
271
  thin_pages: thinPages,
68
272
  duplicate_titles: duplicateTitles,
69
- site_score: siteScores.overallScore
273
+ site_score: siteScores.overallScore,
274
+ site_score_breakdown: siteScores.breakdown
70
275
  },
71
276
  site_scores: siteScores,
72
277
  pages: resultPages,
73
278
  active_modules: activeModules,
74
- clusters: crawlData.graph.contentClusters
279
+ snapshotId,
280
+ crawledAt,
281
+ clusters,
282
+ duplicates
75
283
  };
284
+ return result;
76
285
  }
77
- export function renderAnalysisHtml(result) {
78
- if (result.pages.length === 1) {
79
- return renderSinglePageHtml(result.pages[0]);
80
- }
81
- const rows = result.pages
82
- .map((page) => `< tr > <td>${escapeHtml(page.url)} </td><td>${page.seoScore}</td > <td>${page.thinScore} </td><td>${page.title.status}</td > <td>${page.metaDescription.status} </td></tr > `)
83
- .join('');
84
- return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
85
- }
86
- function renderSinglePageHtml(page) {
87
- return `<!DOCTYPE html>
88
- <html lang="en">
89
- <head>
90
- <meta charset="UTF-8">
91
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
92
- <title>Analysis for ${escapeHtml(page.url)}</title>
93
- <style>
94
- body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
95
- h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
96
- h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
97
- .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
98
- .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
99
- .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
100
- .status-ok { color: green; font-weight: bold; }
101
- .status-warning { color: orange; font-weight: bold; }
102
- .status-critical { color: red; font-weight: bold; }
103
- .status-missing { color: red; font-weight: bold; }
104
- .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
105
- .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
106
- .data-table th { width: 150px; color: #666; }
107
- code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
108
- </style>
109
- </head>
110
- <body>
111
- <h1>Page Analysis</h1>
112
- <p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
113
-
114
- <div class="score-card">
115
- <div class="score-box">
116
- <div class="score-val">${page.seoScore}</div>
117
- <div>SEO Score</div>
118
- </div>
119
- <div class="score-box">
120
- <div class="score-val">${page.thinScore}</div>
121
- <div>Thin Content Score</div>
122
- </div>
123
- <div class="score-box">
124
- <div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
125
- <div>HTTP Status</div>
126
- </div>
127
- </div>
128
-
129
- <h2>Meta Tags</h2>
130
- <table class="data-table">
131
- <tr>
132
- <th>Title</th>
133
- <td>
134
- <div>${escapeHtml(page.title.value || '(missing)')}</div>
135
- <small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
136
- </td>
137
- </tr>
138
- <tr>
139
- <th>Description</th>
140
- <td>
141
- <div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
142
- <small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
143
- </td>
144
- </tr>
145
- <tr>
146
- <th>Canonical</th>
147
- <td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
148
- </tr>
149
- <tr>
150
- <th>Robots</th>
151
- <td>
152
- Index: ${!page.meta.noindex},
153
- Follow: ${!page.meta.nofollow}
154
- </td>
155
- </tr>
156
- </table>
157
-
158
- <h2>Content & Heading</h2>
159
- <table class="data-table">
160
- <tr>
161
- <th>H1 Tag</th>
162
- <td>
163
- Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
164
- (${page.h1.count} detected)
165
- ${page.h1.matchesTitle ? ' | Matches Title' : ''}
166
- </td>
167
- </tr>
168
- <tr>
169
- <th>Word Count</th>
170
- <td>${page.content.wordCount} words</td>
171
- </tr>
172
- <tr>
173
- <th>Unique Sentences</th>
174
- <td>${page.content.uniqueSentenceCount}</td>
175
- </tr>
176
- <tr>
177
- <th>Text / HTML Ratio</th>
178
- <td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
179
- </tr>
180
- </table>
181
-
182
- <h2>Links & Images</h2>
183
- <table class="data-table">
184
- <tr>
185
- <th>Internal Links</th>
186
- <td>${page.links.internalLinks}</td>
187
- </tr>
188
- <tr>
189
- <th>External Links</th>
190
- <td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
191
- </tr>
192
- <tr>
193
- <th>Images</th>
194
- <td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
195
- </tr>
196
- </table>
197
-
198
- <h2>Structured Data</h2>
199
- <table class="data-table">
200
- <tr>
201
- <th>Status</th>
202
- <td>
203
- ${page.structuredData.present
204
- ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
205
- : 'Not detected'}
206
- </td>
207
- </tr>
208
- ${page.structuredData.present ? `
209
- <tr>
210
- <th>Types Found</th>
211
- <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
212
- </tr>
213
- ` : ''}
214
- </table>
215
- </body>
216
- </html>`;
217
- }
218
- export function renderAnalysisMarkdown(result) {
219
- const summary = [
220
- '# Crawlith SEO Analysis Report',
221
- '',
222
- '## 📊 Summary',
223
- `- Pages Analyzed: ${result.site_summary.pages_analyzed}`,
224
- `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`,
225
- `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`,
226
- `- Thin Pages Found: ${result.site_summary.thin_pages}`,
227
- `- Duplicate Titles: ${result.site_summary.duplicate_titles}`,
228
- '',
229
- '## 📄 Page Details',
230
- '',
231
- '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
232
- '| :--- | :--- | :--- | :--- | :--- |',
233
- ];
234
- result.pages.forEach((page) => {
235
- summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
236
- });
237
- return summary.join('\n');
238
- }
239
- export function renderAnalysisCsv(result) {
240
- const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
241
- const rows = result.pages.map((p) => {
242
- const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
243
- return [
244
- p.url,
245
- p.seoScore,
246
- p.thinScore,
247
- statusStr,
248
- `"${(p.title.value || '').replace(/"/g, '""')}"`,
249
- p.title.length,
250
- `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`,
251
- p.metaDescription.length,
252
- p.content.wordCount,
253
- p.links.internalLinks,
254
- p.links.externalLinks
255
- ].join(',');
256
- });
257
- return [headers.join(','), ...rows].join('\n');
258
- }
259
- function escapeHtml(value) {
260
- return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
261
- }
262
- function analyzePages(rootUrl, pages) {
263
- const titleCandidates = pages.map((page) => analyzeTitle(page.html || ''));
264
- const metaCandidates = pages.map((page) => analyzeMetaDescription(page.html || ''));
265
- const titles = applyDuplicateStatuses(titleCandidates);
266
- const metas = applyDuplicateStatuses(metaCandidates);
286
+ export function analyzePages(targetUrl, rootOrigin, pages, robots, options = {}) {
287
+ const titleCounts = new Map();
288
+ const metaCounts = new Map();
267
289
  const sentenceCountFrequency = new Map();
268
- const baseContent = pages.map((page) => analyzeContent(page.html || ''));
269
- for (const item of baseContent) {
270
- sentenceCountFrequency.set(item.uniqueSentenceCount, (sentenceCountFrequency.get(item.uniqueSentenceCount) || 0) + 1);
271
- }
272
- return pages.map((page, index) => {
290
+ const results = [];
291
+ const targetPath = UrlUtil.toPath(targetUrl, rootOrigin);
292
+ const targetAbs = UrlUtil.toAbsolute(targetUrl, rootOrigin);
293
+ for (const page of pages) {
294
+ const pagePath = UrlUtil.toPath(page.url, rootOrigin);
295
+ const pageAbs = UrlUtil.toAbsolute(page.url, rootOrigin);
296
+ const isTarget = page.url === targetUrl || pagePath === targetPath || pageAbs === targetAbs;
297
+ // In single-page mode, if it's not the target, we skip it entirely for speed.
298
+ if (!options.allPages && !isTarget)
299
+ continue;
273
300
  const html = page.html || '';
274
- const title = titles[index];
275
- const metaDescription = metas[index];
276
- const h1 = analyzeH1(html, title.value);
277
- const content = baseContent[index];
278
- const duplicationScore = (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
279
- const thinScore = calculateThinContentScore(content, duplicationScore);
280
- const images = analyzeImageAlts(html);
281
- const links = analyzeLinks(html, page.url, rootUrl);
282
- const structuredData = analyzeStructuredData(html);
283
- const analysis = {
301
+ const $ = load(html || '<html></html>');
302
+ // Reconstruct absolute URL from stored path for robots & link resolution
303
+ const pageAbsUrl = UrlUtil.toAbsolute(page.url, rootOrigin);
304
+ let crawlStatus = page.crawlStatus;
305
+ if (robots) {
306
+ const isBlocked = !robots.isAllowed(pageAbsUrl, 'crawlith') ||
307
+ (!pageAbsUrl.endsWith('/') && !robots.isAllowed(pageAbsUrl + '/', 'crawlith'));
308
+ if (isBlocked)
309
+ crawlStatus = 'blocked_by_robots';
310
+ }
311
+ // Shared DOM Analysis
312
+ const title = analyzeTitle($);
313
+ const metaDescription = analyzeMetaDescription($);
314
+ const h1 = analyzeH1($, title.value);
315
+ const content = analyzeContent($);
316
+ const images = analyzeImageAlts($);
317
+ const links = analyzeLinks($, pageAbsUrl, rootOrigin);
318
+ const structuredData = analyzeStructuredData($);
319
+ if (title.value) {
320
+ const key = title.value.trim().toLowerCase();
321
+ titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
322
+ }
323
+ if (metaDescription.value) {
324
+ const key = metaDescription.value.trim().toLowerCase();
325
+ metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
326
+ }
327
+ sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
328
+ const soft404Service = new Soft404Service();
329
+ const soft404 = soft404Service.analyze(html, links.externalLinks + links.internalLinks);
330
+ const isCanonicalConflict = !!(page.canonical && page.canonical !== page.url && page.canonical !== pageAbsUrl &&
331
+ page.canonical.replace(/\/$/, '') !== pageAbsUrl.replace(/\/$/, ''));
332
+ const resultPage = {
284
333
  url: page.url,
285
334
  status: page.status || 0,
286
335
  title,
287
336
  metaDescription,
288
337
  h1,
289
338
  content,
290
- thinScore,
339
+ thinScore: 0,
291
340
  images,
292
341
  links,
293
342
  structuredData,
@@ -295,45 +344,50 @@ function analyzePages(rootUrl, pages) {
295
344
  meta: {
296
345
  canonical: page.canonical,
297
346
  noindex: page.noindex,
298
- nofollow: page.nofollow
299
- }
347
+ nofollow: page.nofollow,
348
+ crawlStatus,
349
+ canonicalConflict: isCanonicalConflict
350
+ },
351
+ soft404
300
352
  };
353
+ Object.defineProperty(resultPage, 'html', { value: html, enumerable: false });
354
+ results.push(resultPage);
355
+ }
356
+ for (const analysis of results) {
357
+ if (analysis.title.value) {
358
+ const key = analysis.title.value.trim().toLowerCase();
359
+ if ((titleCounts.get(key) || 0) > 1)
360
+ analysis.title.status = 'duplicate';
361
+ }
362
+ if (analysis.metaDescription.value) {
363
+ const key = analysis.metaDescription.value.trim().toLowerCase();
364
+ if ((metaCounts.get(key) || 0) > 1)
365
+ analysis.metaDescription.status = 'duplicate';
366
+ }
367
+ const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
368
+ analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
301
369
  analysis.seoScore = scorePageSeo(analysis);
302
- return analysis;
303
- });
370
+ }
371
+ return results;
304
372
  }
305
373
  function filterPageModules(page, modules) {
306
- const keepSeo = modules.seo;
307
- const keepContent = modules.content;
308
- const keepAccessibility = modules.accessibility;
309
- return {
374
+ const filtered = {
310
375
  ...page,
311
- title: keepSeo ? page.title : { value: null, length: 0, status: 'missing' },
312
- metaDescription: keepSeo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
313
- h1: (keepSeo || keepContent) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false },
314
- links: keepSeo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
315
- structuredData: keepSeo ? page.structuredData : { present: false, valid: false, types: [] },
316
- content: keepContent ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
317
- thinScore: keepContent ? page.thinScore : 0,
318
- images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
376
+ title: modules.seo ? page.title : { value: null, length: 0, status: 'missing' },
377
+ metaDescription: modules.seo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
378
+ h1: (modules.seo || modules.content) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false, value: null },
379
+ links: modules.seo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
380
+ structuredData: modules.seo ? page.structuredData : { present: false, valid: false, types: [] },
381
+ content: modules.content ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
382
+ thinScore: modules.content ? page.thinScore : 0,
383
+ images: modules.accessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
319
384
  };
320
- }
321
- async function loadCrawlData(rootUrl, fromCrawl) {
322
- // If fromCrawl is provided, we could theoretically load JSON, but
323
- // we now default to DB fetching for all operations.
324
- if (fromCrawl) {
325
- try {
326
- const content = await fs.readFile(fromCrawl, 'utf-8');
327
- const raw = JSON.parse(content);
328
- const pages = parsePages(raw);
329
- const graph = graphFromPages(rootUrl, pages, raw);
330
- const metrics = calculateMetrics(graph, 5);
331
- return { pages, metrics, graph };
332
- }
333
- catch (_e) {
334
- // Fallback downwards if file doesn't exist
335
- }
385
+ if (page.html) {
386
+ Object.defineProperty(filtered, 'html', { value: page.html, enumerable: false });
336
387
  }
388
+ return filtered;
389
+ }
390
+ async function loadCrawlData(rootUrl, snapshotId) {
337
391
  const db = getDb();
338
392
  const siteRepo = new SiteRepository(db);
339
393
  const snapshotRepo = new SnapshotRepository(db);
@@ -341,77 +395,43 @@ async function loadCrawlData(rootUrl, fromCrawl) {
341
395
  const urlObj = new URL(rootUrl);
342
396
  const domain = urlObj.hostname.replace('www.', '');
343
397
  const site = siteRepo.firstOrCreateSite(domain);
344
- const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
398
+ let snapshot = null;
399
+ if (snapshotId) {
400
+ snapshot = snapshotRepo.getSnapshot(snapshotId);
401
+ }
345
402
  if (!snapshot) {
346
- throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
403
+ for (const candidate of UrlUtil.toLookupCandidates(rootUrl, `${urlObj.protocol}//${urlObj.host}`)) {
404
+ const page = pageRepo.getPage(site.id, candidate);
405
+ if (page?.last_seen_snapshot_id) {
406
+ snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
407
+ break;
408
+ }
409
+ }
347
410
  }
411
+ if (!snapshot)
412
+ snapshot = snapshotRepo.getLatestSnapshot(site.id);
413
+ if (!snapshot)
414
+ throw new Error(`No crawl data found for ${rootUrl}`);
348
415
  const graph = loadGraphFromSnapshot(snapshot.id);
349
416
  const metrics = calculateMetrics(graph, 5);
350
- // We also need the `pages` array for analysis.
351
- // It needs `html` which might not be fully available unless we look up from the DB or Graph.
352
- // Wait, the Graph stores Node which doesn't contain HTML since we removed it from memory?
353
- // Actually, `loadGraphFromSnapshot` does NOT load actual raw HTML from nodes to save memory.
354
- // We need HTML for `analyzeSite` module! So we must fetch it from `pageRepo`.
355
- const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
356
- const pages = dbPages.map((p) => ({
357
- url: p.normalized_url,
358
- status: p.http_status || 0,
359
- html: p.html || '',
360
- depth: p.depth || 0
361
- }));
362
- return { pages, metrics, graph };
363
- }
364
- function parsePages(raw) {
365
- if (Array.isArray(raw.pages)) {
366
- return raw.pages.map((page) => {
367
- const p = page;
368
- return {
369
- url: String(p.url || ''),
370
- status: Number(p.status || 0),
371
- html: typeof p.html === 'string' ? p.html : '',
372
- depth: Number(p.depth || 0)
373
- };
374
- }).filter((page) => Boolean(page.url));
375
- }
376
- if (Array.isArray(raw.nodes)) {
377
- return raw.nodes.map((node) => {
378
- const n = node;
379
- return {
380
- url: String(n.url || ''),
381
- status: Number(n.status || 0),
382
- html: typeof n.html === 'string' ? n.html : '',
383
- depth: Number(n.depth || 0)
417
+ const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
418
+ const pagesGenerator = function* () {
419
+ for (const p of dbPagesIterator) {
420
+ yield {
421
+ url: p.normalized_url,
422
+ status: p.http_status || 0,
423
+ html: p.html || '',
424
+ depth: p.depth || 0,
425
+ canonical: p.canonical_url || undefined,
426
+ noindex: !!p.noindex,
427
+ nofollow: !!p.nofollow,
428
+ crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
384
429
  };
385
- }).filter((page) => Boolean(page.url));
386
- }
387
- return [];
388
- }
389
- function graphFromPages(rootUrl, pages, raw) {
390
- const graph = new Graph();
391
- for (const page of pages) {
392
- graph.addNode(page.url, page.depth || 0, page.status || 0);
393
- }
394
- if (Array.isArray(raw.edges)) {
395
- for (const edge of raw.edges) {
396
- const e = edge;
397
- if (typeof e.source === 'string' && typeof e.target === 'string') {
398
- graph.addNode(e.source, 0, 0);
399
- graph.addNode(e.target, 0, 0);
400
- graph.addEdge(e.source, e.target);
401
- }
402
430
  }
403
- return graph;
404
- }
405
- for (const page of pages) {
406
- if (!page.html)
407
- continue;
408
- const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
409
- if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0)
410
- continue;
411
- }
412
- return graph;
431
+ };
432
+ return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
413
433
  }
414
- async function runLiveCrawl(url, options) {
434
+ async function runLiveCrawl(url, origin, options, context, robots) {
415
435
  const snapshotId = await crawl(url, {
416
436
  limit: 1,
417
437
  depth: 0,
@@ -419,18 +439,46 @@ async function runLiveCrawl(url, options) {
419
439
  proxyUrl: options.proxyUrl,
420
440
  userAgent: options.userAgent,
421
441
  maxRedirects: options.maxRedirects,
422
- debug: options.debug
423
- });
442
+ debug: options.debug,
443
+ snapshotRunType: 'single',
444
+ robots,
445
+ sitemap: options.sitemap,
446
+ plugins: options.plugins
447
+ }, context);
424
448
  const graph = loadGraphFromSnapshot(snapshotId);
425
449
  const pages = graph.getNodes().map((node) => ({
426
450
  url: node.url,
427
451
  status: node.status,
428
- html: node.html || '', // Include HTML
429
- depth: node.depth
452
+ html: node.html || '',
453
+ depth: node.depth,
454
+ crawlStatus: node.crawlStatus
430
455
  }));
431
- return {
432
- pages,
433
- metrics: calculateMetrics(graph, 1),
434
- graph
435
- };
456
+ return { pages, metrics: calculateMetrics(graph, 1), graph, snapshotId };
457
+ }
458
+ export function escapeHtml(value) {
459
+ return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
460
+ }
461
+ export function renderAnalysisHtml(result) {
462
+ if (result.pages.length === 1)
463
+ return renderSinglePageHtml(result.pages[0]);
464
+ const rows = result.pages.map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`).join('');
465
+ return ANALYSIS_LIST_TEMPLATE.replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString()).replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString()).replace('{{ROWS}}', rows);
466
+ }
467
+ function renderSinglePageHtml(page) {
468
+ const structuredDataStatus = page.structuredData.present ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>') : 'Not detected';
469
+ const structuredDataTypesRow = page.structuredData.present ? `<tr><th>Types Found</th><td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td></tr>` : '';
470
+ return ANALYSIS_PAGE_TEMPLATE.replaceAll('{{URL}}', escapeHtml(page.url)).replace('{{SEO_SCORE}}', page.seoScore.toString()).replace('{{THIN_SCORE}}', page.thinScore.toString()).replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString()).replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)')).replace('{{TITLE_LENGTH}}', page.title.length.toString()).replaceAll('{{TITLE_STATUS}}', page.title.status).replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)')).replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString()).replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status).replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>').replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString()).replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString()).replaceAll('{{H1_STATUS}}', page.h1.status).replace('{{H1_COUNT}}', page.h1.count.toString()).replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '').replace('{{WORD_COUNT}}', page.content.wordCount.toString()).replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString()).replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2)).replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString()).replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString()).replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1)).replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString()).replace('{{MISSING_ALT}}', page.images.missingAlt.toString()).replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus).replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
471
+ }
472
+ export function renderAnalysisMarkdown(result) {
473
+ const summary = ['# Crawlith SEO Analysis Report', '', '## 📊 Summary', `- Pages Analyzed: ${result.site_summary.pages_analyzed}`, `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`, `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`, `- Thin Pages Found: ${result.site_summary.thin_pages}`, `- Duplicate Titles: ${result.site_summary.duplicate_titles}`, '', '## 📄 Page Details', '', '| URL | SEO Score | Thin Score | Title Status | Meta Status | Canonical |', '| :--- | :--- | :--- | :--- | :--- | :--- |'];
474
+ result.pages.forEach((page) => summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} | ${page.meta.canonical || '-'} |`));
475
+ return summary.join('\n');
476
+ }
477
+ export function renderAnalysisCsv(result) {
478
+ const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links', 'Canonical'];
479
+ const rows = result.pages.map((p) => {
480
+ const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
481
+ return [p.url, p.seoScore, p.thinScore, statusStr, `"${(p.title.value || '').replace(/"/g, '""')}"`, p.title.length, `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`, p.metaDescription.length, p.content.wordCount, p.links.internalLinks, p.links.externalLinks, p.meta.canonical || ''].join(',');
482
+ });
483
+ return [headers.join(','), ...rows].join('\n');
436
484
  }