@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,505 +0,0 @@
1
- import { crawl } from '../crawler/crawl.js';
2
- import { loadGraphFromSnapshot } from '../db/graphLoader.js';
3
- import { normalizeUrl } from '../crawler/normalize.js';
4
- import { calculateMetrics, Metrics } from '../graph/metrics.js';
5
- import { Graph, ClusterInfo } from '../graph/graph.js';
6
- import { analyzeContent, calculateThinContentScore } from './content.js';
7
- import { analyzeH1, analyzeMetaDescription, analyzeTitle, H1Analysis, TextFieldAnalysis } from './seo.js';
8
- import { analyzeImageAlts, ImageAltAnalysis } from './images.js';
9
- import { analyzeLinks, LinkRatioAnalysis } from './links.js';
10
- import { analyzeStructuredData, StructuredDataResult } from './structuredData.js';
11
- import { aggregateSiteScore, scorePageSeo } from './scoring.js';
12
- import { detectContentClusters } from '../graph/cluster.js';
13
- import { getDb } from '../db/index.js';
14
- import { SiteRepository } from '../db/repositories/SiteRepository.js';
15
- import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
16
- import { PageRepository } from '../db/repositories/PageRepository.js';
17
- import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
18
- import { EngineContext } from '../events.js';
19
-
20
- export interface CrawlPage {
21
- url: string;
22
- status?: number;
23
- html?: string;
24
- depth?: number;
25
- canonical?: string;
26
- noindex?: boolean;
27
- nofollow?: boolean;
28
- crawlStatus?: string;
29
- }
30
-
31
- export interface AnalyzeOptions {
32
- live?: boolean;
33
- seo?: boolean;
34
- content?: boolean;
35
- accessibility?: boolean;
36
- rate?: number;
37
- proxyUrl?: string;
38
- userAgent?: string;
39
- maxRedirects?: number;
40
- debug?: boolean;
41
- clusterThreshold?: number;
42
- minClusterSize?: number;
43
- allPages?: boolean;
44
- }
45
-
46
- export interface PageAnalysis {
47
- url: string;
48
- status: number;
49
- title: TextFieldAnalysis;
50
- metaDescription: TextFieldAnalysis;
51
- h1: H1Analysis;
52
- content: ReturnType<typeof analyzeContent>;
53
- thinScore: number;
54
- images: ImageAltAnalysis;
55
- links: LinkRatioAnalysis;
56
- structuredData: StructuredDataResult;
57
- seoScore: number;
58
- meta: {
59
- canonical?: string;
60
- noindex?: boolean;
61
- nofollow?: boolean;
62
- crawlStatus?: string;
63
- }
64
- }
65
-
66
- export interface AnalysisResult {
67
- site_summary: {
68
- pages_analyzed: number;
69
- avg_seo_score: number;
70
- thin_pages: number;
71
- duplicate_titles: number;
72
- site_score: number;
73
- };
74
- site_scores: ReturnType<typeof aggregateSiteScore>;
75
- pages: PageAnalysis[];
76
- active_modules: {
77
- seo: boolean;
78
- content: boolean;
79
- accessibility: boolean;
80
- };
81
- clusters?: ClusterInfo[];
82
- snapshotId?: number;
83
- crawledAt?: string;
84
- }
85
-
86
- interface CrawlData {
87
- pages: Iterable<CrawlPage> | CrawlPage[];
88
- metrics: Metrics;
89
- graph: Graph;
90
- snapshotId: number;
91
- crawledAt?: string;
92
- }
93
-
94
- /**
95
- * Analyzes a site for SEO, content, and accessibility.
96
- * Supports live crawling or loading from a database snapshot.
97
- * Note: File-based data loading is not supported.
98
- *
99
- * @param url The root URL to analyze
100
- * @param options Analysis options
101
- * @param context Engine context for event emission
102
- */
103
- export async function analyzeSite(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<AnalysisResult> {
104
- const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
105
- if (!normalizedRoot) {
106
- throw new Error('Invalid URL for analysis');
107
- }
108
-
109
- let crawlData: CrawlData;
110
- let robots: any = null;
111
-
112
- // Always try to fetch robots.txt for the analysis session
113
- // to ensure we have the latest rules for visibility reporting.
114
- try {
115
- const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
116
- const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
117
- const status = robotsRes.status;
118
- if (typeof status === 'number' && status >= 200 && status < 300) {
119
- const robotsParserModule = await import('robots-parser');
120
- const robotsParser = (robotsParserModule as any).default || robotsParserModule;
121
- robots = (robotsParser as any)(robotsUrl, robotsRes.body);
122
- }
123
- } catch {
124
- // Silence robots fetch errors, fallback to existing or none
125
- }
126
- if (options.live) {
127
- crawlData = await runLiveCrawl(normalizedRoot, options, context);
128
- } else {
129
- try {
130
- crawlData = await loadCrawlData(normalizedRoot);
131
-
132
- // Convert generator to array so it can be reused multiple times
133
- const allPages = Array.from(crawlData.pages);
134
- crawlData.pages = allPages;
135
-
136
- // Check if the requested URL actually exists in this snapshot
137
- const exists = allPages.some(p => p.url === normalizedRoot);
138
- if (!exists) {
139
- options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
140
- if (context) {
141
- context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
142
- }
143
- crawlData = await runLiveCrawl(normalizedRoot, options, context);
144
- }
145
- } catch (error: any) {
146
- const isNotFound = error.code === 'ENOENT' ||
147
- error.message.includes('Crawl data not found') ||
148
- error.message.includes('No completed snapshot found') ||
149
- error.message.includes('not found in database');
150
- if (isNotFound) {
151
- options.live = true; // Force live mode
152
- if (context) {
153
- context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
154
- }
155
- crawlData = await runLiveCrawl(normalizedRoot, options, context);
156
- } else {
157
- throw error;
158
- }
159
- }
160
- }
161
-
162
- const snapshotId = crawlData.snapshotId;
163
- const crawledAt = crawlData.crawledAt;
164
-
165
- // Run clustering if requested or as default
166
- detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
167
-
168
- const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
169
-
170
- const activeModules = {
171
- seo: !!options.seo,
172
- content: !!options.content,
173
- accessibility: !!options.accessibility
174
- };
175
-
176
- const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
177
-
178
- const filteredPages = hasFilters
179
- ? pages.map((page) => filterPageModules(page, activeModules))
180
- : pages;
181
-
182
- // Filter to only the requested URL
183
- const targetPage = filteredPages.find(p => p.url === normalizedRoot);
184
- let resultPages: PageAnalysis[];
185
-
186
- if (options.allPages) {
187
- resultPages = filteredPages;
188
- } else {
189
- resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
190
- }
191
-
192
- const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
193
- const thinPages = pages.filter((page) => page.thinScore >= 70).length;
194
- const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
195
-
196
- return {
197
- site_summary: {
198
- pages_analyzed: resultPages.length,
199
- avg_seo_score: siteScores.seoHealthScore,
200
- thin_pages: thinPages,
201
- duplicate_titles: duplicateTitles,
202
- site_score: siteScores.overallScore
203
- },
204
- site_scores: siteScores,
205
- pages: resultPages,
206
- active_modules: activeModules,
207
- clusters: crawlData.graph.contentClusters,
208
- snapshotId,
209
- crawledAt
210
- };
211
- }
212
-
213
- export function renderAnalysisHtml(result: AnalysisResult): string {
214
- if (result.pages.length === 1) {
215
- return renderSinglePageHtml(result.pages[0]);
216
- }
217
- const rows = result.pages
218
- .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
219
- .join('');
220
-
221
- return ANALYSIS_LIST_TEMPLATE
222
- .replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
223
- .replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
224
- .replace('{{ROWS}}', rows);
225
- }
226
-
227
- function renderSinglePageHtml(page: PageAnalysis): string {
228
- const structuredDataStatus = page.structuredData.present
229
- ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
230
- : 'Not detected';
231
-
232
- const structuredDataTypesRow = page.structuredData.present ? `
233
- <tr>
234
- <th>Types Found</th>
235
- <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
236
- </tr>
237
- ` : '';
238
-
239
- return ANALYSIS_PAGE_TEMPLATE
240
- .replaceAll('{{URL}}', escapeHtml(page.url))
241
- .replace('{{SEO_SCORE}}', page.seoScore.toString())
242
- .replace('{{THIN_SCORE}}', page.thinScore.toString())
243
- .replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
244
- .replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
245
- .replace('{{TITLE_LENGTH}}', page.title.length.toString())
246
- .replaceAll('{{TITLE_STATUS}}', page.title.status)
247
- .replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
248
- .replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
249
- .replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
250
- .replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
251
- .replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
252
- .replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
253
- .replaceAll('{{H1_STATUS}}', page.h1.status)
254
- .replace('{{H1_COUNT}}', page.h1.count.toString())
255
- .replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
256
- .replace('{{WORD_COUNT}}', page.content.wordCount.toString())
257
- .replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
258
- .replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
259
- .replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
260
- .replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
261
- .replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
262
- .replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
263
- .replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
264
- .replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
265
- .replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
266
- }
267
-
268
- export function renderAnalysisMarkdown(result: AnalysisResult): string {
269
- const summary = [
270
- '# Crawlith SEO Analysis Report',
271
- '',
272
- '## 📊 Summary',
273
- `- Pages Analyzed: ${result.site_summary.pages_analyzed}`,
274
- `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`,
275
- `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`,
276
- `- Thin Pages Found: ${result.site_summary.thin_pages}`,
277
- `- Duplicate Titles: ${result.site_summary.duplicate_titles}`,
278
- '',
279
- '## 📄 Page Details',
280
- '',
281
- '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
282
- '| :--- | :--- | :--- | :--- | :--- |',
283
- ];
284
-
285
- result.pages.forEach((page) => {
286
- summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
287
- });
288
-
289
- return summary.join('\n');
290
- }
291
-
292
- export function renderAnalysisCsv(result: AnalysisResult): string {
293
- const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
294
- const rows = result.pages.map((p) => {
295
- const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
296
- return [
297
- p.url,
298
- p.seoScore,
299
- p.thinScore,
300
- statusStr,
301
- `"${(p.title.value || '').replace(/"/g, '""')}"`,
302
- p.title.length,
303
- `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`,
304
- p.metaDescription.length,
305
- p.content.wordCount,
306
- p.links.internalLinks,
307
- p.links.externalLinks
308
- ].join(',');
309
- });
310
-
311
- return [headers.join(','), ...rows].join('\n');
312
- }
313
-
314
- function escapeHtml(value: string): string {
315
- return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
316
- }
317
-
318
- export function analyzePages(rootUrl: string, pages: Iterable<CrawlPage> | CrawlPage[], robots?: any): PageAnalysis[] {
319
- const titleCounts = new Map<string, number>();
320
- const metaCounts = new Map<string, number>();
321
- const sentenceCountFrequency = new Map<number, number>();
322
-
323
- const results: PageAnalysis[] = [];
324
-
325
- for (const page of pages) {
326
- const html = page.html || '';
327
-
328
- // 0. Update crawl status based on current robots rules
329
- let crawlStatus = page.crawlStatus;
330
- if (robots) {
331
- const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
332
- (!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
333
- if (isBlocked) {
334
- crawlStatus = 'blocked_by_robots';
335
- }
336
- }
337
-
338
- // 1. Analyze Individual Components
339
- const title = analyzeTitle(html);
340
- const metaDescription = analyzeMetaDescription(html);
341
- const h1 = analyzeH1(html, title.value);
342
- const content = analyzeContent(html);
343
- const images = analyzeImageAlts(html);
344
- const links = analyzeLinks(html, page.url, rootUrl);
345
- const structuredData = analyzeStructuredData(html);
346
-
347
- // 2. Accumulate Frequencies for Duplicates
348
- if (title.value) {
349
- const key = (title.value || '').trim().toLowerCase();
350
- titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
351
- }
352
- if (metaDescription.value) {
353
- const key = (metaDescription.value || '').trim().toLowerCase();
354
- metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
355
- }
356
- sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
357
-
358
- // 3. Store Preliminary Result
359
- results.push({
360
- url: page.url,
361
- status: page.status || 0,
362
- title,
363
- metaDescription,
364
- h1,
365
- content,
366
- thinScore: 0, // Calculated in pass 2
367
- images,
368
- links,
369
- structuredData,
370
- seoScore: 0, // Calculated in pass 2
371
- meta: {
372
- canonical: page.canonical,
373
- noindex: page.noindex,
374
- nofollow: page.nofollow,
375
- crawlStatus
376
- }
377
- });
378
- }
379
-
380
- // 4. Finalize Statuses and Scores (Pass 2)
381
- for (const analysis of results) {
382
- // Check Title Duplicates
383
- if (analysis.title.value) {
384
- const key = (analysis.title.value || '').trim().toLowerCase();
385
- if ((titleCounts.get(key) || 0) > 1) {
386
- analysis.title.status = 'duplicate';
387
- }
388
- }
389
-
390
- // Check Meta Duplicates
391
- if (analysis.metaDescription.value) {
392
- const key = (analysis.metaDescription.value || '').trim().toLowerCase();
393
- if ((metaCounts.get(key) || 0) > 1) {
394
- analysis.metaDescription.status = 'duplicate';
395
- }
396
- }
397
-
398
- // Check Content Duplication
399
- const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
400
- analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
401
-
402
- // Calculate Final SEO Score
403
- analysis.seoScore = scorePageSeo(analysis);
404
- }
405
-
406
- return results;
407
- }
408
-
409
- function filterPageModules(
410
- page: PageAnalysis,
411
- modules: { seo: boolean; content: boolean; accessibility: boolean }
412
- ): PageAnalysis {
413
- const keepSeo = modules.seo;
414
- const keepContent = modules.content;
415
- const keepAccessibility = modules.accessibility;
416
-
417
- return {
418
- ...page,
419
- title: keepSeo ? page.title : { value: null, length: 0, status: 'missing' },
420
- metaDescription: keepSeo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
421
- h1: (keepSeo || keepContent) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false },
422
- links: keepSeo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
423
- structuredData: keepSeo ? page.structuredData : { present: false, valid: false, types: [] },
424
- content: keepContent ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
425
- thinScore: keepContent ? page.thinScore : 0,
426
- images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
427
- };
428
- }
429
-
430
- async function loadCrawlData(rootUrl: string): Promise<CrawlData> {
431
- const db = getDb();
432
- const siteRepo = new SiteRepository(db);
433
- const snapshotRepo = new SnapshotRepository(db);
434
- const pageRepo = new PageRepository(db);
435
-
436
- const urlObj = new URL(rootUrl);
437
- const domain = urlObj.hostname.replace('www.', '');
438
- const site = siteRepo.firstOrCreateSite(domain);
439
-
440
- let snapshot;
441
- const page = pageRepo.getPage(site.id, rootUrl);
442
- if (page && page.last_seen_snapshot_id) {
443
- snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
444
- }
445
-
446
- if (!snapshot) {
447
- snapshot = snapshotRepo.getLatestSnapshot(site.id);
448
- }
449
-
450
- if (!snapshot) {
451
- throw new Error(`No crawl data found for ${rootUrl} in database.`);
452
- }
453
-
454
- const graph = loadGraphFromSnapshot(snapshot.id);
455
- const metrics = calculateMetrics(graph, 5);
456
-
457
- // Use iterator to save memory
458
- const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
459
-
460
- // We need to map the DB pages to CrawlPage format lazily
461
- const pagesGenerator = function* () {
462
- for (const p of dbPagesIterator) {
463
- yield {
464
- url: p.normalized_url,
465
- status: p.http_status || 0,
466
- html: p.html || '',
467
- depth: p.depth || 0,
468
- canonical: p.canonical_url || undefined,
469
- noindex: !!p.noindex,
470
- nofollow: !!p.nofollow,
471
- crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
472
- } as CrawlPage;
473
- }
474
- };
475
-
476
- return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
477
- }
478
-
479
-
480
- async function runLiveCrawl(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<CrawlData> {
481
- const snapshotId = await crawl(url, {
482
- limit: 1, // Always limit to 1 for single page live analysis
483
- depth: 0,
484
- rate: options.rate,
485
- proxyUrl: options.proxyUrl,
486
- userAgent: options.userAgent,
487
- maxRedirects: options.maxRedirects,
488
- debug: options.debug,
489
- snapshotType: 'partial'
490
- }, context) as number;
491
- const graph = loadGraphFromSnapshot(snapshotId);
492
- const pages = graph.getNodes().map((node) => ({
493
- url: node.url,
494
- status: node.status,
495
- html: node.html || '', // Include HTML
496
- depth: node.depth,
497
- crawlStatus: node.crawlStatus
498
- }));
499
- return {
500
- pages,
501
- metrics: calculateMetrics(graph, 1),
502
- graph,
503
- snapshotId
504
- };
505
- }
@@ -1,62 +0,0 @@
1
- import { load } from 'cheerio';
2
-
3
- export interface ContentAnalysis {
4
- wordCount: number;
5
- textHtmlRatio: number;
6
- uniqueSentenceCount: number;
7
- }
8
-
9
- export interface ThinScoreWeights {
10
- lowWordWeight: number;
11
- ratioWeight: number;
12
- dupWeight: number;
13
- }
14
-
15
- const DEFAULT_WEIGHTS: ThinScoreWeights = {
16
- lowWordWeight: 0.4,
17
- ratioWeight: 0.35,
18
- dupWeight: 0.25
19
- };
20
-
21
- export function analyzeContent(html: string): ContentAnalysis {
22
- const $ = load(html || '<html></html>');
23
- $('script,style,nav,footer').remove();
24
-
25
- const text = $('body').length ? $('body').text() : $.text();
26
- const cleanText = text.replace(/\s+/g, ' ').trim();
27
-
28
- const words = cleanText ? cleanText.split(/\s+/).filter(Boolean) : [];
29
- const wordCount = words.length;
30
-
31
- const htmlLength = Math.max(html.length, 1);
32
- const textHtmlRatio = cleanText.length / htmlLength;
33
-
34
- const sentenceSet = new Set(
35
- cleanText
36
- .split(/[.!?]+/)
37
- .map((item) => item.trim().toLowerCase())
38
- .filter(Boolean)
39
- );
40
-
41
- return {
42
- wordCount,
43
- textHtmlRatio,
44
- uniqueSentenceCount: sentenceSet.size
45
- };
46
- }
47
-
48
- export function calculateThinContentScore(
49
- content: ContentAnalysis,
50
- duplicationScore: number,
51
- weights: ThinScoreWeights = DEFAULT_WEIGHTS
52
- ): number {
53
- const wordScore = content.wordCount >= 300 ? 0 : 100 - Math.min(100, (content.wordCount / 300) * 100);
54
- const textRatioScore = content.textHtmlRatio >= 0.2 ? 0 : 100 - Math.min(100, (content.textHtmlRatio / 0.2) * 100);
55
-
56
- const raw =
57
- weights.lowWordWeight * wordScore +
58
- weights.ratioWeight * textRatioScore +
59
- weights.dupWeight * duplicationScore;
60
-
61
- return Math.max(0, Math.min(100, Number(raw.toFixed(2))));
62
- }
@@ -1,28 +0,0 @@
1
- import { load } from 'cheerio';
2
-
3
- export interface ImageAltAnalysis {
4
- totalImages: number;
5
- missingAlt: number;
6
- emptyAlt: number;
7
- }
8
-
9
- export function analyzeImageAlts(html: string): ImageAltAnalysis {
10
- const $ = load(html);
11
- let missingAlt = 0;
12
- let emptyAlt = 0;
13
-
14
- $('img').each((_idx, el) => {
15
- const alt = $(el).attr('alt');
16
- if (alt === undefined) {
17
- missingAlt += 1;
18
- return;
19
- }
20
-
21
- if (!alt.trim()) {
22
- emptyAlt += 1;
23
- }
24
- });
25
-
26
- const totalImages = $('img').length;
27
- return { totalImages, missingAlt, emptyAlt };
28
- }
@@ -1,41 +0,0 @@
1
- import { load } from 'cheerio';
2
- import { normalizeUrl } from '../crawler/normalize.js';
3
-
4
- export interface LinkRatioAnalysis {
5
- internalLinks: number;
6
- externalLinks: number;
7
- nofollowCount: number;
8
- externalRatio: number;
9
- }
10
-
11
- export function analyzeLinks(html: string, pageUrl: string, rootUrl: string): LinkRatioAnalysis {
12
- const $ = load(html);
13
- const rootOrigin = new URL(rootUrl).origin;
14
-
15
- let internalLinks = 0;
16
- let externalLinks = 0;
17
- let nofollowCount = 0;
18
-
19
- $('a[href]').each((_idx, el) => {
20
- const href = $(el).attr('href');
21
- if (!href) return;
22
- const normalized = normalizeUrl(href, pageUrl, { stripQuery: false });
23
- if (!normalized) return;
24
-
25
- const rel = ($(el).attr('rel') || '').toLowerCase();
26
- if (rel.includes('nofollow')) {
27
- nofollowCount += 1;
28
- }
29
-
30
- if (new URL(normalized).origin === rootOrigin) {
31
- internalLinks += 1;
32
- } else {
33
- externalLinks += 1;
34
- }
35
- });
36
-
37
- const total = internalLinks + externalLinks;
38
- const externalRatio = total === 0 ? 0 : externalLinks / total;
39
-
40
- return { internalLinks, externalLinks, nofollowCount, externalRatio };
41
- }