@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -1,11 +1,10 @@
1
- import fs from 'node:fs/promises';
2
1
  import { crawl } from '../crawler/crawl.js';
3
2
  import { loadGraphFromSnapshot } from '../db/graphLoader.js';
4
3
  import { normalizeUrl } from '../crawler/normalize.js';
5
4
  import { calculateMetrics, Metrics } from '../graph/metrics.js';
6
5
  import { Graph, ClusterInfo } from '../graph/graph.js';
7
6
  import { analyzeContent, calculateThinContentScore } from './content.js';
8
- import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses, H1Analysis, TextFieldAnalysis } from './seo.js';
7
+ import { analyzeH1, analyzeMetaDescription, analyzeTitle, H1Analysis, TextFieldAnalysis } from './seo.js';
9
8
  import { analyzeImageAlts, ImageAltAnalysis } from './images.js';
10
9
  import { analyzeLinks, LinkRatioAnalysis } from './links.js';
11
10
  import { analyzeStructuredData, StructuredDataResult } from './structuredData.js';
@@ -15,6 +14,8 @@ import { getDb } from '../db/index.js';
15
14
  import { SiteRepository } from '../db/repositories/SiteRepository.js';
16
15
  import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
17
16
  import { PageRepository } from '../db/repositories/PageRepository.js';
17
+ import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
18
+ import { EngineContext } from '../events.js';
18
19
 
19
20
  export interface CrawlPage {
20
21
  url: string;
@@ -24,12 +25,11 @@ export interface CrawlPage {
24
25
  canonical?: string;
25
26
  noindex?: boolean;
26
27
  nofollow?: boolean;
28
+ crawlStatus?: string;
27
29
  }
28
30
 
29
31
  export interface AnalyzeOptions {
30
- fromCrawl?: string;
31
32
  live?: boolean;
32
- html?: boolean;
33
33
  seo?: boolean;
34
34
  content?: boolean;
35
35
  accessibility?: boolean;
@@ -40,6 +40,7 @@ export interface AnalyzeOptions {
40
40
  debug?: boolean;
41
41
  clusterThreshold?: number;
42
42
  minClusterSize?: number;
43
+ allPages?: boolean;
43
44
  }
44
45
 
45
46
  export interface PageAnalysis {
@@ -58,6 +59,7 @@ export interface PageAnalysis {
58
59
  canonical?: string;
59
60
  noindex?: boolean;
60
61
  nofollow?: boolean;
62
+ crawlStatus?: string;
61
63
  }
62
64
  }
63
65
 
@@ -77,45 +79,93 @@ export interface AnalysisResult {
77
79
  accessibility: boolean;
78
80
  };
79
81
  clusters?: ClusterInfo[];
82
+ snapshotId?: number;
83
+ crawledAt?: string;
80
84
  }
81
85
 
82
86
  interface CrawlData {
83
- pages: CrawlPage[];
87
+ pages: Iterable<CrawlPage> | CrawlPage[];
84
88
  metrics: Metrics;
85
89
  graph: Graph;
90
+ snapshotId: number;
91
+ crawledAt?: string;
86
92
  }
87
93
 
88
- export async function analyzeSite(url: string, options: AnalyzeOptions): Promise<AnalysisResult> {
94
+ /**
95
+ * Analyzes a site for SEO, content, and accessibility.
96
+ * Supports live crawling or loading from a database snapshot.
97
+ * Note: File-based data loading is not supported.
98
+ *
99
+ * @param url The root URL to analyze
100
+ * @param options Analysis options
101
+ * @param context Engine context for event emission
102
+ */
103
+ export async function analyzeSite(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<AnalysisResult> {
89
104
  const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
90
105
  if (!normalizedRoot) {
91
106
  throw new Error('Invalid URL for analysis');
92
107
  }
93
108
 
94
109
  let crawlData: CrawlData;
95
-
110
+ let robots: any = null;
111
+
112
+ // Always try to fetch robots.txt for the analysis session
113
+ // to ensure we have the latest rules for visibility reporting.
114
+ try {
115
+ const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
116
+ const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
117
+ const status = robotsRes.status;
118
+ if (typeof status === 'number' && status >= 200 && status < 300) {
119
+ const robotsParserModule = await import('robots-parser');
120
+ const robotsParser = (robotsParserModule as any).default || robotsParserModule;
121
+ robots = (robotsParser as any)(robotsUrl, robotsRes.body);
122
+ }
123
+ } catch {
124
+ // Silence robots fetch errors, fallback to existing or none
125
+ }
96
126
  if (options.live) {
97
- crawlData = await runLiveCrawl(normalizedRoot, options);
127
+ crawlData = await runLiveCrawl(normalizedRoot, options, context);
98
128
  } else {
99
129
  try {
100
- crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
130
+ crawlData = await loadCrawlData(normalizedRoot);
131
+
132
+ // Convert generator to array so it can be reused multiple times
133
+ const allPages = Array.from(crawlData.pages);
134
+ crawlData.pages = allPages;
135
+
136
+ // Check if the requested URL actually exists in this snapshot
137
+ const exists = allPages.some(p => p.url === normalizedRoot);
138
+ if (!exists) {
139
+ options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
140
+ if (context) {
141
+ context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
142
+ }
143
+ crawlData = await runLiveCrawl(normalizedRoot, options, context);
144
+ }
101
145
  } catch (error: any) {
102
146
  const isNotFound = error.code === 'ENOENT' ||
103
147
  error.message.includes('Crawl data not found') ||
104
148
  error.message.includes('No completed snapshot found') ||
105
149
  error.message.includes('not found in database');
106
- if (isNotFound && !options.fromCrawl) {
107
- console.log('No local crawl data found. Switching to live analysis mode...');
108
- crawlData = await runLiveCrawl(normalizedRoot, options);
150
+ if (isNotFound) {
151
+ options.live = true; // Force live mode
152
+ if (context) {
153
+ context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
154
+ }
155
+ crawlData = await runLiveCrawl(normalizedRoot, options, context);
109
156
  } else {
110
157
  throw error;
111
158
  }
112
159
  }
113
160
  }
114
161
 
162
+ const snapshotId = crawlData.snapshotId;
163
+ const crawledAt = crawlData.crawledAt;
164
+
115
165
  // Run clustering if requested or as default
116
166
  detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
117
167
 
118
- const pages = analyzePages(normalizedRoot, crawlData.pages);
168
+ const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
119
169
 
120
170
  const activeModules = {
121
171
  seo: !!options.seo,
@@ -131,15 +181,21 @@ export async function analyzeSite(url: string, options: AnalyzeOptions): Promise
131
181
 
132
182
  // Filter to only the requested URL
133
183
  const targetPage = filteredPages.find(p => p.url === normalizedRoot);
134
- const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);
184
+ let resultPages: PageAnalysis[];
185
+
186
+ if (options.allPages) {
187
+ resultPages = filteredPages;
188
+ } else {
189
+ resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
190
+ }
135
191
 
136
192
  const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
137
193
  const thinPages = pages.filter((page) => page.thinScore >= 70).length;
138
- const siteScores = aggregateSiteScore(crawlData.metrics, pages);
194
+ const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
139
195
 
140
196
  return {
141
197
  site_summary: {
142
- pages_analyzed: pages.length,
198
+ pages_analyzed: resultPages.length,
143
199
  avg_seo_score: siteScores.seoHealthScore,
144
200
  thin_pages: thinPages,
145
201
  duplicate_titles: duplicateTitles,
@@ -148,7 +204,9 @@ export async function analyzeSite(url: string, options: AnalyzeOptions): Promise
148
204
  site_scores: siteScores,
149
205
  pages: resultPages,
150
206
  active_modules: activeModules,
151
- clusters: crawlData.graph.contentClusters
207
+ clusters: crawlData.graph.contentClusters,
208
+ snapshotId,
209
+ crawledAt
152
210
  };
153
211
  }
154
212
 
@@ -157,144 +215,54 @@ export function renderAnalysisHtml(result: AnalysisResult): string {
157
215
  return renderSinglePageHtml(result.pages[0]);
158
216
  }
159
217
  const rows = result.pages
160
- .map((page) => `< tr > <td>${escapeHtml(page.url)} </td><td>${page.seoScore}</td > <td>${page.thinScore} </td><td>${page.title.status}</td > <td>${page.metaDescription.status} </td></tr > `)
218
+ .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
161
219
  .join('');
162
220
 
163
- return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
221
+ return ANALYSIS_LIST_TEMPLATE
222
+ .replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
223
+ .replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
224
+ .replace('{{ROWS}}', rows);
164
225
  }
165
226
 
166
227
  function renderSinglePageHtml(page: PageAnalysis): string {
167
- return `<!DOCTYPE html>
168
- <html lang="en">
169
- <head>
170
- <meta charset="UTF-8">
171
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
172
- <title>Analysis for ${escapeHtml(page.url)}</title>
173
- <style>
174
- body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
175
- h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
176
- h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
177
- .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
178
- .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
179
- .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
180
- .status-ok { color: green; font-weight: bold; }
181
- .status-warning { color: orange; font-weight: bold; }
182
- .status-critical { color: red; font-weight: bold; }
183
- .status-missing { color: red; font-weight: bold; }
184
- .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
185
- .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
186
- .data-table th { width: 150px; color: #666; }
187
- code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
188
- </style>
189
- </head>
190
- <body>
191
- <h1>Page Analysis</h1>
192
- <p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
193
-
194
- <div class="score-card">
195
- <div class="score-box">
196
- <div class="score-val">${page.seoScore}</div>
197
- <div>SEO Score</div>
198
- </div>
199
- <div class="score-box">
200
- <div class="score-val">${page.thinScore}</div>
201
- <div>Thin Content Score</div>
202
- </div>
203
- <div class="score-box">
204
- <div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
205
- <div>HTTP Status</div>
206
- </div>
207
- </div>
208
-
209
- <h2>Meta Tags</h2>
210
- <table class="data-table">
211
- <tr>
212
- <th>Title</th>
213
- <td>
214
- <div>${escapeHtml(page.title.value || '(missing)')}</div>
215
- <small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
216
- </td>
217
- </tr>
218
- <tr>
219
- <th>Description</th>
220
- <td>
221
- <div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
222
- <small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
223
- </td>
224
- </tr>
225
- <tr>
226
- <th>Canonical</th>
227
- <td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
228
- </tr>
229
- <tr>
230
- <th>Robots</th>
231
- <td>
232
- Index: ${!page.meta.noindex},
233
- Follow: ${!page.meta.nofollow}
234
- </td>
235
- </tr>
236
- </table>
237
-
238
- <h2>Content & Heading</h2>
239
- <table class="data-table">
240
- <tr>
241
- <th>H1 Tag</th>
242
- <td>
243
- Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
244
- (${page.h1.count} detected)
245
- ${page.h1.matchesTitle ? ' | Matches Title' : ''}
246
- </td>
247
- </tr>
248
- <tr>
249
- <th>Word Count</th>
250
- <td>${page.content.wordCount} words</td>
251
- </tr>
252
- <tr>
253
- <th>Unique Sentences</th>
254
- <td>${page.content.uniqueSentenceCount}</td>
255
- </tr>
256
- <tr>
257
- <th>Text / HTML Ratio</th>
258
- <td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
259
- </tr>
260
- </table>
228
+ const structuredDataStatus = page.structuredData.present
229
+ ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
230
+ : 'Not detected';
261
231
 
262
- <h2>Links & Images</h2>
263
- <table class="data-table">
264
- <tr>
265
- <th>Internal Links</th>
266
- <td>${page.links.internalLinks}</td>
267
- </tr>
268
- <tr>
269
- <th>External Links</th>
270
- <td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
271
- </tr>
272
- <tr>
273
- <th>Images</th>
274
- <td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
275
- </tr>
276
- </table>
277
-
278
- <h2>Structured Data</h2>
279
- <table class="data-table">
280
- <tr>
281
- <th>Status</th>
282
- <td>
283
- ${page.structuredData.present
284
- ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
285
- : 'Not detected'
286
- }
287
- </td>
288
- </tr>
289
- ${page.structuredData.present ? `
232
+ const structuredDataTypesRow = page.structuredData.present ? `
290
233
  <tr>
291
234
  <th>Types Found</th>
292
235
  <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
293
236
  </tr>
294
- ` : ''}
295
- </table>
296
- </body>
297
- </html>`;
237
+ ` : '';
238
+
239
+ return ANALYSIS_PAGE_TEMPLATE
240
+ .replaceAll('{{URL}}', escapeHtml(page.url))
241
+ .replace('{{SEO_SCORE}}', page.seoScore.toString())
242
+ .replace('{{THIN_SCORE}}', page.thinScore.toString())
243
+ .replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
244
+ .replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
245
+ .replace('{{TITLE_LENGTH}}', page.title.length.toString())
246
+ .replaceAll('{{TITLE_STATUS}}', page.title.status)
247
+ .replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
248
+ .replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
249
+ .replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
250
+ .replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
251
+ .replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
252
+ .replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
253
+ .replaceAll('{{H1_STATUS}}', page.h1.status)
254
+ .replace('{{H1_COUNT}}', page.h1.count.toString())
255
+ .replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
256
+ .replace('{{WORD_COUNT}}', page.content.wordCount.toString())
257
+ .replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
258
+ .replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
259
+ .replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
260
+ .replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
261
+ .replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
262
+ .replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
263
+ .replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
264
+ .replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
265
+ .replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
298
266
  }
299
267
 
300
268
  export function renderAnalysisMarkdown(result: AnalysisResult): string {
@@ -347,52 +315,95 @@ function escapeHtml(value: string): string {
347
315
  return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
348
316
  }
349
317
 
350
- function analyzePages(rootUrl: string, pages: CrawlPage[]): PageAnalysis[] {
351
- const titleCandidates = pages.map((page) => analyzeTitle(page.html || ''));
352
- const metaCandidates = pages.map((page) => analyzeMetaDescription(page.html || ''));
353
- const titles = applyDuplicateStatuses(titleCandidates);
354
- const metas = applyDuplicateStatuses(metaCandidates);
355
-
318
+ export function analyzePages(rootUrl: string, pages: Iterable<CrawlPage> | CrawlPage[], robots?: any): PageAnalysis[] {
319
+ const titleCounts = new Map<string, number>();
320
+ const metaCounts = new Map<string, number>();
356
321
  const sentenceCountFrequency = new Map<number, number>();
357
- const baseContent = pages.map((page) => analyzeContent(page.html || ''));
358
- for (const item of baseContent) {
359
- sentenceCountFrequency.set(item.uniqueSentenceCount, (sentenceCountFrequency.get(item.uniqueSentenceCount) || 0) + 1);
360
- }
361
322
 
362
- return pages.map((page, index) => {
323
+ const results: PageAnalysis[] = [];
324
+
325
+ for (const page of pages) {
363
326
  const html = page.html || '';
364
- const title = titles[index];
365
- const metaDescription = metas[index];
327
+
328
+ // 0. Update crawl status based on current robots rules
329
+ let crawlStatus = page.crawlStatus;
330
+ if (robots) {
331
+ const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
332
+ (!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
333
+ if (isBlocked) {
334
+ crawlStatus = 'blocked_by_robots';
335
+ }
336
+ }
337
+
338
+ // 1. Analyze Individual Components
339
+ const title = analyzeTitle(html);
340
+ const metaDescription = analyzeMetaDescription(html);
366
341
  const h1 = analyzeH1(html, title.value);
367
- const content = baseContent[index];
368
- const duplicationScore = (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
369
- const thinScore = calculateThinContentScore(content, duplicationScore);
342
+ const content = analyzeContent(html);
370
343
  const images = analyzeImageAlts(html);
371
344
  const links = analyzeLinks(html, page.url, rootUrl);
372
345
  const structuredData = analyzeStructuredData(html);
373
346
 
374
- const analysis: PageAnalysis = {
347
+ // 2. Accumulate Frequencies for Duplicates
348
+ if (title.value) {
349
+ const key = (title.value || '').trim().toLowerCase();
350
+ titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
351
+ }
352
+ if (metaDescription.value) {
353
+ const key = (metaDescription.value || '').trim().toLowerCase();
354
+ metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
355
+ }
356
+ sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
357
+
358
+ // 3. Store Preliminary Result
359
+ results.push({
375
360
  url: page.url,
376
361
  status: page.status || 0,
377
362
  title,
378
363
  metaDescription,
379
364
  h1,
380
365
  content,
381
- thinScore,
366
+ thinScore: 0, // Calculated in pass 2
382
367
  images,
383
368
  links,
384
369
  structuredData,
385
- seoScore: 0,
370
+ seoScore: 0, // Calculated in pass 2
386
371
  meta: {
387
372
  canonical: page.canonical,
388
373
  noindex: page.noindex,
389
- nofollow: page.nofollow
374
+ nofollow: page.nofollow,
375
+ crawlStatus
376
+ }
377
+ });
378
+ }
379
+
380
+ // 4. Finalize Statuses and Scores (Pass 2)
381
+ for (const analysis of results) {
382
+ // Check Title Duplicates
383
+ if (analysis.title.value) {
384
+ const key = (analysis.title.value || '').trim().toLowerCase();
385
+ if ((titleCounts.get(key) || 0) > 1) {
386
+ analysis.title.status = 'duplicate';
390
387
  }
391
- };
388
+ }
392
389
 
390
+ // Check Meta Duplicates
391
+ if (analysis.metaDescription.value) {
392
+ const key = (analysis.metaDescription.value || '').trim().toLowerCase();
393
+ if ((metaCounts.get(key) || 0) > 1) {
394
+ analysis.metaDescription.status = 'duplicate';
395
+ }
396
+ }
397
+
398
+ // Check Content Duplication
399
+ const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
400
+ analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
401
+
402
+ // Calculate Final SEO Score
393
403
  analysis.seoScore = scorePageSeo(analysis);
394
- return analysis;
395
- });
404
+ }
405
+
406
+ return results;
396
407
  }
397
408
 
398
409
  function filterPageModules(
@@ -416,23 +427,7 @@ function filterPageModules(
416
427
  };
417
428
  }
418
429
 
419
- async function loadCrawlData(rootUrl: string, fromCrawl?: string): Promise<CrawlData> {
420
- // If fromCrawl is provided, we could theoretically load JSON, but
421
- // we now default to DB fetching for all operations.
422
-
423
- if (fromCrawl) {
424
- try {
425
- const content = await fs.readFile(fromCrawl, 'utf-8');
426
- const raw = JSON.parse(content) as Record<string, unknown>;
427
- const pages = parsePages(raw);
428
- const graph = graphFromPages(rootUrl, pages, raw);
429
- const metrics = calculateMetrics(graph, 5);
430
- return { pages, metrics, graph };
431
- } catch (_e) {
432
- // Fallback downwards if file doesn't exist
433
- }
434
- }
435
-
430
+ async function loadCrawlData(rootUrl: string): Promise<CrawlData> {
436
431
  const db = getDb();
437
432
  const siteRepo = new SiteRepository(db);
438
433
  const snapshotRepo = new SnapshotRepository(db);
@@ -442,107 +437,69 @@ async function loadCrawlData(rootUrl: string, fromCrawl?: string): Promise<Crawl
442
437
  const domain = urlObj.hostname.replace('www.', '');
443
438
  const site = siteRepo.firstOrCreateSite(domain);
444
439
 
445
- const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
446
- if (!snapshot) {
447
- throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
440
+ let snapshot;
441
+ const page = pageRepo.getPage(site.id, rootUrl);
442
+ if (page && page.last_seen_snapshot_id) {
443
+ snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
448
444
  }
449
445
 
450
- const graph = loadGraphFromSnapshot(snapshot.id);
451
- const metrics = calculateMetrics(graph, 5);
452
-
453
- // We also need the `pages` array for analysis.
454
- // It needs `html` which might not be fully available unless we look up from the DB or Graph.
455
- // Wait, the Graph stores Node which doesn't contain HTML since we removed it from memory?
456
- // Actually, `loadGraphFromSnapshot` does NOT load actual raw HTML from nodes to save memory.
457
- // We need HTML for `analyzeSite` module! So we must fetch it from `pageRepo`.
458
-
459
- const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
460
- const pages: CrawlPage[] = dbPages.map((p: any) => ({
461
- url: p.normalized_url,
462
- status: p.http_status || 0,
463
- html: p.html || '',
464
- depth: p.depth || 0
465
- }));
466
-
467
- return { pages, metrics, graph };
468
- }
469
-
470
- function parsePages(raw: Record<string, unknown>): CrawlPage[] {
471
- if (Array.isArray(raw.pages)) {
472
- return raw.pages.map((page) => {
473
- const p = page as Record<string, unknown>;
474
- return {
475
- url: String(p.url || ''),
476
- status: Number(p.status || 0),
477
- html: typeof p.html === 'string' ? p.html : '',
478
- depth: Number(p.depth || 0)
479
- };
480
- }).filter((page) => Boolean(page.url));
446
+ if (!snapshot) {
447
+ snapshot = snapshotRepo.getLatestSnapshot(site.id);
481
448
  }
482
449
 
483
- if (Array.isArray(raw.nodes)) {
484
- return raw.nodes.map((node) => {
485
- const n = node as Record<string, unknown>;
486
- return {
487
- url: String(n.url || ''),
488
- status: Number(n.status || 0),
489
- html: typeof n.html === 'string' ? n.html : '',
490
- depth: Number(n.depth || 0)
491
- };
492
- }).filter((page) => Boolean(page.url));
450
+ if (!snapshot) {
451
+ throw new Error(`No crawl data found for ${rootUrl} in database.`);
493
452
  }
494
453
 
495
- return [];
496
- }
497
-
498
- function graphFromPages(rootUrl: string, pages: CrawlPage[], raw: Record<string, unknown>): Graph {
499
- const graph = new Graph();
500
-
501
- for (const page of pages) {
502
- graph.addNode(page.url, page.depth || 0, page.status || 0);
503
- }
454
+ const graph = loadGraphFromSnapshot(snapshot.id);
455
+ const metrics = calculateMetrics(graph, 5);
504
456
 
505
- if (Array.isArray(raw.edges)) {
506
- for (const edge of raw.edges) {
507
- const e = edge as Record<string, unknown>;
508
- if (typeof e.source === 'string' && typeof e.target === 'string') {
509
- graph.addNode(e.source, 0, 0);
510
- graph.addNode(e.target, 0, 0);
511
- graph.addEdge(e.source, e.target);
512
- }
457
+ // Use iterator to save memory
458
+ const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
459
+
460
+ // We need to map the DB pages to CrawlPage format lazily
461
+ const pagesGenerator = function* () {
462
+ for (const p of dbPagesIterator) {
463
+ yield {
464
+ url: p.normalized_url,
465
+ status: p.http_status || 0,
466
+ html: p.html || '',
467
+ depth: p.depth || 0,
468
+ canonical: p.canonical_url || undefined,
469
+ noindex: !!p.noindex,
470
+ nofollow: !!p.nofollow,
471
+ crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
472
+ } as CrawlPage;
513
473
  }
514
- return graph;
515
- }
516
-
517
- for (const page of pages) {
518
- if (!page.html) continue;
519
- const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
520
- if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0) continue;
521
- }
474
+ };
522
475
 
523
- return graph;
476
+ return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
524
477
  }
525
478
 
526
- async function runLiveCrawl(url: string, options: AnalyzeOptions): Promise<CrawlData> {
479
+
480
+ async function runLiveCrawl(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<CrawlData> {
527
481
  const snapshotId = await crawl(url, {
528
- limit: 1,
482
+ limit: 1, // Always limit to 1 for single page live analysis
529
483
  depth: 0,
530
484
  rate: options.rate,
531
485
  proxyUrl: options.proxyUrl,
532
486
  userAgent: options.userAgent,
533
487
  maxRedirects: options.maxRedirects,
534
- debug: options.debug
535
- });
488
+ debug: options.debug,
489
+ snapshotType: 'partial'
490
+ }, context) as number;
536
491
  const graph = loadGraphFromSnapshot(snapshotId);
537
492
  const pages = graph.getNodes().map((node) => ({
538
493
  url: node.url,
539
494
  status: node.status,
540
495
  html: node.html || '', // Include HTML
541
- depth: node.depth
496
+ depth: node.depth,
497
+ crawlStatus: node.crawlStatus
542
498
  }));
543
499
  return {
544
500
  pages,
545
501
  metrics: calculateMetrics(graph, 1),
546
- graph
502
+ graph,
503
+ snapshotId
547
504
  };
548
505
  }