@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -1,11 +1,9 @@
1
- import fs from 'node:fs/promises';
2
1
  import { crawl } from '../crawler/crawl.js';
3
2
  import { loadGraphFromSnapshot } from '../db/graphLoader.js';
4
3
  import { normalizeUrl } from '../crawler/normalize.js';
5
4
  import { calculateMetrics } from '../graph/metrics.js';
6
- import { Graph } from '../graph/graph.js';
7
5
  import { analyzeContent, calculateThinContentScore } from './content.js';
8
- import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses } from './seo.js';
6
+ import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
9
7
  import { analyzeImageAlts } from './images.js';
10
8
  import { analyzeLinks } from './links.js';
11
9
  import { analyzeStructuredData } from './structuredData.js';
@@ -15,36 +13,79 @@ import { getDb } from '../db/index.js';
15
13
  import { SiteRepository } from '../db/repositories/SiteRepository.js';
16
14
  import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
17
15
  import { PageRepository } from '../db/repositories/PageRepository.js';
18
- export async function analyzeSite(url, options) {
16
+ import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
17
+ /**
18
+ * Analyzes a site for SEO, content, and accessibility.
19
+ * Supports live crawling or loading from a database snapshot.
20
+ * Note: File-based data loading is not supported.
21
+ *
22
+ * @param url The root URL to analyze
23
+ * @param options Analysis options
24
+ * @param context Engine context for event emission
25
+ */
26
+ export async function analyzeSite(url, options, context) {
19
27
  const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
20
28
  if (!normalizedRoot) {
21
29
  throw new Error('Invalid URL for analysis');
22
30
  }
23
31
  let crawlData;
32
+ let robots = null;
33
+ // Always try to fetch robots.txt for the analysis session
34
+ // to ensure we have the latest rules for visibility reporting.
35
+ try {
36
+ const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
37
+ const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
38
+ const status = robotsRes.status;
39
+ if (typeof status === 'number' && status >= 200 && status < 300) {
40
+ const robotsParserModule = await import('robots-parser');
41
+ const robotsParser = robotsParserModule.default || robotsParserModule;
42
+ robots = robotsParser(robotsUrl, robotsRes.body);
43
+ }
44
+ }
45
+ catch {
46
+ // Silence robots fetch errors, fallback to existing or none
47
+ }
24
48
  if (options.live) {
25
- crawlData = await runLiveCrawl(normalizedRoot, options);
49
+ crawlData = await runLiveCrawl(normalizedRoot, options, context);
26
50
  }
27
51
  else {
28
52
  try {
29
- crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
53
+ crawlData = await loadCrawlData(normalizedRoot);
54
+ // Convert generator to array so it can be reused multiple times
55
+ const allPages = Array.from(crawlData.pages);
56
+ crawlData.pages = allPages;
57
+ // Check if the requested URL actually exists in this snapshot
58
+ const exists = allPages.some(p => p.url === normalizedRoot);
59
+ if (!exists) {
60
+ options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
61
+ if (context) {
62
+ context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
63
+ }
64
+ crawlData = await runLiveCrawl(normalizedRoot, options, context);
65
+ }
30
66
  }
31
67
  catch (error) {
32
68
  const isNotFound = error.code === 'ENOENT' ||
33
69
  error.message.includes('Crawl data not found') ||
34
70
  error.message.includes('No completed snapshot found') ||
35
71
  error.message.includes('not found in database');
36
- if (isNotFound && !options.fromCrawl) {
37
- console.log('No local crawl data found. Switching to live analysis mode...');
38
- crawlData = await runLiveCrawl(normalizedRoot, options);
72
+ if (isNotFound) {
73
+ options.live = true; // Force live mode
74
+ if (context) {
75
+ context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
76
+ }
77
+ crawlData = await runLiveCrawl(normalizedRoot, options, context);
39
78
  }
40
79
  else {
41
80
  throw error;
42
81
  }
43
82
  }
44
83
  }
84
+ const snapshotId = crawlData.snapshotId;
85
+ const crawledAt = crawlData.crawledAt;
45
86
  // Run clustering if requested or as default
46
87
  detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
47
- const pages = analyzePages(normalizedRoot, crawlData.pages);
88
+ const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
48
89
  const activeModules = {
49
90
  seo: !!options.seo,
50
91
  content: !!options.content,
@@ -56,13 +97,19 @@ export async function analyzeSite(url, options) {
56
97
  : pages;
57
98
  // Filter to only the requested URL
58
99
  const targetPage = filteredPages.find(p => p.url === normalizedRoot);
59
- const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);
100
+ let resultPages;
101
+ if (options.allPages) {
102
+ resultPages = filteredPages;
103
+ }
104
+ else {
105
+ resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
106
+ }
60
107
  const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
61
108
  const thinPages = pages.filter((page) => page.thinScore >= 70).length;
62
- const siteScores = aggregateSiteScore(crawlData.metrics, pages);
109
+ const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
63
110
  return {
64
111
  site_summary: {
65
- pages_analyzed: pages.length,
112
+ pages_analyzed: resultPages.length,
66
113
  avg_seo_score: siteScores.seoHealthScore,
67
114
  thin_pages: thinPages,
68
115
  duplicate_titles: duplicateTitles,
@@ -71,7 +118,9 @@ export async function analyzeSite(url, options) {
71
118
  site_scores: siteScores,
72
119
  pages: resultPages,
73
120
  active_modules: activeModules,
74
- clusters: crawlData.graph.contentClusters
121
+ clusters: crawlData.graph.contentClusters,
122
+ snapshotId,
123
+ crawledAt
75
124
  };
76
125
  }
77
126
  export function renderAnalysisHtml(result) {
@@ -79,141 +128,50 @@ export function renderAnalysisHtml(result) {
79
128
  return renderSinglePageHtml(result.pages[0]);
80
129
  }
81
130
  const rows = result.pages
82
- .map((page) => `< tr > <td>${escapeHtml(page.url)} </td><td>${page.seoScore}</td > <td>${page.thinScore} </td><td>${page.title.status}</td > <td>${page.metaDescription.status} </td></tr > `)
131
+ .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
83
132
  .join('');
84
- return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
133
+ return ANALYSIS_LIST_TEMPLATE
134
+ .replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
135
+ .replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
136
+ .replace('{{ROWS}}', rows);
85
137
  }
86
138
  function renderSinglePageHtml(page) {
87
- return `<!DOCTYPE html>
88
- <html lang="en">
89
- <head>
90
- <meta charset="UTF-8">
91
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
92
- <title>Analysis for ${escapeHtml(page.url)}</title>
93
- <style>
94
- body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
95
- h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
96
- h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
97
- .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
98
- .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
99
- .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
100
- .status-ok { color: green; font-weight: bold; }
101
- .status-warning { color: orange; font-weight: bold; }
102
- .status-critical { color: red; font-weight: bold; }
103
- .status-missing { color: red; font-weight: bold; }
104
- .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
105
- .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
106
- .data-table th { width: 150px; color: #666; }
107
- code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
108
- </style>
109
- </head>
110
- <body>
111
- <h1>Page Analysis</h1>
112
- <p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
113
-
114
- <div class="score-card">
115
- <div class="score-box">
116
- <div class="score-val">${page.seoScore}</div>
117
- <div>SEO Score</div>
118
- </div>
119
- <div class="score-box">
120
- <div class="score-val">${page.thinScore}</div>
121
- <div>Thin Content Score</div>
122
- </div>
123
- <div class="score-box">
124
- <div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
125
- <div>HTTP Status</div>
126
- </div>
127
- </div>
128
-
129
- <h2>Meta Tags</h2>
130
- <table class="data-table">
131
- <tr>
132
- <th>Title</th>
133
- <td>
134
- <div>${escapeHtml(page.title.value || '(missing)')}</div>
135
- <small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
136
- </td>
137
- </tr>
138
- <tr>
139
- <th>Description</th>
140
- <td>
141
- <div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
142
- <small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
143
- </td>
144
- </tr>
145
- <tr>
146
- <th>Canonical</th>
147
- <td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
148
- </tr>
149
- <tr>
150
- <th>Robots</th>
151
- <td>
152
- Index: ${!page.meta.noindex},
153
- Follow: ${!page.meta.nofollow}
154
- </td>
155
- </tr>
156
- </table>
157
-
158
- <h2>Content & Heading</h2>
159
- <table class="data-table">
160
- <tr>
161
- <th>H1 Tag</th>
162
- <td>
163
- Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
164
- (${page.h1.count} detected)
165
- ${page.h1.matchesTitle ? ' | Matches Title' : ''}
166
- </td>
167
- </tr>
168
- <tr>
169
- <th>Word Count</th>
170
- <td>${page.content.wordCount} words</td>
171
- </tr>
172
- <tr>
173
- <th>Unique Sentences</th>
174
- <td>${page.content.uniqueSentenceCount}</td>
175
- </tr>
176
- <tr>
177
- <th>Text / HTML Ratio</th>
178
- <td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
179
- </tr>
180
- </table>
181
-
182
- <h2>Links & Images</h2>
183
- <table class="data-table">
184
- <tr>
185
- <th>Internal Links</th>
186
- <td>${page.links.internalLinks}</td>
187
- </tr>
188
- <tr>
189
- <th>External Links</th>
190
- <td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
191
- </tr>
192
- <tr>
193
- <th>Images</th>
194
- <td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
195
- </tr>
196
- </table>
197
-
198
- <h2>Structured Data</h2>
199
- <table class="data-table">
200
- <tr>
201
- <th>Status</th>
202
- <td>
203
- ${page.structuredData.present
139
+ const structuredDataStatus = page.structuredData.present
204
140
  ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
205
- : 'Not detected'}
206
- </td>
207
- </tr>
208
- ${page.structuredData.present ? `
141
+ : 'Not detected';
142
+ const structuredDataTypesRow = page.structuredData.present ? `
209
143
  <tr>
210
144
  <th>Types Found</th>
211
145
  <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
212
146
  </tr>
213
- ` : ''}
214
- </table>
215
- </body>
216
- </html>`;
147
+ ` : '';
148
+ return ANALYSIS_PAGE_TEMPLATE
149
+ .replaceAll('{{URL}}', escapeHtml(page.url))
150
+ .replace('{{SEO_SCORE}}', page.seoScore.toString())
151
+ .replace('{{THIN_SCORE}}', page.thinScore.toString())
152
+ .replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
153
+ .replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
154
+ .replace('{{TITLE_LENGTH}}', page.title.length.toString())
155
+ .replaceAll('{{TITLE_STATUS}}', page.title.status)
156
+ .replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
157
+ .replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
158
+ .replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
159
+ .replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
160
+ .replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
161
+ .replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
162
+ .replaceAll('{{H1_STATUS}}', page.h1.status)
163
+ .replace('{{H1_COUNT}}', page.h1.count.toString())
164
+ .replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
165
+ .replace('{{WORD_COUNT}}', page.content.wordCount.toString())
166
+ .replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
167
+ .replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
168
+ .replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
169
+ .replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
170
+ .replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
171
+ .replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
172
+ .replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
173
+ .replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
174
+ .replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
217
175
  }
218
176
  export function renderAnalysisMarkdown(result) {
219
177
  const summary = [
@@ -259,48 +217,84 @@ export function renderAnalysisCsv(result) {
259
217
  function escapeHtml(value) {
260
218
  return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
261
219
  }
262
- function analyzePages(rootUrl, pages) {
263
- const titleCandidates = pages.map((page) => analyzeTitle(page.html || ''));
264
- const metaCandidates = pages.map((page) => analyzeMetaDescription(page.html || ''));
265
- const titles = applyDuplicateStatuses(titleCandidates);
266
- const metas = applyDuplicateStatuses(metaCandidates);
220
+ export function analyzePages(rootUrl, pages, robots) {
221
+ const titleCounts = new Map();
222
+ const metaCounts = new Map();
267
223
  const sentenceCountFrequency = new Map();
268
- const baseContent = pages.map((page) => analyzeContent(page.html || ''));
269
- for (const item of baseContent) {
270
- sentenceCountFrequency.set(item.uniqueSentenceCount, (sentenceCountFrequency.get(item.uniqueSentenceCount) || 0) + 1);
271
- }
272
- return pages.map((page, index) => {
224
+ const results = [];
225
+ for (const page of pages) {
273
226
  const html = page.html || '';
274
- const title = titles[index];
275
- const metaDescription = metas[index];
227
+ // 0. Update crawl status based on current robots rules
228
+ let crawlStatus = page.crawlStatus;
229
+ if (robots) {
230
+ const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
231
+ (!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
232
+ if (isBlocked) {
233
+ crawlStatus = 'blocked_by_robots';
234
+ }
235
+ }
236
+ // 1. Analyze Individual Components
237
+ const title = analyzeTitle(html);
238
+ const metaDescription = analyzeMetaDescription(html);
276
239
  const h1 = analyzeH1(html, title.value);
277
- const content = baseContent[index];
278
- const duplicationScore = (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
279
- const thinScore = calculateThinContentScore(content, duplicationScore);
240
+ const content = analyzeContent(html);
280
241
  const images = analyzeImageAlts(html);
281
242
  const links = analyzeLinks(html, page.url, rootUrl);
282
243
  const structuredData = analyzeStructuredData(html);
283
- const analysis = {
244
+ // 2. Accumulate Frequencies for Duplicates
245
+ if (title.value) {
246
+ const key = (title.value || '').trim().toLowerCase();
247
+ titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
248
+ }
249
+ if (metaDescription.value) {
250
+ const key = (metaDescription.value || '').trim().toLowerCase();
251
+ metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
252
+ }
253
+ sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
254
+ // 3. Store Preliminary Result
255
+ results.push({
284
256
  url: page.url,
285
257
  status: page.status || 0,
286
258
  title,
287
259
  metaDescription,
288
260
  h1,
289
261
  content,
290
- thinScore,
262
+ thinScore: 0, // Calculated in pass 2
291
263
  images,
292
264
  links,
293
265
  structuredData,
294
- seoScore: 0,
266
+ seoScore: 0, // Calculated in pass 2
295
267
  meta: {
296
268
  canonical: page.canonical,
297
269
  noindex: page.noindex,
298
- nofollow: page.nofollow
270
+ nofollow: page.nofollow,
271
+ crawlStatus
272
+ }
273
+ });
274
+ }
275
+ // 4. Finalize Statuses and Scores (Pass 2)
276
+ for (const analysis of results) {
277
+ // Check Title Duplicates
278
+ if (analysis.title.value) {
279
+ const key = (analysis.title.value || '').trim().toLowerCase();
280
+ if ((titleCounts.get(key) || 0) > 1) {
281
+ analysis.title.status = 'duplicate';
282
+ }
283
+ }
284
+ // Check Meta Duplicates
285
+ if (analysis.metaDescription.value) {
286
+ const key = (analysis.metaDescription.value || '').trim().toLowerCase();
287
+ if ((metaCounts.get(key) || 0) > 1) {
288
+ analysis.metaDescription.status = 'duplicate';
299
289
  }
300
- };
290
+ }
291
+ // Check Content Duplication
292
+ const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
293
+ analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
294
+ // Calculate Final SEO Score
301
295
  analysis.seoScore = scorePageSeo(analysis);
302
- return analysis;
303
- });
296
+ }
297
+ return results;
304
298
  }
305
299
  function filterPageModules(page, modules) {
306
300
  const keepSeo = modules.seo;
@@ -318,22 +312,7 @@ function filterPageModules(page, modules) {
318
312
  images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
319
313
  };
320
314
  }
321
- async function loadCrawlData(rootUrl, fromCrawl) {
322
- // If fromCrawl is provided, we could theoretically load JSON, but
323
- // we now default to DB fetching for all operations.
324
- if (fromCrawl) {
325
- try {
326
- const content = await fs.readFile(fromCrawl, 'utf-8');
327
- const raw = JSON.parse(content);
328
- const pages = parsePages(raw);
329
- const graph = graphFromPages(rootUrl, pages, raw);
330
- const metrics = calculateMetrics(graph, 5);
331
- return { pages, metrics, graph };
332
- }
333
- catch (_e) {
334
- // Fallback downwards if file doesn't exist
335
- }
336
- }
315
+ async function loadCrawlData(rootUrl) {
337
316
  const db = getDb();
338
317
  const siteRepo = new SiteRepository(db);
339
318
  const snapshotRepo = new SnapshotRepository(db);
@@ -341,96 +320,61 @@ async function loadCrawlData(rootUrl, fromCrawl) {
341
320
  const urlObj = new URL(rootUrl);
342
321
  const domain = urlObj.hostname.replace('www.', '');
343
322
  const site = siteRepo.firstOrCreateSite(domain);
344
- const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
323
+ let snapshot;
324
+ const page = pageRepo.getPage(site.id, rootUrl);
325
+ if (page && page.last_seen_snapshot_id) {
326
+ snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
327
+ }
345
328
  if (!snapshot) {
346
- throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
329
+ snapshot = snapshotRepo.getLatestSnapshot(site.id);
330
+ }
331
+ if (!snapshot) {
332
+ throw new Error(`No crawl data found for ${rootUrl} in database.`);
347
333
  }
348
334
  const graph = loadGraphFromSnapshot(snapshot.id);
349
335
  const metrics = calculateMetrics(graph, 5);
350
- // We also need the `pages` array for analysis.
351
- // It needs `html` which might not be fully available unless we look up from the DB or Graph.
352
- // Wait, the Graph stores Node which doesn't contain HTML since we removed it from memory?
353
- // Actually, `loadGraphFromSnapshot` does NOT load actual raw HTML from nodes to save memory.
354
- // We need HTML for `analyzeSite` module! So we must fetch it from `pageRepo`.
355
- const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
356
- const pages = dbPages.map((p) => ({
357
- url: p.normalized_url,
358
- status: p.http_status || 0,
359
- html: p.html || '',
360
- depth: p.depth || 0
361
- }));
362
- return { pages, metrics, graph };
363
- }
364
- function parsePages(raw) {
365
- if (Array.isArray(raw.pages)) {
366
- return raw.pages.map((page) => {
367
- const p = page;
368
- return {
369
- url: String(p.url || ''),
370
- status: Number(p.status || 0),
371
- html: typeof p.html === 'string' ? p.html : '',
372
- depth: Number(p.depth || 0)
336
+ // Use iterator to save memory
337
+ const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
338
+ // We need to map the DB pages to CrawlPage format lazily
339
+ const pagesGenerator = function* () {
340
+ for (const p of dbPagesIterator) {
341
+ yield {
342
+ url: p.normalized_url,
343
+ status: p.http_status || 0,
344
+ html: p.html || '',
345
+ depth: p.depth || 0,
346
+ canonical: p.canonical_url || undefined,
347
+ noindex: !!p.noindex,
348
+ nofollow: !!p.nofollow,
349
+ crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
373
350
  };
374
- }).filter((page) => Boolean(page.url));
375
- }
376
- if (Array.isArray(raw.nodes)) {
377
- return raw.nodes.map((node) => {
378
- const n = node;
379
- return {
380
- url: String(n.url || ''),
381
- status: Number(n.status || 0),
382
- html: typeof n.html === 'string' ? n.html : '',
383
- depth: Number(n.depth || 0)
384
- };
385
- }).filter((page) => Boolean(page.url));
386
- }
387
- return [];
388
- }
389
- function graphFromPages(rootUrl, pages, raw) {
390
- const graph = new Graph();
391
- for (const page of pages) {
392
- graph.addNode(page.url, page.depth || 0, page.status || 0);
393
- }
394
- if (Array.isArray(raw.edges)) {
395
- for (const edge of raw.edges) {
396
- const e = edge;
397
- if (typeof e.source === 'string' && typeof e.target === 'string') {
398
- graph.addNode(e.source, 0, 0);
399
- graph.addNode(e.target, 0, 0);
400
- graph.addEdge(e.source, e.target);
401
- }
402
351
  }
403
- return graph;
404
- }
405
- for (const page of pages) {
406
- if (!page.html)
407
- continue;
408
- const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
409
- if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0)
410
- continue;
411
- }
412
- return graph;
352
+ };
353
+ return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
413
354
  }
414
- async function runLiveCrawl(url, options) {
355
+ async function runLiveCrawl(url, options, context) {
415
356
  const snapshotId = await crawl(url, {
416
- limit: 1,
357
+ limit: 1, // Always limit to 1 for single page live analysis
417
358
  depth: 0,
418
359
  rate: options.rate,
419
360
  proxyUrl: options.proxyUrl,
420
361
  userAgent: options.userAgent,
421
362
  maxRedirects: options.maxRedirects,
422
- debug: options.debug
423
- });
363
+ debug: options.debug,
364
+ snapshotType: 'partial'
365
+ }, context);
424
366
  const graph = loadGraphFromSnapshot(snapshotId);
425
367
  const pages = graph.getNodes().map((node) => ({
426
368
  url: node.url,
427
369
  status: node.status,
428
370
  html: node.html || '', // Include HTML
429
- depth: node.depth
371
+ depth: node.depth,
372
+ crawlStatus: node.crawlStatus
430
373
  }));
431
374
  return {
432
375
  pages,
433
376
  metrics: calculateMetrics(graph, 1),
434
- graph
377
+ graph,
378
+ snapshotId
435
379
  };
436
380
  }
@@ -1,4 +1,7 @@
1
1
  export function scorePageSeo(page) {
2
+ if (page.meta.crawlStatus === 'blocked_by_robots') {
3
+ return 0;
4
+ }
2
5
  const titleMeta = (scoreTextStatus(page.title.status) + scoreTextStatus(page.metaDescription.status)) / 2;
3
6
  const h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
4
7
  const wordQuality = Math.min(100, (page.content.wordCount / 600) * 100) * 0.7 + Math.min(100, page.content.textHtmlRatio * 500) * 0.3;
@@ -33,7 +36,10 @@ export function aggregateSiteScore(metrics, pages) {
33
36
  const entropyScore = Math.max(0, 100 - Math.abs(metrics.structuralEntropy - 2) * 25);
34
37
  const orphanPenalty = metrics.totalPages === 0 ? 0 : (metrics.orphanPages.length / metrics.totalPages) * 100;
35
38
  const authorityEntropyOrphanScore = Math.max(0, Math.min(100, (avgAuthority * 100 * 0.4) + (entropyScore * 0.35) + ((100 - orphanPenalty) * 0.25)));
36
- const overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
39
+ let overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
40
+ if (pages.some(p => p.meta.crawlStatus === 'blocked_by_robots')) {
41
+ overallScore = 0;
42
+ }
37
43
  return {
38
44
  seoHealthScore: Number(seoHealthScore.toFixed(2)),
39
45
  authorityEntropyOrphanScore: Number(authorityEntropyOrphanScore.toFixed(2)),
@@ -0,0 +1,2 @@
1
+ export declare const ANALYSIS_LIST_TEMPLATE: string;
2
+ export declare const ANALYSIS_PAGE_TEMPLATE: string;
@@ -0,0 +1,7 @@
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+ import { fileURLToPath } from 'node:url';
4
+ const __filename = fileURLToPath(import.meta.url);
5
+ const __dirname = path.dirname(__filename);
6
+ export const ANALYSIS_LIST_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_list.html'), 'utf-8');
7
+ export const ANALYSIS_PAGE_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_page.html'), 'utf-8');
@@ -1,3 +1,5 @@
1
+ import * as dns from 'dns';
2
+ import { Agent } from 'undici';
1
3
  export declare class IPGuard {
2
4
  /**
3
5
  * Checks if an IP address is internal/private
@@ -7,5 +9,14 @@ export declare class IPGuard {
7
9
  * Resolves a hostname and validates all result IPs
8
10
  */
9
11
  static validateHost(host: string): Promise<boolean>;
12
+ /**
13
+ * Custom lookup function for undici that validates the resolved IP.
14
+ * Prevents DNS Rebinding attacks by checking the IP immediately before connection.
15
+ */
16
+ static secureLookup(hostname: string, options: dns.LookupOneOptions | dns.LookupAllOptions, callback: (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => void): void;
17
+ /**
18
+ * Returns an undici Agent configured with secure DNS lookup.
19
+ */
20
+ static getSecureDispatcher(): Agent;
10
21
  private static expandIPv6;
11
22
  }