@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
@@ -0,0 +1,518 @@
1
+ import chalk from 'chalk';
2
+ import pLimit from 'p-limit';
3
+ import robotsParser from 'robots-parser';
4
+ import { Fetcher } from './fetcher.js';
5
+ import { Parser } from './parser.js';
6
+ import { Sitemap } from './sitemap.js';
7
+ import { normalizeUrl } from './normalize.js';
8
+ import { TrapDetector } from './trap.js';
9
+ import { ScopeManager } from '../core/scope/scopeManager.js';
10
+ import { getDb } from '../db/index.js';
11
+ import { SiteRepository } from '../db/repositories/SiteRepository.js';
12
+ import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
13
+ import { PageRepository } from '../db/repositories/PageRepository.js';
14
+ import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
15
+ import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
16
+ import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
17
+ import { analyzeLinks } from '../analysis/links.js';
18
// Fallback context for backward compatibility or when no context is provided.
// Only critical events are surfaced on the console, so consumers that do not
// wire up the event system still retain some visibility.
const nullContext = {
    emit(event) {
        switch (event.type) {
            case 'error':
                console.error(event.message, event.error || '');
                break;
            case 'warn':
                console.warn(event.message);
                break;
            default:
                // All other event types are intentionally dropped.
                break;
        }
    }
};
31
/**
 * Depth- and concurrency-bounded site crawler.
 *
 * Lifecycle (driven by run()):
 *   1. initialize()   - open DB repositories, normalize the start URL and
 *                       create the site/snapshot rows for this crawl.
 *   2. setupModules() - construct scope / fetch / parse / sitemap / trap helpers.
 *   3. fetchRobots()  - best-effort robots.txt load.
 *   4. seedQueue()    - seed from an optional sitemap plus the start URL.
 *   5. worker loop    - drain the queue under the concurrency cap, buffering
 *                       pages / edges / metrics and flushing them in batches.
 *
 * Progress is reported through the injected EngineContext; when none is
 * provided a console-only fallback context is used.
 */
export class Crawler {
    startUrl;
    options;
    context;
    visited;          // URLs already dequeued for processing
    uniqueQueue;      // every URL ever enqueued (dedup guard for `queue`)
    queue;            // pending { url, depth } items
    active;           // number of in-flight page fetches
    pagesCrawled;
    reachedLimit;
    maxDepthInCrawl;  // options.depth, hard-capped at 10
    concurrency;      // options.concurrency, hard-capped at 10
    limitConcurrency; // p-limit gate shared by all workers
    // Repositories (created in initialize())
    siteRepo = null;
    snapshotRepo = null;
    pageRepo = null;
    edgeRepo = null;
    metricsRepo = null;
    // Site/Snapshot info
    siteId = null;
    snapshotId = null;
    rootOrigin = '';
    // Minimum depth at which each URL was first discovered
    discoveryDepths = new Map();
    // Buffers for batch DB writes (flushed at size thresholds and at shutdown)
    pageBuffer = new Map();
    edgeBuffer = [];
    metricsBuffer = [];
    // Modules (created in setupModules())
    scopeManager = null;
    fetcher = null;
    parser = null;
    sitemapFetcher = null;
    trapDetector = null;
    robots = null;
    /**
     * @param startUrl Seed URL; normalized during initialize().
     * @param options  Crawl options (depth, limit, concurrency, sitemap, ...).
     * @param context  Optional EngineContext; falls back to a console-only emitter.
     */
    constructor(startUrl, options, context) {
        this.startUrl = startUrl;
        this.options = options;
        this.context = context || nullContext;
        this.visited = new Set();
        this.uniqueQueue = new Set();
        this.queue = [];
        this.active = 0;
        this.pagesCrawled = 0;
        this.reachedLimit = false;
        this.maxDepthInCrawl = Math.min(options.depth, 10);
        this.concurrency = Math.min(options.concurrency || 2, 10);
        this.limitConcurrency = pLimit(this.concurrency);
    }
    /**
     * Opens the database, resolves the site row for the start URL's domain and
     * creates a new snapshot for this crawl. Throws if the start URL cannot be
     * normalized.
     */
    async initialize() {
        const db = getDb();
        this.siteRepo = new SiteRepository(db);
        this.snapshotRepo = new SnapshotRepository(db);
        this.pageRepo = new PageRepository(db);
        this.edgeRepo = new EdgeRepository(db);
        this.metricsRepo = new MetricsRepository(db);
        const rootUrl = normalizeUrl(this.startUrl, '', { stripQuery: this.options.stripQuery });
        if (!rootUrl)
            throw new Error('Invalid start URL');
        const urlObj = new URL(rootUrl);
        // FIX: anchor the www strip. String.replace('www.', '') removes the
        // first occurrence anywhere in the hostname, mangling hosts such as
        // "notwww.example.com"; only a leading "www." should be dropped.
        const domain = urlObj.hostname.replace(/^www\./, '');
        const site = this.siteRepo.firstOrCreateSite(domain);
        this.siteId = site.id;
        const type = this.options.snapshotType || (this.options.previousGraph ? 'incremental' : 'full');
        this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, type);
        this.rootOrigin = urlObj.origin;
        this.startUrl = rootUrl;
        // Seed discovery depth for root
        this.discoveryDepths.set(this.startUrl, 0);
    }
    /** Constructs the scope, fetch, parse, sitemap and trap-detection helpers. */
    setupModules() {
        this.scopeManager = new ScopeManager({
            allowedDomains: this.options.allowedDomains || [],
            deniedDomains: this.options.deniedDomains || [],
            includeSubdomains: this.options.includeSubdomains || false,
            rootUrl: this.startUrl
        });
        this.fetcher = new Fetcher({
            rate: this.options.rate,
            proxyUrl: this.options.proxyUrl,
            scopeManager: this.scopeManager,
            maxRedirects: this.options.maxRedirects,
            userAgent: this.options.userAgent
        });
        this.parser = new Parser();
        this.sitemapFetcher = new Sitemap(this.context);
        this.trapDetector = new TrapDetector();
    }
    /**
     * Best-effort robots.txt fetch. Failure is non-fatal: the crawl proceeds
     * without robots rules.
     */
    async fetchRobots() {
        try {
            const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
            const res = await this.fetcher.fetch(robotsUrl, { maxBytes: 500000 });
            if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
                this.robots = robotsParser(robotsUrl, res.body);
            }
        }
        catch {
            // FIX: route through the event system instead of an unconditional
            // console.warn (the old code contradicted its own "suppressed"
            // comment and bypassed the consumer's listeners). nullContext
            // still prints warnings, so default visibility is preserved.
            this.context.emit({ type: 'warn', message: 'Failed to fetch robots.txt, proceeding...' });
        }
    }
    /**
     * Returns true when `url` at `depth` should be enqueued: not yet visited or
     * queued, within the depth cap, in scope, and (when trap detection is on)
     * below the trap-risk threshold.
     */
    shouldEnqueue(url, depth) {
        if (this.visited.has(url))
            return false;
        if (this.uniqueQueue.has(url))
            return false;
        if (depth > this.maxDepthInCrawl)
            return false;
        if (this.scopeManager.isUrlEligible(url) !== 'allowed')
            return false;
        if (this.options.detectTraps) {
            const trap = this.trapDetector.checkTrap(url, depth);
            if (trap.risk > 0.8)
                return false;
        }
        return true;
    }
    /**
     * Enqueues `u` at depth `d` if in scope and not already queued, emitting a
     * queue:enqueue event and recording the best (minimum) discovery depth.
     */
    addToQueue(u, d) {
        if (this.scopeManager.isUrlEligible(u) !== 'allowed')
            return;
        if (!this.uniqueQueue.has(u)) {
            this.uniqueQueue.add(u);
            this.queue.push({ url: u, depth: d });
            this.context.emit({ type: 'queue:enqueue', url: u, depth: d });
            const currentDiscovery = this.discoveryDepths.get(u);
            if (currentDiscovery === undefined || d < currentDiscovery) {
                this.discoveryDepths.set(u, d);
            }
        }
    }
    /**
     * Seeds the queue from the sitemap (when options.sitemap is set; the
     * literal string 'true' means "use /sitemap.xml") and from the start URL.
     * Sitemap failures are reported as warnings, not fatal errors.
     */
    async seedQueue() {
        // Seed from Sitemap
        if (this.options.sitemap) {
            try {
                const sitemapUrl = this.options.sitemap === 'true' ? new URL('/sitemap.xml', this.rootOrigin).toString() : this.options.sitemap;
                if (sitemapUrl.startsWith('http')) {
                    this.context.emit({ type: 'info', message: 'Fetching sitemap', context: { url: sitemapUrl } });
                    const sitemapUrls = await this.sitemapFetcher.fetch(sitemapUrl);
                    for (const u of sitemapUrls) {
                        const normalized = normalizeUrl(u, '', this.options);
                        if (normalized)
                            this.addToQueue(normalized, 0);
                    }
                }
            }
            catch (e) {
                this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: e });
            }
        }
        // Seed from startUrl
        this.addToQueue(this.startUrl, 0);
    }
    /**
     * Buffers a page row for batch upsert, merging with any existing buffered
     * record. Depth is always the minimum seen for the URL; an existing record
     * is only overwritten when the new one adds a status, a better depth, or
     * extra column data. Auto-flushes at 50 buffered pages.
     */
    bufferPage(url, depth, status, data = {}) {
        const existing = this.pageBuffer.get(url);
        const knownDiscovery = this.discoveryDepths.get(url);
        // Always use the best (minimum) depth discovered for this URL
        const finalDepth = knownDiscovery !== undefined ? Math.min(knownDiscovery, depth) : depth;
        if (knownDiscovery === undefined || depth < knownDiscovery) {
            this.discoveryDepths.set(url, depth);
        }
        // If we already have a buffered record, only update if the new one is
        // more "complete" (has status) or if the depth is better.
        if (existing) {
            const isStatusUpdate = status !== 0 && existing.http_status === 0;
            const isBetterDepth = finalDepth < existing.depth;
            if (!isStatusUpdate && !isBetterDepth && Object.keys(data).length === 0) {
                return;
            }
            this.pageBuffer.set(url, {
                ...existing,
                depth: finalDepth,
                http_status: status !== 0 ? status : existing.http_status,
                ...data
            });
        }
        else {
            this.pageBuffer.set(url, {
                site_id: this.siteId,
                normalized_url: url,
                depth: finalDepth,
                http_status: status,
                last_seen_snapshot_id: this.snapshotId,
                ...data
            });
        }
        if (this.pageBuffer.size >= 50) {
            this.flushPages();
        }
    }
    /** Upserts all buffered pages and clears the buffer. */
    flushPages() {
        if (this.pageBuffer.size === 0)
            return;
        this.pageRepo.upsertMany(Array.from(this.pageBuffer.values()));
        this.pageBuffer.clear();
    }
    /** Buffers a link edge (by URL); auto-flushes at 100 buffered edges. */
    bufferEdge(sourceUrl, targetUrl, weight = 1.0, rel = 'internal') {
        this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
        if (this.edgeBuffer.length >= 100) {
            this.flushEdges();
        }
    }
    /**
     * Resolves buffered edge URLs to page IDs and inserts them. Pages are
     * flushed first so both endpoints can be resolved; edges whose endpoints
     * are still unknown (e.g. out-of-scope targets) are dropped.
     */
    flushEdges() {
        if (this.edgeBuffer.length === 0)
            return;
        // To resolve URLs to IDs, we need to make sure pages are flushed first
        this.flushPages();
        const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
        const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
        const edgesToInsert = this.edgeBuffer
            .map(e => ({
                snapshot_id: this.snapshotId,
                source_page_id: urlToId.get(e.sourceUrl),
                target_page_id: urlToId.get(e.targetUrl),
                weight: e.weight,
                rel: e.rel
            }))
            .filter(e => e.source_page_id !== undefined && e.target_page_id !== undefined);
        if (edgesToInsert.length > 0) {
            this.edgeRepo.insertEdges(edgesToInsert);
        }
        this.edgeBuffer = [];
    }
    /** Buffers per-page metrics (by URL); auto-flushes at 50 buffered entries. */
    bufferMetrics(url, data) {
        this.metricsBuffer.push({ url, data });
        if (this.metricsBuffer.length >= 50) {
            this.flushMetrics();
        }
    }
    /**
     * Resolves buffered metrics to page IDs and inserts them with null-filled
     * defaults for every metrics column, overlaid by the buffered data.
     * Entries whose page cannot be resolved are dropped.
     */
    flushMetrics() {
        if (this.metricsBuffer.length === 0)
            return;
        this.flushPages();
        const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
        const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
        const metricsList = this.metricsBuffer.map(item => {
            const pageId = urlToId.get(item.url);
            if (!pageId)
                return null;
            return {
                snapshot_id: this.snapshotId,
                page_id: pageId,
                authority_score: null,
                hub_score: null,
                pagerank: null,
                pagerank_score: null,
                link_role: null,
                crawl_status: null,
                word_count: null,
                thin_content_score: null,
                external_link_ratio: null,
                orphan_score: null,
                duplicate_cluster_id: null,
                duplicate_type: null,
                is_cluster_primary: 0,
                ...item.data
            };
        }).filter(m => m !== null);
        if (metricsList.length > 0) {
            this.metricsRepo.insertMany(metricsList);
        }
        this.metricsBuffer = [];
    }
    /** Flushes all page, edge and metrics buffers (in dependency order). */
    async flushAll() {
        this.flushPages();
        this.flushEdges();
        this.flushMetrics();
    }
    /**
     * Fetches a single page, emitting crawl:start / crawl:success /
     * crawl:error events. Conditional-request headers (ETag / Last-Modified)
     * come from the previous crawl's node when available.
     *
     * @returns the fetcher result, or null when the fetch threw.
     */
    async fetchPage(url, depth, prevNode) {
        const startTime = Date.now();
        try {
            this.context.emit({ type: 'crawl:start', url });
            const res = await this.fetcher.fetch(url, {
                maxBytes: this.options.maxBytes,
                crawlDelay: this.robots ? this.robots.getCrawlDelay('crawlith') : undefined,
                etag: prevNode?.etag,
                lastModified: prevNode?.lastModified
            });
            const durationMs = Date.now() - startTime;
            this.context.emit({
                type: 'crawl:success',
                url,
                status: typeof res.status === 'number' ? res.status : 0,
                durationMs,
                depth
            });
            return res;
        }
        catch (e) {
            this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
            return null;
        }
    }
    /**
     * Handles a 304 Not Modified: replays page content from the previous
     * crawl's node, marks metrics as 'cached', and re-discovers the page's
     * outgoing links from the previous graph so crawling can continue.
     */
    handleCachedResponse(url, finalUrl, depth, prevNode) {
        this.bufferPage(finalUrl, depth, 200, {
            html: prevNode.html,
            canonical_url: prevNode.canonical,
            content_hash: prevNode.contentHash,
            simhash: prevNode.simhash,
            etag: prevNode.etag,
            last_modified: prevNode.lastModified,
            noindex: prevNode.noindex ? 1 : 0,
            nofollow: prevNode.nofollow ? 1 : 0
        });
        this.bufferMetrics(finalUrl, {
            crawl_status: 'cached'
        });
        // Re-discovery links from previous graph to continue crawling if needed
        const prevLinks = this.options.previousGraph?.getEdges()
            .filter(e => e.source === url)
            .map(e => e.target);
        if (prevLinks) {
            for (const link of prevLinks) {
                const normalizedLink = normalizeUrl(link, '', this.options);
                if (normalizedLink && normalizedLink !== finalUrl) {
                    this.bufferPage(normalizedLink, depth + 1, 0);
                    this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
                    if (this.shouldEnqueue(normalizedLink, depth + 1)) {
                        this.addToQueue(normalizedLink, depth + 1);
                    }
                }
            }
        }
    }
    /** Records every hop of a redirect chain as page rows plus an edge. */
    handleRedirects(chain, depth) {
        for (const step of chain) {
            const source = normalizeUrl(step.url, '', this.options);
            const target = normalizeUrl(step.target, '', this.options);
            if (source && target) {
                this.bufferPage(source, depth, step.status);
                this.bufferPage(target, depth, 0);
                this.bufferEdge(source, target);
            }
        }
    }
    /**
     * Handles a 2xx response: non-HTML content is recorded status-only;
     * HTML is parsed, analyzed (content / links / thin-content scores) and its
     * links are buffered and enqueued.
     */
    handleSuccessResponse(res, finalUrl, depth, isBlocked = false) {
        const contentTypeHeader = res.headers['content-type'];
        const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
        if (!contentType || !contentType.toLowerCase().includes('text/html')) {
            this.bufferPage(finalUrl, depth, typeof res.status === 'number' ? res.status : 0);
            return;
        }
        const parseResult = this.parser.parse(res.body, finalUrl, res.status);
        this.bufferPage(finalUrl, depth, res.status, {
            html: parseResult.html,
            canonical_url: parseResult.canonical || undefined,
            noindex: parseResult.noindex ? 1 : 0,
            nofollow: parseResult.nofollow ? 1 : 0,
            content_hash: parseResult.contentHash,
            simhash: parseResult.simhash,
            soft404_score: parseResult.soft404Score,
            etag: res.etag,
            last_modified: res.lastModified,
            retries: res.retries
        });
        try {
            const contentAnalysis = analyzeContent(parseResult.html);
            const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, this.rootOrigin);
            const thinScore = calculateThinContentScore(contentAnalysis, 0);
            this.bufferMetrics(finalUrl, {
                crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
                word_count: contentAnalysis.wordCount,
                thin_content_score: thinScore,
                external_link_ratio: linkAnalysis.externalRatio
            });
        }
        catch (e) {
            this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: finalUrl } });
        }
        for (const linkItem of parseResult.links) {
            const normalizedLink = normalizeUrl(linkItem.url, '', this.options);
            if (normalizedLink && normalizedLink !== finalUrl) {
                this.bufferPage(normalizedLink, depth + 1, 0);
                this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
                if (this.shouldEnqueue(normalizedLink, depth + 1)) {
                    this.addToQueue(normalizedLink, depth + 1);
                }
            }
        }
    }
    /**
     * Processes one queue item end-to-end: scope check, fetch, cached/redirect
     * handling, then error- or success-path recording. All failures are
     * reported via crawl:error and never propagate to the worker loop.
     */
    async processPage(item, isBlocked = false) {
        const { url, depth } = item;
        if (this.scopeManager.isUrlEligible(url) !== 'allowed') {
            // FIX: use the snake_case `security_error` column key; the previous
            // camelCase `securityError` never matched the persisted column and
            // silently dropped the blocked-by-domain-filter reason.
            this.bufferPage(url, depth, 0, { security_error: 'blocked_by_domain_filter' });
            return;
        }
        try {
            const prevNode = this.options.previousGraph?.nodes.get(url);
            const res = await this.fetchPage(url, depth, prevNode);
            if (!res)
                return;
            const finalUrl = normalizeUrl(res.finalUrl, '', this.options);
            if (!finalUrl)
                return;
            if (res.status === 304 && prevNode) {
                this.handleCachedResponse(url, finalUrl, depth, prevNode);
                return;
            }
            this.handleRedirects(res.redirectChain, depth);
            // A string status is a fetcher-level error code (e.g. 'network_error').
            const isStringStatus = typeof res.status === 'string';
            if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
                const statusNum = typeof res.status === 'number' ? res.status : 0;
                this.bufferPage(finalUrl, depth, statusNum, {
                    security_error: isStringStatus ? res.status : undefined,
                    retries: res.retries
                });
                this.bufferMetrics(finalUrl, {
                    crawl_status: isStringStatus ? res.status : 'fetched_error'
                });
                return;
            }
            if (res.status === 200) {
                this.handleSuccessResponse(res, finalUrl, depth, isBlocked);
            }
        }
        catch (e) {
            this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
        }
    }
    /**
     * Runs the full crawl and resolves with the snapshot ID once the queue is
     * drained or the page limit is reached. The snapshot is marked 'completed'
     * (with limit_reached) in both termination paths.
     */
    async run() {
        await this.initialize();
        this.setupModules();
        await this.fetchRobots();
        await this.seedQueue();
        return new Promise((resolve) => {
            // Finalizes and resolves when the queue is drained and nothing is
            // in flight. Promise resolution is idempotent, so racing with the
            // limit path below is harmless.
            const checkDone = async () => {
                if (this.queue.length === 0 && this.active === 0) {
                    await this.flushAll();
                    this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
                        limit_reached: this.reachedLimit ? 1 : 0
                    });
                    resolve(this.snapshotId);
                    return true;
                }
                return false;
            };
            const next = async () => {
                if (await checkDone())
                    return;
                if (this.pagesCrawled >= this.options.limit) {
                    this.reachedLimit = true;
                    if (this.active === 0) {
                        await this.flushAll();
                        this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
                            limit_reached: 1
                        });
                        this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
                        resolve(this.snapshotId);
                    }
                    return;
                }
                while (this.queue.length > 0 && this.active < this.concurrency && this.pagesCrawled < this.options.limit) {
                    const item = this.queue.shift();
                    if (this.visited.has(item.url))
                        continue;
                    // Robust robots check: if path doesn't end in /, check both
                    // /path and /path/ to handle cases where normalization
                    // stripped a slash that robots.txt relies on.
                    const isBlocked = this.robots && (!this.robots.isAllowed(item.url, 'crawlith') ||
                        (!item.url.endsWith('/') && !this.robots.isAllowed(item.url + '/', 'crawlith')));
                    if (isBlocked) {
                        if (this.options.debug) {
                            console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
                        }
                        // Tag as blocked for reporting
                        this.bufferMetrics(item.url, {
                            crawl_status: 'blocked_by_robots'
                        });
                        this.bufferPage(item.url, item.depth, 0);
                        if (!this.options.ignoreRobots) {
                            // Count it against the limit but never fetch it.
                            this.visited.add(item.url);
                            this.pagesCrawled++;
                            continue;
                        }
                    }
                    this.active++;
                    this.pagesCrawled++;
                    this.visited.add(item.url);
                    // Fire-and-forget: processPage never rejects (it catches
                    // internally), and next() re-checks termination.
                    this.limitConcurrency(() => this.processPage(item, isBlocked)).finally(() => {
                        this.active--;
                        void next();
                    });
                }
                await checkDone();
            };
            void next();
        });
    }
}
@@ -1,5 +1,8 @@
1
1
  /**
2
2
  * Extracts all links from an HTML document.
3
3
  * Returns absolute URLs.
4
+ * @param html The HTML content string
5
+ * @param baseUrl The base URL to resolve relative links against
6
+ * @param onError Optional callback for handling extraction errors
4
7
  */
5
- export declare function extractLinks(html: string, baseUrl: string): string[];
8
+ export declare function extractLinks(html: string, baseUrl: string, onError?: (error: unknown) => void): string[];
@@ -2,8 +2,11 @@ import * as cheerio from 'cheerio';
2
2
  /**
3
3
  * Extracts all links from an HTML document.
4
4
  * Returns absolute URLs.
5
+ * @param html The HTML content string
6
+ * @param baseUrl The base URL to resolve relative links against
7
+ * @param onError Optional callback for handling extraction errors
5
8
  */
6
- export function extractLinks(html, baseUrl) {
9
+ export function extractLinks(html, baseUrl, onError) {
7
10
  try {
8
11
  const $ = cheerio.load(html);
9
12
  const links = new Set();
@@ -27,7 +30,9 @@ export function extractLinks(html, baseUrl) {
27
30
  return Array.from(links);
28
31
  }
29
32
  catch (e) {
30
- console.error(`Error extracting links from ${baseUrl}:`, e);
33
+ if (onError) {
34
+ onError(e);
35
+ }
31
36
  return [];
32
37
  }
33
38
  }
@@ -26,6 +26,7 @@ export declare class Fetcher {
26
26
  private userAgent;
27
27
  private rateLimiter;
28
28
  private proxyAdapter;
29
+ private secureDispatcher;
29
30
  private scopeManager?;
30
31
  private maxRedirects;
31
32
  constructor(options?: {
@@ -1,4 +1,5 @@
1
1
  import { request } from 'undici';
2
+ import * as net from 'net';
2
3
  import { IPGuard } from '../core/security/ipGuard.js';
3
4
  import { RateLimiter } from '../core/network/rateLimiter.js';
4
5
  import { RetryPolicy } from '../core/network/retryPolicy.js';
@@ -10,11 +11,18 @@ export class Fetcher {
10
11
  userAgent = 'crawlith/1.0';
11
12
  rateLimiter;
12
13
  proxyAdapter;
14
+ secureDispatcher;
13
15
  scopeManager;
14
16
  maxRedirects;
15
17
  constructor(options = {}) {
16
18
  this.rateLimiter = new RateLimiter(options.rate || 2);
17
19
  this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
20
+ if (this.proxyAdapter.dispatcher) {
21
+ this.secureDispatcher = this.proxyAdapter.dispatcher;
22
+ }
23
+ else {
24
+ this.secureDispatcher = IPGuard.getSecureDispatcher();
25
+ }
18
26
  this.scopeManager = options.scopeManager;
19
27
  this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
20
28
  this.userAgent = options.userAgent || `crawlith/${version}`;
@@ -28,10 +36,14 @@ export class Fetcher {
28
36
  // Use a while(true) and explicit return/continue to handle redirects
29
37
  while (true) {
30
38
  const urlObj = new URL(currentUrl);
31
- // 1. SSRF Guard
32
- const isSafe = await IPGuard.validateHost(urlObj.hostname);
33
- if (!isSafe) {
34
- return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
39
+ // 1. SSRF Guard (IP Literals only)
40
+ // We only check explicit IP literals here to fail fast.
41
+ // For domains, we rely on the secureDispatcher (which uses IPGuard.secureLookup)
42
+ // to resolve and validate the IP at connection time, preventing TOCTOU attacks.
43
+ if (net.isIP(urlObj.hostname)) {
44
+ if (IPGuard.isInternal(urlObj.hostname)) {
45
+ return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
46
+ }
35
47
  }
36
48
  // 2. Scope Validation (Domain & Subdomain)
37
49
  if (this.scopeManager) {
@@ -61,7 +73,7 @@ export class Fetcher {
61
73
  method: 'GET',
62
74
  headers,
63
75
  maxRedirections: 0,
64
- dispatcher: this.proxyAdapter.dispatcher,
76
+ dispatcher: this.secureDispatcher,
65
77
  headersTimeout: 10000,
66
78
  bodyTimeout: 10000
67
79
  });
@@ -141,6 +153,9 @@ export class Fetcher {
141
153
  catch (error) {
142
154
  // Map common network errors to specific statuses if needed
143
155
  const isProxyError = error.message?.toLowerCase().includes('proxy') || error.code === 'ECONNREFUSED';
156
+ if (error.code === 'EBLOCKED' || error.message?.includes('Blocked internal IP')) {
157
+ return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
158
+ }
144
159
  const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';
145
160
  return this.errorResult(totalRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus, currentUrl, redirectChain, totalRetries);
146
161
  }
@@ -1 +1,3 @@
1
- export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitReached?: boolean): void;
1
+ import { EngineContext } from '../events.js';
2
+ import { Graph } from '../graph/graph.js';
3
+ export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, context?: EngineContext, limitReached?: boolean, graphInstance?: Graph): void;