@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,382 +0,0 @@
1
- import { request } from 'undici';
2
- import pLimit from 'p-limit';
3
- import chalk from 'chalk';
4
- import robotsParser from 'robots-parser';
5
- import { Graph } from '../graph/graph.js';
6
- import { Fetcher } from './fetcher.js';
7
- import { Parser } from './parser.js';
8
- import { Sitemap } from './sitemap.js';
9
- import { normalizeUrl } from './normalize.js';
10
- import { TrapDetector } from './trap.js';
11
- import { ScopeManager } from '../core/scope/scopeManager.js';
12
- import { getDb } from '../db/index.js';
13
- import { SiteRepository } from '../db/repositories/SiteRepository.js';
14
- import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
15
- import { PageRepository } from '../db/repositories/PageRepository.js';
16
- import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
17
- import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
18
- import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
19
- import { analyzeLinks } from '../analysis/links.js';
20
-
21
/**
 * Settings accepted by `crawl()`.
 *
 * Only `limit` and `depth` are required; every other field is optional.
 */
export interface CrawlOptions {
  /** Maximum number of pages to schedule for fetching; scheduling stops once this many have been started. */
  limit: number;
  /** Maximum link depth to enqueue. `crawl()` caps the effective value at 10. */
  depth: number;
  /** Number of pages processed in parallel. Falsy values fall back to 2; the effective value is capped at 10. */
  concurrency?: number;
  /** When true, robots.txt is neither fetched nor consulted. */
  ignoreRobots?: boolean;
  /** Forwarded to `normalizeUrl` for every URL handled during the crawl. */
  stripQuery?: boolean;
  /** Only its presence is checked by `crawl()`: when set, the snapshot is created as 'incremental' instead of 'full'. */
  previousGraph?: Graph;
  /**
   * Sitemap seed. The literal string 'true' means `/sitemap.xml` on the root origin;
   * any other value is used as the sitemap URL and is ignored unless it starts with 'http'.
   */
  sitemap?: string;
  /** When true, logs depth, status and URL for every fetched page. */
  debug?: boolean;
  /** Not read by `crawl()` itself. */
  detectSoft404?: boolean;
  /** When true, links whose trap risk exceeds 0.8 are not enqueued. */
  detectTraps?: boolean;
  /** Forwarded to the `Fetcher` constructor as its rate setting. */
  rate?: number;
  /** Forwarded to `Fetcher.fetch` as the per-response byte cap. */
  maxBytes?: number;
  /** Forwarded to `ScopeManager`; defaults to an empty list. */
  allowedDomains?: string[];
  /** Forwarded to `ScopeManager`; defaults to an empty list. */
  deniedDomains?: string[];
  /** Forwarded to `ScopeManager`; defaults to false. */
  includeSubdomains?: boolean;
  /** Forwarded to the `Fetcher` constructor. */
  proxyUrl?: string;
  /** Forwarded to the `Fetcher` constructor. */
  maxRedirects?: number;
  /** Forwarded to the `Fetcher` constructor. The robots.txt request uses its own fixed User-Agent. */
  userAgent?: string;
}
41
-
42
/** One pending entry in the crawl frontier. */
interface QueueItem {
  /** Normalized URL waiting to be fetched. */
  url: string;
  /** Link depth at which the URL was discovered (seeds are enqueued at 0). */
  depth: number;
}
46
-
47
/**
 * Crawls a site starting from `startUrl` and records the result as a new snapshot.
 *
 * The crawl keeps a FIFO frontier seeded from the optional sitemap and the start URL.
 * Each scheduled page is fetched, stored through the page/edge/metrics repositories,
 * and — for 200 `text/html` responses — parsed so that its links can be stored and
 * enqueued. Per-page failures are logged and do not abort the crawl.
 *
 * @param startUrl - URL to start from; it is normalized before use.
 * @param options - Crawl limits, scope and fetcher settings (see `CrawlOptions`).
 * @returns The id of the snapshot created for this crawl, once no work is left in flight.
 * @throws Error `'Invalid start URL'` when `normalizeUrl` rejects `startUrl`.
 */
export async function crawl(startUrl: string, options: CrawlOptions): Promise<number> {
  const visited = new Set<string>();
  const concurrency = Math.min(options.concurrency || 2, 10);
  const limitConcurrency = pLimit(concurrency);
  const trapDetector = new TrapDetector();

  const db = getDb();
  const siteRepo = new SiteRepository(db);
  const snapshotRepo = new SnapshotRepository(db);
  const pageRepo = new PageRepository(db);
  const edgeRepo = new EdgeRepository(db);
  const metricsRepo = new MetricsRepository(db);

  const rootUrl = normalizeUrl(startUrl, '', { stripQuery: options.stripQuery });
  if (!rootUrl) throw new Error('Invalid start URL');

  const urlObj = new URL(rootUrl);
  // Strip only a leading "www." label. A plain string replace would remove the first
  // "www." found anywhere in the host (e.g. "awww.example.com" -> "aexample.com").
  const domain = urlObj.hostname.replace(/^www\./, '');
  const site = siteRepo.firstOrCreateSite(domain);
  const siteId = site.id;

  const snapshotId = snapshotRepo.createSnapshot(siteId, options.previousGraph ? 'incremental' : 'full');
  const rootOrigin = urlObj.origin;

  /**
   * Upserts a page row for this snapshot. Fields missing from `data` keep the value
   * already stored for the page. Returns the page id, or null when the write fails.
   */
  const savePageToDb = (url: string, depth: number, status: number, data: any = {}): number | null => {
    try {
      const existing = pageRepo.getPage(siteId!, url);
      // Keep the depth recorded earlier in this same snapshot instead of overwriting it.
      const isSameSnapshot = existing?.last_seen_snapshot_id === snapshotId;

      return pageRepo.upsertAndGetId({
        site_id: siteId!,
        normalized_url: url,
        depth: isSameSnapshot ? existing.depth : depth,
        http_status: status,
        first_seen_snapshot_id: existing ? existing.first_seen_snapshot_id : snapshotId,
        last_seen_snapshot_id: snapshotId,
        canonical_url: data.canonical !== undefined ? data.canonical : existing?.canonical_url,
        content_hash: data.contentHash !== undefined ? data.contentHash : existing?.content_hash,
        simhash: data.simhash !== undefined ? data.simhash : existing?.simhash,
        etag: data.etag !== undefined ? data.etag : existing?.etag,
        last_modified: data.lastModified !== undefined ? data.lastModified : existing?.last_modified,
        html: data.html !== undefined ? data.html : existing?.html,
        soft404_score: data.soft404Score !== undefined ? data.soft404Score : existing?.soft404_score,
        noindex: data.noindex !== undefined ? (data.noindex ? 1 : 0) : existing?.noindex,
        nofollow: data.nofollow !== undefined ? (data.nofollow ? 1 : 0) : existing?.nofollow,
        security_error: data.securityError !== undefined ? data.securityError : existing?.security_error,
        retries: data.retries !== undefined ? data.retries : existing?.retries
      });
    } catch (e) {
      console.error(`Failed to save page ${url}:`, e);
      return null;
    }
  };

  /** Stores an edge between two already-saved pages; skipped when either page id is unknown. */
  const saveEdgeToDb = (sourceUrl: string, targetUrl: string, weight: number = 1.0, rel: string = 'internal') => {
    try {
      const sourceId = pageRepo.getIdByUrl(siteId!, sourceUrl);
      const targetId = pageRepo.getIdByUrl(siteId!, targetUrl);
      if (sourceId && targetId) {
        edgeRepo.insertEdge(snapshotId, sourceId, targetId, weight, rel);
      }
    } catch (e) {
      console.error(`Failed to save edge ${sourceUrl} -> ${targetUrl}:`, e);
    }
  };

  /** Inserts a metrics row; graph-level scores are left null at crawl time. */
  const recordMetrics = (
    pageId: number,
    crawlStatus: 'cached' | 'fetched',
    measured: { wordCount: number | null; thinScore: number | null; externalRatio: number | null } = {
      wordCount: null,
      thinScore: null,
      externalRatio: null
    }
  ) => {
    metricsRepo.insertMetrics({
      snapshot_id: snapshotId,
      page_id: pageId,
      authority_score: null,
      hub_score: null,
      pagerank: null,
      pagerank_score: null,
      link_role: null,
      crawl_status: crawlStatus,
      word_count: measured.wordCount,
      thin_content_score: measured.thinScore,
      external_link_ratio: measured.externalRatio,
      orphan_score: null,
      duplicate_cluster_id: null,
      duplicate_type: null,
      is_cluster_primary: 0
    });
  };

  // Initialize modules
  const scopeManager = new ScopeManager({
    allowedDomains: options.allowedDomains || [],
    deniedDomains: options.deniedDomains || [],
    includeSubdomains: options.includeSubdomains || false,
    rootUrl: startUrl
  });

  const fetcher = new Fetcher({
    rate: options.rate,
    proxyUrl: options.proxyUrl,
    scopeManager,
    maxRedirects: options.maxRedirects,
    userAgent: options.userAgent
  });

  const parser = new Parser();
  const sitemapFetcher = new Sitemap();

  // robots.txt is best-effort: any failure is logged and the crawl proceeds without rules.
  let robots: any = null;
  if (!options.ignoreRobots) {
    try {
      const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
      const res = await request(robotsUrl, {
        maxRedirections: 3,
        headers: { 'User-Agent': 'crawlith/1.0' },
        headersTimeout: 5000,
        bodyTimeout: 5000
      });
      if (res.statusCode >= 200 && res.statusCode < 300) {
        const txt = await res.body.text();
        robots = (robotsParser as any)(robotsUrl, txt);
      } else {
        // Drain the unused body so the connection can be released.
        await res.body.dump();
      }
    } catch {
      console.warn('Failed to fetch robots.txt, proceeding...');
    }
  }

  // Frontier: FIFO queue plus a set of everything ever enqueued.
  const queue: QueueItem[] = [];
  const uniqueQueue = new Set<string>();

  const addToQueue = (u: string, d: number) => {
    if (scopeManager.isUrlEligible(u) !== 'allowed') return;
    if (!uniqueQueue.has(u)) {
      uniqueQueue.add(u);
      queue.push({ url: u, depth: d });
    }
  };

  // Seed from sitemap
  if (options.sitemap) {
    try {
      const sitemapUrl = options.sitemap === 'true' ? new URL('/sitemap.xml', rootOrigin).toString() : options.sitemap;
      if (sitemapUrl.startsWith('http')) {
        console.log(`Fetching sitemap: ${sitemapUrl}`);
        const sitemapUrls = await sitemapFetcher.fetch(sitemapUrl);
        for (const u of sitemapUrls) {
          const normalized = normalizeUrl(u, '', options);
          if (normalized) addToQueue(normalized, 0);
        }
      }
    } catch (e) {
      console.warn('Sitemap fetch failed', e);
    }
  }

  // Seed from startUrl
  addToQueue(rootUrl, 0);

  let pagesCrawled = 0;
  let active = 0;
  let reachedLimit = false;
  const maxDepthInCrawl = Math.min(options.depth, 10);

  const shouldEnqueue = (url: string, depth: number) => {
    if (visited.has(url)) return false;
    if (uniqueQueue.has(url)) return false;
    if (depth > maxDepthInCrawl) return false;
    if (scopeManager.isUrlEligible(url) !== 'allowed') return false;

    if (options.detectTraps) {
      const trap = trapDetector.checkTrap(url, depth);
      if (trap.risk > 0.8) return false;
    }
    return true;
  };

  return new Promise((resolve) => {
    let finished = false;

    /** Marks the snapshot completed and settles the promise; safe to call more than once. */
    const finish = () => {
      if (finished) return;
      finished = true;
      snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
        limit_reached: reachedLimit ? 1 : 0
      });
      resolve(snapshotId);
    };

    const isIdle = () => queue.length === 0 && active === 0;

    /** Scheduler: starts as many queued pages as limits allow, or finishes the crawl. */
    const next = () => {
      if (finished) return;
      if (isIdle()) {
        finish();
        return;
      }
      if (pagesCrawled >= options.limit) {
        reachedLimit = true;
        // Wait for in-flight pages before completing.
        if (active === 0) finish();
        return;
      }

      while (queue.length > 0 && active < concurrency && pagesCrawled < options.limit) {
        const item = queue.shift()!;
        if (visited.has(item.url)) continue;
        if (robots && !robots.isAllowed(item.url, 'crawlith')) continue;

        active++;
        pagesCrawled++;
        visited.add(item.url);

        limitConcurrency(() => processPage(item)).finally(() => {
          active--;
          next();
        });
      }

      // The loop can empty the queue purely through skipped entries (already visited or
      // disallowed by robots.txt) without starting any task. Nothing would call next()
      // again in that case, so completion has to be re-checked here or the promise
      // would never settle.
      if (isIdle()) finish();
    };

    /** Fetches one page, stores it, and enqueues the links found on it. */
    const processPage = async (item: QueueItem) => {
      const { url, depth } = item;
      if (scopeManager.isUrlEligible(url) !== 'allowed') {
        savePageToDb(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
        return;
      }

      // Read the stored row first: its validators drive the conditional GET below.
      const existingInDb = pageRepo.getPage(siteId!, url);
      savePageToDb(url, depth, 0);

      try {
        const res = await fetcher.fetch(url, {
          etag: existingInDb?.etag || undefined,
          lastModified: existingInDb?.last_modified || undefined,
          maxBytes: options.maxBytes,
          crawlDelay: robots ? robots.getCrawlDelay('crawlith') : undefined
        });

        if (options.debug) {
          console.log(`${chalk.gray(`[D:${depth}]`)} ${res.status} ${chalk.blue(url)}`);
        }

        if (res.status === 304) {
          const savedId = savePageToDb(url, depth, 304);
          // Prefer the pre-existing row's id; fall back to the id just upserted so an
          // unexpected 304 for a page with no stored row does not throw.
          const cachedPageId = existingInDb?.id ?? savedId;
          if (cachedPageId) recordMetrics(cachedPageId, 'cached');
          return;
        }

        // Persist every redirect hop as a page plus an edge to its target.
        for (const step of res.redirectChain) {
          const source = normalizeUrl(step.url, '', options);
          const target = normalizeUrl(step.target, '', options);
          if (source && target) {
            savePageToDb(source, depth, step.status);
            savePageToDb(target, depth, 0);
            saveEdgeToDb(source, target);
          }
        }

        const finalUrl = normalizeUrl(res.finalUrl, '', options);
        if (!finalUrl) return;

        // String statuses are fetcher-level failures; they are stored as security_error.
        const isStringStatus = typeof res.status === 'string';
        if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
          savePageToDb(finalUrl, depth, typeof res.status === 'number' ? res.status : 0, {
            securityError: isStringStatus ? res.status : undefined,
            retries: res.retries
          });
          return;
        }

        if (res.status === 200) {
          const contentTypeHeader = res.headers['content-type'];
          const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
          if (!contentType || !contentType.toLowerCase().includes('text/html')) {
            // Non-HTML resources are recorded but not parsed.
            savePageToDb(finalUrl, depth, res.status);
            return;
          }

          // Record the status before parsing so it is stored even if parsing throws.
          savePageToDb(finalUrl, depth, res.status);
          const parseResult = parser.parse(res.body, finalUrl, res.status);

          const pageId = savePageToDb(finalUrl, depth, res.status, {
            html: parseResult.html,
            canonical: parseResult.canonical || undefined,
            noindex: parseResult.noindex,
            nofollow: parseResult.nofollow,
            contentHash: parseResult.contentHash,
            simhash: parseResult.simhash,
            soft404Score: parseResult.soft404Score,
            etag: res.etag,
            lastModified: res.lastModified,
            retries: res.retries
          });

          if (pageId) {
            try {
              const contentAnalysis = analyzeContent(parseResult.html);
              const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, rootOrigin);
              const thinScore = calculateThinContentScore(contentAnalysis, 0);

              recordMetrics(pageId, 'fetched', {
                wordCount: contentAnalysis.wordCount,
                thinScore,
                externalRatio: linkAnalysis.externalRatio
              });
            } catch (e) {
              console.error(`Error calculating per-page metrics for ${finalUrl}:`, e);
            }
          }

          for (const linkItem of parseResult.links) {
            const normalizedLink = normalizeUrl(linkItem.url, '', options);
            if (normalizedLink && normalizedLink !== finalUrl) {
              savePageToDb(normalizedLink, depth + 1, 0);
              saveEdgeToDb(finalUrl, normalizedLink, 1.0, 'internal');
              if (shouldEnqueue(normalizedLink, depth + 1)) {
                addToQueue(normalizedLink, depth + 1);
              }
            }
          }
        }
      } catch (e) {
        console.error(`Error processing ${url}:`, e);
      }
    };

    next();
  });
}
@@ -1,34 +0,0 @@
1
- import * as cheerio from 'cheerio';
2
-
3
/**
 * Collects the distinct http(s) link targets of every `<a>` element in `html`.
 *
 * Each `href` is resolved against `baseUrl`, its fragment is dropped, and the
 * result is returned as an absolute URL string. Hrefs that cannot be parsed, and
 * links with any other protocol, are skipped. If extraction itself fails the
 * error is logged and an empty array is returned.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  try {
    const $ = cheerio.load(html);
    const collected = new Set<string>();

    for (const anchor of $('a').toArray()) {
      const rawHref = $(anchor).attr('href');
      if (!rawHref) continue;

      let resolved: URL;
      try {
        resolved = new URL(rawHref, baseUrl);
      } catch (_e) {
        // Unparseable href: ignore this anchor.
        continue;
      }

      const isWebLink = resolved.protocol === 'http:' || resolved.protocol === 'https:';
      if (!isWebLink) continue;

      // Fragments do not identify a separate document, so drop them before deduplicating.
      resolved.hash = '';
      collected.add(resolved.toString());
    }

    return [...collected];
  } catch (e) {
    console.error(`Error extracting links from ${baseUrl}:`, e);
    return [];
  }
}
@@ -1,233 +0,0 @@
1
- import { request } from 'undici';
2
- import { IPGuard } from '../core/security/ipGuard.js';
3
- import { RateLimiter } from '../core/network/rateLimiter.js';
4
- import { RetryPolicy } from '../core/network/retryPolicy.js';
5
- import { ResponseLimiter } from '../core/network/responseLimiter.js';
6
- import { RedirectController } from '../core/network/redirectController.js';
7
- import { ProxyAdapter } from '../core/network/proxyAdapter.js';
8
- import { ScopeManager } from '../core/scope/scopeManager.js';
9
- import { version } from '../utils/version.js';
10
-
11
/** One hop of a redirect chain followed by `Fetcher.fetch`. */
export interface RedirectStep {
  /** URL that answered with the redirect. */
  url: string;
  /** HTTP status code of that redirect response. */
  status: number;
  /** Absolute URL the `Location` header resolved to. */
  target: string;
}
16
-
17
/** Non-HTTP outcomes a fetch can end with instead of a numeric status code. */
type FetchFailureStatus =
  | 'blocked_internal_ip'
  | 'blocked_by_domain_filter'
  | 'blocked_subdomain'
  | 'oversized'
  | 'failed_after_retries'
  | 'network_error'
  | 'redirect_limit_exceeded'
  | 'redirect_loop'
  | 'proxy_connection_failed';

/** Outcome of `Fetcher.fetch`, for both successful responses and failures. */
export interface FetchResult {
  /** HTTP status code of the final response, or a failure label when no usable response was obtained. */
  status: number | FetchFailureStatus;
  /** Response headers of the final response; empty for failures built without a response. */
  headers: Record<string, string | string[] | undefined>;
  /** Response body as text; empty for failures and 304 responses. */
  body: string;
  /** Redirect hops followed before the final response, in order. */
  redirectChain: RedirectStep[];
  /** `ETag` response header value, or null. */
  etag: string | null;
  /** `Last-Modified` response header value, or null. */
  lastModified: string | null;
  /** URL of the last request that was attempted. */
  finalUrl: string;
  /** Number of retry attempts made across the whole fetch. */
  retries?: number;
  /** Bytes reported by the response limiter while reading the body. */
  bytesReceived?: number;
}
37
-
38
/** Per-request settings for `Fetcher.fetch`. */
export interface FetchOptions {
  /** Sent as `If-None-Match` on the first request of a redirect chain. */
  etag?: string;
  /** Sent as `If-Modified-Since` on the first request of a redirect chain. */
  lastModified?: string;
  /** Not read by `Fetcher.fetch`; the request rate is configured on the `Fetcher` constructor. */
  rate?: number;
  /** Byte cap for the response body; falsy values fall back to 2000000. */
  maxBytes?: number;
  /** Passed to the rate limiter together with the request's hostname. */
  crawlDelay?: number;
}
45
-
46
/**
 * HTTP fetcher used by the crawler.
 *
 * Every request (including each redirect hop) goes through, in order: the IP guard,
 * the optional scope manager, the rate limiter, and the retry policy. Redirects are
 * followed manually so each hop is re-validated and recorded. Response bodies are
 * read through the response limiter so they cannot exceed the configured byte cap.
 */
export class Fetcher {
  private userAgent = 'crawlith/1.0';
  private rateLimiter: RateLimiter;
  private proxyAdapter: ProxyAdapter;
  private scopeManager?: ScopeManager;
  private maxRedirects: number;

  /**
   * @param options.rate - Rate passed to the rate limiter; falsy values fall back to 2.
   * @param options.proxyUrl - Passed to the proxy adapter that supplies the request dispatcher.
   * @param options.scopeManager - When set, every URL in a redirect chain must be 'allowed' by it.
   * @param options.maxRedirects - Redirect budget; defaults to 2 and is capped at 11.
   * @param options.userAgent - User-Agent header; falsy values fall back to `crawlith/<version>`.
   */
  constructor(options: {
    rate?: number;
    proxyUrl?: string;
    scopeManager?: ScopeManager;
    maxRedirects?: number;
    userAgent?: string;
  } = {}) {
    this.rateLimiter = new RateLimiter(options.rate || 2);
    this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
    this.scopeManager = options.scopeManager;
    this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
    this.userAgent = options.userAgent || `crawlith/${version}`;
  }

  /**
   * Fetches `url`, following redirects manually.
   *
   * @param url - Absolute URL to fetch.
   * @param options - Conditional-GET validators, byte cap and crawl delay.
   * @returns The final response, or a result whose `status` is a failure label when the
   *   request was blocked, exceeded the byte cap, hit a redirect problem, or failed.
   * @throws TypeError from `new URL` when the current URL cannot be parsed; that check
   *   runs outside the error-mapping `try` block.
   */
  async fetch(url: string, options: FetchOptions = {}): Promise<FetchResult> {
    const maxBytes = options.maxBytes || 2000000;
    const redirectChain: RedirectStep[] = [];
    const redirectController = new RedirectController(this.maxRedirects, url);

    let currentUrl = url;
    let totalRetries = 0;

    // Each iteration handles one hop; redirects `continue`, everything else returns.
    while (true) {
      const urlObj = new URL(currentUrl);

      // 1. SSRF guard
      const isSafe = await IPGuard.validateHost(urlObj.hostname);
      if (!isSafe) {
        return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
      }

      // 2. Scope validation (domain & subdomain)
      if (this.scopeManager) {
        const eligibility = this.scopeManager.isUrlEligible(currentUrl);
        if (eligibility !== 'allowed') {
          return this.errorResult(eligibility, currentUrl, redirectChain, totalRetries);
        }
      }

      // 3. Rate limiting
      await this.rateLimiter.waitForToken(urlObj.hostname, options.crawlDelay);

      try {
        // 4. Retry strategy
        const result = await RetryPolicy.execute(
          async (attempt) => {
            if (attempt > 0) totalRetries++;

            const headers: Record<string, string> = {
              'User-Agent': this.userAgent
            };

            // Conditional GET only for the first request in a chain: the validators
            // belong to the originally requested URL, not to redirect targets.
            if (redirectChain.length === 0) {
              if (options.etag) headers['If-None-Match'] = options.etag;
              if (options.lastModified) headers['If-Modified-Since'] = options.lastModified;
            }

            const res = await request(currentUrl, {
              method: 'GET',
              headers,
              maxRedirections: 0,
              dispatcher: this.proxyAdapter.dispatcher,
              headersTimeout: 10000,
              bodyTimeout: 10000
            });

            if (RetryPolicy.isRetryableStatus(res.statusCode)) {
              await res.body.dump();
              // Signal the retry predicate below through the message prefix.
              throw new Error(`Status ${res.statusCode}`);
            }

            return res;
          },
          // Null-safe: a thrown value without a string message must not crash the predicate.
          (error) =>
            RetryPolicy.isNetworkError(error) ||
            (typeof error?.message === 'string' && error.message.startsWith('Status '))
        );

        const status = result.statusCode;
        const resHeaders = result.headers;

        const getHeader = (name: string): string | null => {
          const val = resHeaders[name.toLowerCase()];
          if (Array.isArray(val)) return val[0];
          return (val as string) || null;
        };

        const etag = getHeader('etag');
        const lastModified = getHeader('last-modified');

        // Handle redirects
        if (status >= 300 && status < 400 && status !== 304) {
          const location = getHeader('location');
          if (location) {
            let targetUrl: string | null;
            try {
              targetUrl = new URL(location, currentUrl).toString();
            } catch (_e) {
              targetUrl = null;
            }

            if (targetUrl === null) {
              // Unparseable Location: treat this response as final. The body goes through
              // the same limited reader as any other response so an oversized body is
              // reported as 'oversized' rather than surfacing as a network error.
              const read = await this.readBody(result.body, maxBytes);
              return {
                status: read.oversized ? 'oversized' : status,
                headers: resHeaders,
                body: read.body,
                redirectChain,
                etag: null,
                lastModified: null,
                finalUrl: currentUrl,
                retries: totalRetries,
                bytesReceived: read.bytesReceived
              };
            }

            const redirectError = redirectController.nextHop(targetUrl);
            if (redirectError) {
              await result.body.dump();
              return this.errorResult(redirectError, currentUrl, redirectChain, totalRetries);
            }

            redirectChain.push({ url: currentUrl, status, target: targetUrl });
            await result.body.dump();
            currentUrl = targetUrl;
            continue; // Next iteration handles the redirect target
          }
        }

        // 5. Max response size (streaming)
        if (status === 304) {
          // Nothing to read, but the body stream still has to be consumed or dumped.
          await result.body.dump();
          return {
            status,
            headers: resHeaders,
            body: '',
            redirectChain,
            etag,
            lastModified,
            finalUrl: currentUrl,
            retries: totalRetries,
            bytesReceived: 0
          };
        }

        const read = await this.readBody(result.body, maxBytes);
        if (read.oversized) {
          return {
            status: 'oversized',
            headers: resHeaders,
            body: '',
            redirectChain,
            etag: null,
            lastModified: null,
            finalUrl: currentUrl,
            retries: totalRetries,
            bytesReceived: read.bytesReceived
          };
        }

        return {
          status,
          headers: resHeaders,
          body: read.body,
          redirectChain,
          etag,
          lastModified,
          finalUrl: currentUrl,
          retries: totalRetries,
          bytesReceived: read.bytesReceived
        };
      } catch (error: unknown) {
        // Map failures to a status label; exhausted retries take precedence.
        const details = error as { message?: unknown; code?: unknown } | null | undefined;
        const message = typeof details?.message === 'string' ? details.message.toLowerCase() : '';
        const isProxyError = message.includes('proxy') || details?.code === 'ECONNREFUSED';
        const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';

        return this.errorResult(
          totalRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus,
          currentUrl,
          redirectChain,
          totalRetries
        );
      }
    }
  }

  /**
   * Reads a response body through the response limiter.
   * An 'Oversized response' error is converted into `oversized: true` with an empty
   * body; any other error is rethrown.
   */
  private async readBody(
    stream: Parameters<typeof ResponseLimiter.streamToString>[0],
    maxBytes: number
  ): Promise<{ body: string; bytesReceived: number; oversized: boolean }> {
    let bytesReceived = 0;
    try {
      const body = await ResponseLimiter.streamToString(stream, maxBytes, (bytes) => {
        bytesReceived = bytes;
      });
      return { body, bytesReceived, oversized: false };
    } catch (e: unknown) {
      if ((e as { message?: unknown } | null | undefined)?.message === 'Oversized response') {
        return { body: '', bytesReceived, oversized: true };
      }
      throw e;
    }
  }

  /**
   * Builds a failure result with empty headers and body.
   * `status` stays loosely typed because it also receives the values returned by the
   * scope manager and the redirect controller, whose types are declared elsewhere.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private errorResult(status: any, finalUrl: string, redirectChain: RedirectStep[], retries: number): FetchResult {
    return {
      status,
      headers: {},
      body: '',
      redirectChain,
      etag: null,
      lastModified: null,
      finalUrl,
      retries
    };
  }
}