@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
package/src/crawler/crawler.ts (removed in 0.1.2)
@@ -1,601 +0,0 @@
1
- import chalk from 'chalk';
2
- import pLimit from 'p-limit';
3
- import robotsParser from 'robots-parser';
4
- import { Graph, GraphNode } from '../graph/graph.js';
5
- import { Fetcher, FetchResult } from './fetcher.js';
6
- import { Parser } from './parser.js';
7
- import { Sitemap } from './sitemap.js';
8
- import { normalizeUrl } from './normalize.js';
9
- import { TrapDetector } from './trap.js';
10
- import { ScopeManager } from '../core/scope/scopeManager.js';
11
- import { getDb } from '../db/index.js';
12
- import { SiteRepository } from '../db/repositories/SiteRepository.js';
13
- import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
14
- import { PageRepository } from '../db/repositories/PageRepository.js';
15
- import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
16
- import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
17
- import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
18
- import { analyzeLinks } from '../analysis/links.js';
19
- import { EngineContext } from '../events.js';
20
-
21
/**
 * Options accepted by {@link Crawler}.
 *
 * Only `limit` and `depth` are required; every other field is optional and
 * is either consumed by `Crawler` directly or forwarded to one of its
 * collaborators (`ScopeManager`, `Fetcher`, `normalizeUrl`).
 */
export interface CrawlOptions {
  /** Maximum number of URLs to process; the crawl stops dequeuing once this many have been counted. */
  limit: number;
  /** Maximum link depth to follow. `Crawler` caps the effective value at 10. */
  depth: number;
  /** Number of pages processed in parallel. `Crawler` defaults to 2 and caps the value at 10. */
  concurrency?: number;
  /** When true, URLs disallowed by robots.txt are still fetched (they are still tagged `blocked_by_robots`). */
  ignoreRobots?: boolean;
  /** Forwarded to `normalizeUrl`; presumably drops query strings during normalization — confirm in normalize.ts. */
  stripQuery?: boolean;
  /** Graph from an earlier crawl; supplies etag / last-modified for conditional requests and cached node data on 304. */
  previousGraph?: Graph;
  /** `'true'` to use `/sitemap.xml` on the root origin, or an explicit `http(s)` sitemap URL used to seed the queue. */
  sitemap?: string;
  /** When true, robots-blocked URLs are logged to the console. */
  debug?: boolean;
  /** NOTE(review): not read by `Crawler` itself in this file — presumably consumed by the parser/analysis layer. */
  detectSoft404?: boolean;
  /** When true, URLs whose trap risk exceeds 0.8 are not enqueued. */
  detectTraps?: boolean;
  /** Forwarded to `Fetcher` as its `rate` option (presumably a request-rate limit — confirm in fetcher.ts). */
  rate?: number;
  /** Per-page response size limit forwarded to `Fetcher.fetch`. */
  maxBytes?: number;
  /** Forwarded to `ScopeManager`; defaults to an empty list. */
  allowedDomains?: string[];
  /** Forwarded to `ScopeManager`; defaults to an empty list. */
  deniedDomains?: string[];
  /** Forwarded to `ScopeManager`; defaults to `false`. */
  includeSubdomains?: boolean;
  /** Forwarded to `Fetcher`. */
  proxyUrl?: string;
  /** Forwarded to `Fetcher`. */
  maxRedirects?: number;
  /** Forwarded to `Fetcher`. */
  userAgent?: string;
  /** Snapshot kind to record. Defaults to `'incremental'` when `previousGraph` is given, otherwise `'full'`. */
  snapshotType?: 'full' | 'partial' | 'incremental';
}
42
-
43
/** A URL waiting to be crawled, together with the link depth at which it was enqueued. */
interface QueueItem {
  /** Normalized URL to fetch. */
  url: string;
  /** Depth assigned when the URL was queued (0 for the start URL and sitemap seeds). */
  depth: number;
}
47
-
48
/**
 * Fallback context used when the caller supplies no `EngineContext`.
 *
 * It keeps minimal visibility for consumers that do not use the event
 * system: `error` events go to `console.error`, `warn` events go to
 * `console.warn`, and every other event type is dropped silently.
 */
const nullContext: EngineContext = {
  emit: (event) => {
    switch (event.type) {
      case 'error':
        // Print an empty string rather than `undefined` when no error object is attached.
        console.error(event.message, event.error || '');
        break;
      case 'warn':
        console.warn(event.message);
        break;
      default:
        // Non-critical events are intentionally ignored.
        break;
    }
  }
};
60
-
61
/**
 * Crawls a site starting from a single URL and records what it finds in the
 * snapshot database.
 *
 * URLs are taken from a FIFO queue, fetched with bounded concurrency, and the
 * resulting page rows, link edges and per-page metrics are buffered in memory
 * and written to the repositories in batches. `run()` drives the whole
 * lifecycle and resolves with the id of the snapshot that was created.
 */
export class Crawler {
  private startUrl: string;
  private options: CrawlOptions;
  private context: EngineContext;
  /** URLs that have already been dequeued (fetched or skipped). */
  private visited: Set<string>;
  /** Every URL that has ever been enqueued; prevents duplicate queue entries. */
  private uniqueQueue: Set<string>;
  private queue: QueueItem[];
  /** Number of `processPage` calls currently in flight. */
  private active: number;
  /** Count of dequeued URLs, including robots-blocked ones that were skipped. */
  private pagesCrawled: number;
  private reachedLimit: boolean;
  private maxDepthInCrawl: number;
  private concurrency: number;
  private limitConcurrency: ReturnType<typeof pLimit>;

  // Repositories (created in initialize())
  private siteRepo: SiteRepository | null = null;
  private snapshotRepo: SnapshotRepository | null = null;
  private pageRepo: PageRepository | null = null;
  private edgeRepo: EdgeRepository | null = null;
  private metricsRepo: MetricsRepository | null = null;

  // Site/Snapshot info (set in initialize())
  private siteId: number | null = null;
  private snapshotId: number | null = null;
  private rootOrigin: string = '';

  /** Smallest depth at which each URL has been seen so far. */
  private discoveryDepths: Map<string, number> = new Map();

  // Buffers for batch operations
  private pageBuffer: Map<string, any> = new Map();
  private edgeBuffer: { sourceUrl: string; targetUrl: string; weight: number; rel: string }[] = [];
  private metricsBuffer: any[] = [];

  // Modules (created in setupModules())
  private scopeManager: ScopeManager | null = null;
  private fetcher: Fetcher | null = null;
  private parser: Parser | null = null;
  private sitemapFetcher: Sitemap | null = null;
  private trapDetector: TrapDetector | null = null;
  /** Parsed robots.txt, or null when it could not be fetched. */
  private robots: any = null;

  /**
   * @param startUrl URL the crawl starts from; normalized later in `initialize()`.
   * @param options Crawl settings. `depth` is capped at 10; `concurrency` defaults to 2 and is capped at 10.
   * @param context Event sink. When omitted, a console-backed fallback is used.
   */
  constructor(startUrl: string, options: CrawlOptions, context?: EngineContext) {
    this.startUrl = startUrl;
    this.options = options;
    this.context = context ?? nullContext;
    this.visited = new Set<string>();
    this.uniqueQueue = new Set<string>();
    this.queue = [];
    this.active = 0;
    this.pagesCrawled = 0;
    this.reachedLimit = false;
    this.maxDepthInCrawl = Math.min(options.depth, 10);
    this.concurrency = Math.min(options.concurrency || 2, 10);
    this.limitConcurrency = pLimit(this.concurrency);
  }

  /**
   * Creates the repositories, resolves (or creates) the site row and opens a
   * new snapshot for this crawl.
   *
   * @throws Error when the start URL cannot be normalized.
   */
  async initialize(): Promise<void> {
    const db = getDb();
    this.siteRepo = new SiteRepository(db);
    this.snapshotRepo = new SnapshotRepository(db);
    this.pageRepo = new PageRepository(db);
    this.edgeRepo = new EdgeRepository(db);
    this.metricsRepo = new MetricsRepository(db);

    const rootUrl = normalizeUrl(this.startUrl, '', { stripQuery: this.options.stripQuery });
    if (!rootUrl) throw new Error('Invalid start URL');

    const urlObj = new URL(rootUrl);
    // Strip only a leading "www." label. A plain string pattern would remove the
    // first "www." anywhere in the host (e.g. "awww.example.com" -> "aexample.com").
    const domain = urlObj.hostname.replace(/^www\./, '');
    const site = this.siteRepo.firstOrCreateSite(domain);
    this.siteId = site.id;

    const type = this.options.snapshotType || (this.options.previousGraph ? 'incremental' : 'full');
    this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, type);
    this.rootOrigin = urlObj.origin;
    this.startUrl = rootUrl;

    // Seed discovery depth for root
    this.discoveryDepths.set(this.startUrl, 0);
  }

  /** Instantiates the scope manager, fetcher, parser, sitemap fetcher and trap detector. */
  setupModules(): void {
    this.scopeManager = new ScopeManager({
      allowedDomains: this.options.allowedDomains || [],
      deniedDomains: this.options.deniedDomains || [],
      includeSubdomains: this.options.includeSubdomains || false,
      rootUrl: this.startUrl
    });

    this.fetcher = new Fetcher({
      rate: this.options.rate,
      proxyUrl: this.options.proxyUrl,
      scopeManager: this.scopeManager,
      maxRedirects: this.options.maxRedirects,
      userAgent: this.options.userAgent
    });

    this.parser = new Parser();
    this.sitemapFetcher = new Sitemap(this.context);
    this.trapDetector = new TrapDetector();
  }

  /**
   * Fetches `/robots.txt` from the root origin and parses it when the
   * response status is 2xx. Failures are non-fatal: a `warn` event is emitted
   * and the crawl proceeds without robots rules.
   */
  async fetchRobots(): Promise<void> {
    try {
      const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
      const res = await this.fetcher!.fetch(robotsUrl, { maxBytes: 500000 });
      if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
        this.robots = (robotsParser as any)(robotsUrl, res.body);
      }
    } catch (e) {
      // Report through the event context (same shape as the sitemap warning) so
      // listeners see it; the fallback context still prints it via console.warn.
      this.context.emit({ type: 'warn', message: 'Failed to fetch robots.txt, proceeding...', context: e });
    }
  }

  /**
   * Decides whether a discovered URL may be added to the queue.
   *
   * @returns `false` when the URL was already visited or queued, is deeper than
   * the depth cap, is out of scope, or (with `detectTraps`) has a trap risk above 0.8.
   */
  shouldEnqueue(url: string, depth: number): boolean {
    if (this.visited.has(url)) return false;
    if (this.uniqueQueue.has(url)) return false;
    if (depth > this.maxDepthInCrawl) return false;
    if (this.scopeManager!.isUrlEligible(url) !== 'allowed') return false;

    if (this.options.detectTraps) {
      const trap = this.trapDetector!.checkTrap(url, depth);
      if (trap.risk > 0.8) return false;
    }
    return true;
  }

  /**
   * Enqueues an in-scope URL once, emits `queue:enqueue`, and records the
   * depth if it is the shallowest seen for that URL.
   */
  addToQueue(u: string, d: number): void {
    if (this.scopeManager!.isUrlEligible(u) !== 'allowed') return;
    if (this.uniqueQueue.has(u)) return;

    this.uniqueQueue.add(u);
    this.queue.push({ url: u, depth: d });
    this.context.emit({ type: 'queue:enqueue', url: u, depth: d });

    const currentDiscovery = this.discoveryDepths.get(u);
    if (currentDiscovery === undefined || d < currentDiscovery) {
      this.discoveryDepths.set(u, d);
    }
  }

  /**
   * Fills the queue with sitemap URLs (when the `sitemap` option is set) and
   * then the start URL, all at depth 0. Sitemap failures only emit a warning.
   */
  async seedQueue(): Promise<void> {
    // Seed from Sitemap
    if (this.options.sitemap) {
      try {
        const sitemapUrl = this.options.sitemap === 'true'
          ? new URL('/sitemap.xml', this.rootOrigin).toString()
          : this.options.sitemap;
        if (sitemapUrl.startsWith('http')) {
          this.context.emit({ type: 'info', message: 'Fetching sitemap', context: { url: sitemapUrl } });
          const sitemapUrls = await this.sitemapFetcher!.fetch(sitemapUrl);
          for (const u of sitemapUrls) {
            const normalized = normalizeUrl(u, '', this.options);
            if (normalized) this.addToQueue(normalized, 0);
          }
        }
      } catch (e) {
        this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: e });
      }
    }

    // Seed from startUrl
    this.addToQueue(this.startUrl, 0);
  }

  /**
   * Adds or merges a page row in the page buffer and flushes the buffer once
   * it holds 50 entries.
   *
   * @param status HTTP status, or 0 when the page has only been discovered, not fetched.
   * @param data Extra column values merged over the row.
   */
  private bufferPage(url: string, depth: number, status: number, data: any = {}): void {
    const existing = this.pageBuffer.get(url);
    const knownDiscovery = this.discoveryDepths.get(url);

    // Always use the best (minimum) depth discovered for this URL
    const finalDepth = knownDiscovery !== undefined ? Math.min(knownDiscovery, depth) : depth;
    if (knownDiscovery === undefined || depth < knownDiscovery) {
      this.discoveryDepths.set(url, depth);
    }

    if (existing) {
      // Only touch an already-buffered row when the new call adds something:
      // a real status replacing the placeholder 0, a shallower depth, or extra data.
      const isStatusUpdate = status !== 0 && existing.http_status === 0;
      const isBetterDepth = finalDepth < existing.depth;
      const hasData = Object.keys(data).length > 0;
      if (!isStatusUpdate && !isBetterDepth && !hasData) {
        return;
      }

      this.pageBuffer.set(url, {
        ...existing,
        depth: finalDepth,
        http_status: status !== 0 ? status : existing.http_status,
        ...data
      });
    } else {
      this.pageBuffer.set(url, {
        site_id: this.siteId!,
        normalized_url: url,
        depth: finalDepth,
        http_status: status,
        last_seen_snapshot_id: this.snapshotId!,
        ...data
      });
    }

    if (this.pageBuffer.size >= 50) {
      this.flushPages();
    }
  }

  /** Upserts all buffered page rows and empties the buffer. */
  private flushPages(): void {
    if (this.pageBuffer.size === 0) return;
    this.pageRepo!.upsertMany(Array.from(this.pageBuffer.values()));
    this.pageBuffer.clear();
  }

  /** Queues a link edge and flushes the edge buffer once it holds 100 entries. */
  private bufferEdge(sourceUrl: string, targetUrl: string, weight: number = 1.0, rel: string = 'internal'): void {
    this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
    if (this.edgeBuffer.length >= 100) {
      this.flushEdges();
    }
  }

  /**
   * Resolves buffered edges from URLs to page ids and inserts them. Edges
   * whose source or target has no page row in this snapshot are dropped.
   */
  private flushEdges(): void {
    if (this.edgeBuffer.length === 0) return;

    // To resolve URLs to IDs, we need to make sure pages are flushed first
    this.flushPages();

    const identities = this.pageRepo!.getPagesIdentityBySnapshot(this.snapshotId!);
    const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));

    const edgesToInsert = this.edgeBuffer
      .map(e => ({
        snapshot_id: this.snapshotId!,
        source_page_id: urlToId.get(e.sourceUrl)!,
        target_page_id: urlToId.get(e.targetUrl)!,
        weight: e.weight,
        rel: e.rel as any
      }))
      .filter(e => e.source_page_id !== undefined && e.target_page_id !== undefined);

    if (edgesToInsert.length > 0) {
      this.edgeRepo!.insertEdges(edgesToInsert);
    }
    this.edgeBuffer = [];
  }

  /** Queues a metrics record for `url` and flushes once 50 are buffered. */
  private bufferMetrics(url: string, data: any): void {
    this.metricsBuffer.push({ url, data });
    if (this.metricsBuffer.length >= 50) {
      this.flushMetrics();
    }
  }

  /**
   * Writes buffered metrics. Each record starts from an all-null template
   * (with `is_cluster_primary: 0`) and is overlaid with the buffered data.
   * Records whose URL has no page row in this snapshot are dropped.
   */
  private flushMetrics(): void {
    if (this.metricsBuffer.length === 0) return;

    // Page rows must exist before their ids can be looked up.
    this.flushPages();
    const identities = this.pageRepo!.getPagesIdentityBySnapshot(this.snapshotId!);
    const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));

    const metricsList = this.metricsBuffer.map(item => {
      const pageId = urlToId.get(item.url);
      if (!pageId) return null;
      return {
        snapshot_id: this.snapshotId!,
        page_id: pageId,
        authority_score: null,
        hub_score: null,
        pagerank: null,
        pagerank_score: null,
        link_role: null,
        crawl_status: null,
        word_count: null,
        thin_content_score: null,
        external_link_ratio: null,
        orphan_score: null,
        duplicate_cluster_id: null,
        duplicate_type: null,
        is_cluster_primary: 0,
        ...item.data
      };
    }).filter(m => m !== null);

    if (metricsList.length > 0) {
      this.metricsRepo!.insertMany(metricsList as any[]);
    }
    this.metricsBuffer = [];
  }

  /** Flushes pages, then edges, then metrics. */
  async flushAll(): Promise<void> {
    this.flushPages();
    this.flushEdges();
    this.flushMetrics();
  }

  /**
   * Fetches one URL, emitting `crawl:start` and then either `crawl:success`
   * (with status and duration) or `crawl:error`.
   *
   * @param prevNode Node from the previous graph; its etag / last-modified are sent with the request.
   * @returns The fetch result, or `null` when the fetch threw.
   */
  private async fetchPage(url: string, depth: number, prevNode?: GraphNode): Promise<FetchResult | null> {
    const startTime = Date.now();
    try {
      this.context.emit({ type: 'crawl:start', url });
      const res = await this.fetcher!.fetch(url, {
        maxBytes: this.options.maxBytes,
        crawlDelay: this.robots ? this.robots.getCrawlDelay('crawlith') : undefined,
        etag: prevNode?.etag,
        lastModified: prevNode?.lastModified
      });

      const durationMs = Date.now() - startTime;

      this.context.emit({
        type: 'crawl:success',
        url,
        status: typeof res.status === 'number' ? res.status : 0,
        durationMs,
        depth
      });

      return res;
    } catch (e) {
      this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
      return null;
    }
  }

  /**
   * Handles a 304 response: stores the page with status 200 using the data
   * kept on the previous graph node, marks it `cached`, and re-discovers its
   * outgoing links from the previous graph so the crawl can continue.
   */
  private handleCachedResponse(url: string, finalUrl: string, depth: number, prevNode: GraphNode): void {
    this.bufferPage(finalUrl, depth, 200, {
      html: prevNode.html,
      canonical_url: prevNode.canonical,
      content_hash: prevNode.contentHash,
      simhash: prevNode.simhash,
      etag: prevNode.etag,
      last_modified: prevNode.lastModified,
      noindex: prevNode.noindex ? 1 : 0,
      nofollow: prevNode.nofollow ? 1 : 0
    });
    this.bufferMetrics(finalUrl, {
      crawl_status: 'cached'
    });

    // Re-discover links from the previous graph to continue crawling if needed
    const prevLinks = this.options.previousGraph?.getEdges()
      .filter(e => e.source === url)
      .map(e => e.target);
    if (!prevLinks) return;

    for (const link of prevLinks) {
      const normalizedLink = normalizeUrl(link, '', this.options);
      if (!normalizedLink || normalizedLink === finalUrl) continue;

      this.bufferPage(normalizedLink, depth + 1, 0);
      this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
      if (this.shouldEnqueue(normalizedLink, depth + 1)) {
        this.addToQueue(normalizedLink, depth + 1);
      }
    }
  }

  /**
   * Records every hop of a redirect chain: the source with its redirect
   * status, the target as a not-yet-fetched page, and an edge between them.
   */
  private handleRedirects(chain: FetchResult['redirectChain'], depth: number): void {
    for (const step of chain) {
      const source = normalizeUrl(step.url, '', this.options);
      const target = normalizeUrl(step.target, '', this.options);
      if (!source || !target) continue;

      this.bufferPage(source, depth, step.status);
      this.bufferPage(target, depth, 0);
      this.bufferEdge(source, target);
    }
  }

  /**
   * Handles a 200 response. Non-HTML responses are stored with their status
   * only. HTML is parsed, stored with its extracted metadata, measured
   * (word count, thin-content score, external link ratio) and its links are
   * recorded as edges and enqueued when eligible.
   *
   * @param isBlocked True when the URL is disallowed by robots.txt but was fetched anyway.
   */
  private handleSuccessResponse(res: FetchResult, finalUrl: string, depth: number, isBlocked: boolean = false): void {
    const contentTypeHeader = res.headers['content-type'];
    const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
    if (!contentType || !contentType.toLowerCase().includes('text/html')) {
      this.bufferPage(finalUrl, depth, typeof res.status === 'number' ? res.status : 0);
      return;
    }

    const parseResult = this.parser!.parse(res.body, finalUrl, res.status as number);

    this.bufferPage(finalUrl, depth, res.status as number, {
      html: parseResult.html,
      canonical_url: parseResult.canonical || undefined,
      noindex: parseResult.noindex ? 1 : 0,
      nofollow: parseResult.nofollow ? 1 : 0,
      content_hash: parseResult.contentHash,
      simhash: parseResult.simhash,
      soft404_score: parseResult.soft404Score,
      etag: res.etag,
      last_modified: res.lastModified,
      retries: res.retries
    });

    // Metric calculation is best-effort: a failure is reported but must not
    // prevent link discovery below.
    try {
      const contentAnalysis = analyzeContent(parseResult.html);
      const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, this.rootOrigin);
      const thinScore = calculateThinContentScore(contentAnalysis, 0);

      this.bufferMetrics(finalUrl, {
        crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
        word_count: contentAnalysis.wordCount,
        thin_content_score: thinScore,
        external_link_ratio: linkAnalysis.externalRatio
      });
    } catch (e) {
      this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: finalUrl } });
    }

    for (const linkItem of parseResult.links) {
      const normalizedLink = normalizeUrl(linkItem.url, '', this.options);
      if (!normalizedLink || normalizedLink === finalUrl) continue;

      this.bufferPage(normalizedLink, depth + 1, 0);
      this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
      if (this.shouldEnqueue(normalizedLink, depth + 1)) {
        this.addToQueue(normalizedLink, depth + 1);
      }
    }
  }

  /**
   * Fetches and records a single queue item, dispatching on the response:
   * 304 with a previous node -> cached handling; string status or numeric
   * status >= 300 -> recorded as an error; 200 -> full success handling.
   * Errors are reported as `crawl:error` events rather than thrown.
   *
   * NOTE(review): 2xx statuses other than 200 fall through without being
   * recorded — confirm whether that is intended.
   */
  private async processPage(item: QueueItem, isBlocked: boolean = false): Promise<void> {
    const { url, depth } = item;
    if (this.scopeManager!.isUrlEligible(url) !== 'allowed') {
      // Use the same `security_error` key as the fetch-failure path below so the
      // reason lands in the same field.
      this.bufferPage(url, depth, 0, { security_error: 'blocked_by_domain_filter' });
      return;
    }

    try {
      const prevNode = this.options.previousGraph?.nodes.get(url);
      const res = await this.fetchPage(url, depth, prevNode);
      if (!res) return;

      const finalUrl = normalizeUrl(res.finalUrl, '', this.options);
      if (!finalUrl) return;

      if (res.status === 304 && prevNode) {
        this.handleCachedResponse(url, finalUrl, depth, prevNode);
        return;
      }

      this.handleRedirects(res.redirectChain, depth);

      // A string status is a fetcher-level failure code rather than an HTTP status.
      const isStringStatus = typeof res.status === 'string';
      if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
        const statusNum = typeof res.status === 'number' ? res.status : 0;
        this.bufferPage(finalUrl, depth, statusNum, {
          security_error: isStringStatus ? res.status : undefined,
          retries: res.retries
        });
        this.bufferMetrics(finalUrl, {
          crawl_status: isStringStatus ? res.status : 'fetched_error'
        });
        return;
      }

      if (res.status === 200) {
        this.handleSuccessResponse(res, finalUrl, depth, isBlocked);
      }
    } catch (e) {
      this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
    }
  }

  /**
   * Runs the complete crawl: initialization, module setup, robots.txt,
   * queue seeding, then the fetch loop.
   *
   * The loop ends when the queue is empty and nothing is in flight, or when
   * `options.limit` URLs have been counted and in-flight work has drained.
   * Buffers are then flushed and the snapshot is marked `completed`.
   *
   * @returns The id of the snapshot created for this crawl.
   * @throws Rejects when setup fails, or when scheduling or the final flush /
   * status update throws.
   */
  async run(): Promise<number> {
    await this.initialize();
    this.setupModules();
    await this.fetchRobots();
    await this.seedQueue();

    return new Promise<number>((resolve, reject) => {
      // Guards against finalizing (or rejecting) more than once.
      let settled = false;

      const fail = (e: unknown): void => {
        if (settled) return;
        settled = true;
        reject(e);
      };

      const isIdle = (): boolean => this.queue.length === 0 && this.active === 0;

      // Flush buffers, mark the snapshot completed and settle the promise.
      const finalize = async (limitReached: boolean, announceLimit: boolean): Promise<void> => {
        if (settled) return;
        settled = true;
        try {
          await this.flushAll();
          this.snapshotRepo!.updateSnapshotStatus(this.snapshotId!, 'completed', {
            limit_reached: limitReached ? 1 : 0
          });
          if (announceLimit) {
            this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
          }
          resolve(this.snapshotId!);
        } catch (e) {
          // Surface persistence failures instead of leaving the promise pending.
          reject(e);
        }
      };

      const next = async (): Promise<void> => {
        if (settled) return;
        try {
          if (isIdle()) {
            await finalize(this.reachedLimit, false);
            return;
          }

          if (this.pagesCrawled >= this.options.limit) {
            this.reachedLimit = true;
            // Wait for in-flight pages; the last one to finish re-enters here.
            if (this.active === 0) {
              await finalize(true, true);
            }
            return;
          }

          while (this.queue.length > 0 && this.active < this.concurrency && this.pagesCrawled < this.options.limit) {
            const item = this.queue.shift()!;
            if (this.visited.has(item.url)) continue;

            // Robust robots check: if path doesn't end in /, check both /path and /path/
            // to handle cases where normalization stripped a slash that robots.txt relies on.
            const isBlocked: boolean = Boolean(this.robots) && (
              !this.robots.isAllowed(item.url, 'crawlith') ||
              (!item.url.endsWith('/') && !this.robots.isAllowed(item.url + '/', 'crawlith'))
            );

            if (isBlocked) {
              if (this.options.debug) {
                console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
              }

              // Buffer the page before its metric: a metrics flush resolves URLs to
              // page ids, and a sitemap-seeded URL has no page row until this call.
              this.bufferPage(item.url, item.depth, 0);
              // Tag as blocked for reporting
              this.bufferMetrics(item.url, {
                crawl_status: 'blocked_by_robots'
              });

              if (!this.options.ignoreRobots) {
                this.visited.add(item.url);
                this.pagesCrawled++;
                continue;
              }
            }

            this.active++;
            this.pagesCrawled++;
            this.visited.add(item.url);

            void this.limitConcurrency(() => this.processPage(item, isBlocked))
              .catch((e: unknown) => {
                // processPage reports its own fetch errors; this only covers
                // failures raised outside its try block (e.g. a buffer flush).
                this.context.emit({ type: 'crawl:error', url: item.url, error: String(e), depth: item.depth });
              })
              .finally(() => {
                this.active--;
                void next();
              });
          }

          // The loop may have drained the queue without starting any work
          // (e.g. every remaining URL was robots-blocked).
          if (isIdle()) {
            await finalize(this.reachedLimit, false);
          }
        } catch (e) {
          fail(e);
        }
      };

      void next();
    });
  }
}
package/src/crawler/extract.ts (removed in 0.1.2)
@@ -1,39 +0,0 @@
1
- import * as cheerio from 'cheerio';
2
-
3
/**
 * Extracts all links from an HTML document.
 *
 * Every `<a>` element with a non-empty `href` is resolved against `baseUrl`.
 * Only `http:` and `https:` results are kept, hash fragments are removed
 * (they are irrelevant for crawling), and duplicates are collapsed.
 *
 * @param html The HTML content string
 * @param baseUrl The base URL to resolve relative links against
 * @param onError Optional callback invoked when extraction as a whole fails
 * @returns Absolute URLs in first-seen order; an empty array when extraction fails.
 */
export function extractLinks(html: string, baseUrl: string, onError?: (error: unknown) => void): string[] {
  try {
    const $ = cheerio.load(html);
    const links = new Set<string>();

    for (const element of $('a[href]').toArray()) {
      const href = $(element).attr('href');
      if (!href) continue;

      let absoluteUrl: URL;
      try {
        absoluteUrl = new URL(href, baseUrl);
      } catch {
        // Invalid URL, skip
        continue;
      }

      // Only http(s) links
      if (absoluteUrl.protocol !== 'http:' && absoluteUrl.protocol !== 'https:') continue;

      absoluteUrl.hash = '';
      links.add(absoluteUrl.toString());
    }

    return Array.from(links);
  } catch (e) {
    onError?.(e);
    return [];
  }
}
- }