@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -0,0 +1,683 @@
1
+ import chalk from '../utils/chalk.js';
2
+ import pLimit from 'p-limit';
3
+ import robotsParser from 'robots-parser';
4
+ import { Fetcher } from './fetcher.js';
5
+ import { Parser } from './parser.js';
6
+ import { Sitemap } from './sitemap.js';
7
+ import { normalizeUrl, UrlUtil } from './normalize.js';
8
+ import { UrlResolver } from './resolver.js';
9
+ import { ScopeManager } from '../core/scope/scopeManager.js';
10
+ import { getDb } from '../db/index.js';
11
+ import { SiteRepository } from '../db/repositories/SiteRepository.js';
12
+ import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
13
+ import { PageRepository } from '../db/repositories/PageRepository.js';
14
+ import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
15
+ import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
16
+ import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
17
+ import { analyzeLinks } from '../analysis/links.js';
18
+ import { DEFAULTS } from '../constants.js';
19
// Fallback context used when the caller supplies no event context.
// Keeps critical events visible on the console for consumers that do not
// attach listeners; every other event type is deliberately dropped.
const nullContext = {
    emit: (event) => {
        switch (event.type) {
            case 'error':
                console.error(event.message, event.error || '');
                break;
            case 'warn':
                console.warn(event.message);
                break;
            default:
                // Non-critical events (progress, debug, ...) are ignored here.
                break;
        }
    }
};
32
export class Crawler {
    // BFS web crawler: fetches pages, persists pages/edges/metrics through
    // repository classes, and reports lifecycle/progress via an event context.
    startUrl; // Seed URL; rewritten to storage form during initialize()
    options; // Caller-supplied crawl options (depth, concurrency, sitemap, ...)
    context; // Event context; falls back to nullContext when omitted
    registry; // Optional plugin registry (hook dispatch)
    visited; // URLs already processed
    uniqueQueue; // Set mirror of `queue` for O(1) membership checks
    queue; // Frontier of { url, depth } items
    active; // In-flight fetch count
    pagesCrawled; // Pages fetched so far
    reachedLimit; // Set once a crawl limit stops new work
    maxDepthInCrawl; // Effective depth cap (clamped in the constructor)
    concurrency; // Effective parallelism (clamped in the constructor)
    limitConcurrency; // p-limit gate enforcing `concurrency`
    // Repositories
    siteRepo = null;
    snapshotRepo = null;
    pageRepo = null;
    edgeRepo = null;
    metricsRepo = null;
    // Site/Snapshot info
    siteId = null;
    snapshotId = null;
    reusingSnapshot = false;
    runType = 'completed';
    rootOrigin = ''; // Resolved canonical origin (set in initialize())
    // Discovery tracking
    discoveryDepths = new Map(); // url -> shallowest depth at which it was seen
    // Buffers for batch operations
    pageBuffer = new Map();
    edgeBuffer = [];
    metricsBuffer = [];
    pendingSitemaps = 0; // Background sitemap fetches still running
    edgesFound = 0;
    lastProgressEmitAt = 0; // Timestamp of last crawl:progress (throttling)
    progressPhase = 'crawling';
    // Modules
    scopeManager = null;
    fetcher = null;
    parser = null;
    sitemapFetcher = null;
    robots = null; // robots-parser instance once fetchRobots() succeeds
    /**
     * @param startUrl Seed URL (may be a bare domain; resolved in initialize()).
     * @param options  Crawl options; depth/concurrency are clamped to DEFAULTS limits.
     * @param context  Optional event context; defaults to a console-only fallback.
     */
    constructor(startUrl, options, context) {
        this.startUrl = startUrl;
        this.options = options;
        this.context = context || nullContext;
        this.registry = options.registry;
        this.visited = new Set();
        this.uniqueQueue = new Set();
        this.queue = [];
        this.active = 0;
        this.pagesCrawled = 0;
        this.reachedLimit = false;
        // Clamp user-supplied values so a misconfigured caller cannot exceed hard limits.
        this.maxDepthInCrawl = Math.min(options.depth || DEFAULTS.MAX_DEPTH, DEFAULTS.MAX_DEPTH_LIMIT);
        this.concurrency = Math.min(options.concurrency || DEFAULTS.CONCURRENCY, DEFAULTS.CONCURRENCY_LIMIT);
        this.limitConcurrency = pLimit(this.concurrency);
    }
89
+ toStorageUrl(url) {
90
+ return UrlUtil.isInternal(url, this.rootOrigin) ? UrlUtil.toPath(url, this.rootOrigin) : url;
91
+ }
92
+ async initialize() {
93
+ const db = getDb();
94
+ this.siteRepo = new SiteRepository(db);
95
+ this.snapshotRepo = new SnapshotRepository(db);
96
+ this.pageRepo = new PageRepository(db);
97
+ this.edgeRepo = new EdgeRepository(db);
98
+ this.metricsRepo = new MetricsRepository(db);
99
+ // Use resolver to find canonical origin and SSL
100
+ const resolver = new UrlResolver();
101
+ const tempFetcher = new Fetcher({ userAgent: this.options.userAgent, rate: this.options.rate });
102
+ const resolved = await resolver.resolve(this.startUrl, tempFetcher);
103
+ this.rootOrigin = resolved.url;
104
+ // Use the resolved absolute URL as the base — NOT this.startUrl which may be
105
+ // a bare domain (e.g. 'callforpaper.org') that would be treated as a relative
106
+ // path when passed to normalizeUrl, producing '/callforpaper.org'.
107
+ const rootUrl = normalizeUrl(this.rootOrigin, '', { stripQuery: this.options.stripQuery });
108
+ if (!rootUrl)
109
+ throw new Error('Invalid start URL');
110
+ const urlObj = new URL(this.rootOrigin);
111
+ const domain = urlObj.hostname.replace('www.', '');
112
+ const site = this.siteRepo.firstOrCreateSite(domain);
113
+ this.siteId = site.id;
114
+ // Persist the resolved preferred URL and SSL status
115
+ this.siteRepo.updateSitePreference(this.siteId, {
116
+ preferred_url: this.rootOrigin,
117
+ ssl: this.rootOrigin.startsWith('https') ? 1 : 0
118
+ });
119
+ this.rootOrigin = urlObj.origin;
120
+ // Keep storage path-first for internal URLs and reconcile any legacy absolute rows.
121
+ this.pageRepo.reconcileInternalUrls(this.siteId, this.rootOrigin);
122
+ this.startUrl = this.toStorageUrl(rootUrl);
123
+ // Now that rootOrigin is resolved, initialize ScopeManager with the correct absolute origin
124
+ this.scopeManager = new ScopeManager({
125
+ allowedDomains: this.options.allowedDomains || [],
126
+ deniedDomains: this.options.deniedDomains || [],
127
+ includeSubdomains: this.options.includeSubdomains || false,
128
+ rootUrl: this.rootOrigin
129
+ });
130
+ // Update fetcher with the now-initialized scopeManager
131
+ if (this.fetcher) {
132
+ this.fetcher.scopeManager = this.scopeManager;
133
+ }
134
+ // Every scan now creates a new snapshot (no reuse)
135
+ const runType = this.options.snapshotRunType || (this.options.previousGraph ? 'incremental' : 'completed');
136
+ this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, runType);
137
+ this.runType = runType;
138
+ // Expose snapshot context for plugins that persist per-snapshot data.
139
+ this.context.snapshotId = this.snapshotId;
140
+ // Seed discovery depth for root
141
+ this.discoveryDepths.set(this.startUrl, 0);
142
+ }
143
+ setupModules() {
144
+ this.fetcher = new Fetcher({
145
+ rate: this.options.rate,
146
+ proxyUrl: this.options.proxyUrl,
147
+ scopeManager: this.scopeManager ?? undefined,
148
+ maxRedirects: this.options.maxRedirects,
149
+ userAgent: this.options.userAgent
150
+ });
151
+ this.parser = new Parser();
152
+ this.sitemapFetcher = new Sitemap(this.context, this.fetcher);
153
+ }
154
+ async fetchRobots() {
155
+ const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
156
+ try {
157
+ const res = await this.fetcher.fetch(robotsUrl, { maxBytes: 500000 });
158
+ if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
159
+ this.robots = robotsParser(robotsUrl, res.body);
160
+ }
161
+ }
162
+ catch {
163
+ // Suppressed expected network warnings when robots block
164
+ console.warn('Failed to fetch robots.txt, proceeding...');
165
+ }
166
+ }
167
+ shouldEnqueue(url, depth) {
168
+ if (this.visited.has(url))
169
+ return false;
170
+ if (this.uniqueQueue.has(url))
171
+ return false;
172
+ if (depth > this.maxDepthInCrawl)
173
+ return false;
174
+ if (this.scopeManager.isUrlEligible(url) !== 'allowed')
175
+ return false;
176
+ if (this.registry) {
177
+ const allowed = this.registry.runSyncBailHook('shouldEnqueueUrl', this.context, url, depth);
178
+ if (allowed === false)
179
+ return false;
180
+ }
181
+ return true;
182
+ }
183
+ addToQueue(u, d, data = {}) {
184
+ if (this.scopeManager.isUrlEligible(u) !== 'allowed')
185
+ return;
186
+ if (!this.uniqueQueue.has(u)) {
187
+ this.uniqueQueue.add(u);
188
+ this.queue.push({ url: u, depth: d });
189
+ this.context.emit({ type: 'queue:enqueue', url: u, depth: d });
190
+ this.emitProgress();
191
+ this.bufferPage(u, d, 0, data);
192
+ const currentDiscovery = this.discoveryDepths.get(u);
193
+ if (currentDiscovery === undefined || d < currentDiscovery) {
194
+ this.discoveryDepths.set(u, d);
195
+ }
196
+ }
197
+ }
198
async seedQueue() {
    // Populate the initial crawl frontier: the start URL plus any sitemap URLs.
    // Sitemap fetches run un-awaited in the background; `pendingSitemaps`
    // tracks how many are still outstanding so the crawl loop can wait for them.
    // Seed from startUrl first to ensure it's prioritized in the queue
    this.addToQueue(this.startUrl, 0);
    const sitemapsToFetch = new Set();
    // 1. Explicitly configured sitemap
    if (this.options.sitemap && this.runType !== 'single') {
        // `sitemap: true` (or the string 'true') means "use the conventional
        // /sitemap.xml"; any other string is treated as an explicit URL.
        const explicitUrl = this.options.sitemap === 'true' || this.options.sitemap === true
            ? new URL('/sitemap.xml', this.rootOrigin).toString()
            : this.options.sitemap;
        if (typeof explicitUrl === 'string' && explicitUrl.startsWith('http')) {
            sitemapsToFetch.add(explicitUrl);
        }
    }
    // 2. Discover sitemaps from robots.txt (unless explicitly disabled)
    // Only auto-fetch on the FIRST real crawl (full/incremental).
    // page --live reuses snapshots and should NOT trigger sitemap fetch.
    const isFirstFullCrawl = this.runType !== 'single' && !this.snapshotRepo?.hasFullCrawl(this.siteId);
    if (this.options.sitemap !== false && (this.options.sitemap || isFirstFullCrawl) && this.robots && this.runType !== 'single') {
        const robotsSitemaps = this.robots.getSitemaps();
        for (const s of robotsSitemaps) {
            if (s)
                sitemapsToFetch.add(s);
        }
    }
    // Process all discovered sitemaps in background
    if (sitemapsToFetch.size > 0) {
        for (const sitemapUrl of sitemapsToFetch) {
            this.pendingSitemaps++;
            // KICK OFF BACKGROUND TASK (Un-awaited)
            (async () => {
                try {
                    this.context.emit({ type: 'debug', message: 'Fetching sitemap in background', context: { url: sitemapUrl } });
                    const sitemapUrls = await this.sitemapFetcher.fetch(sitemapUrl);
                    if (sitemapUrls.length > 0) {
                        this.context.emit({ type: 'debug', message: `Mapping ${sitemapUrls.length} URLs from sitemap... (Background)` });
                        // Normalize each sitemap URL to storage form; entries
                        // that fail normalization are dropped.
                        const sitemapEntries = sitemapUrls.map(u => {
                            const normalized = normalizeUrl(u, this.rootOrigin, this.options);
                            if (!normalized)
                                return null;
                            const path = this.toStorageUrl(normalized);
                            return {
                                site_id: this.siteId,
                                normalized_url: path,
                                first_seen_snapshot_id: this.snapshotId,
                                last_seen_snapshot_id: this.snapshotId,
                                discovered_via_sitemap: 1,
                                depth: 0,
                                http_status: 0
                            };
                        }).filter((p) => p !== null);
                        // Bulk register to DB
                        this.pageRepo.upsertMany(sitemapEntries);
                        // Add to queue for Actual Crawling
                        for (const entry of sitemapEntries) {
                            this.addToQueue(entry.normalized_url, 0, { discovered_via_sitemap: 1 });
                        }
                    }
                }
                catch (e) {
                    this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: { url: sitemapUrl, error: String(e) } });
                }
                finally {
                    // Always decrement, success or failure, so the crawl loop
                    // is never blocked on a sitemap that errored out.
                    this.pendingSitemaps--;
                }
            })();
        }
    }
}
266
bufferPage(url, depth, status, data = {}) {
    // Stage a page row for batched upsert. Merges with any already-buffered
    // record for the same URL: always keeps the shallowest known depth, and
    // never downgrades a real HTTP status back to 0 (0 = not yet fetched).
    const existing = this.pageBuffer.get(url);
    const knownDiscovery = this.discoveryDepths.get(url);
    // Always use the best (minimum) depth discovered for this URL
    const finalDepth = knownDiscovery !== undefined ? Math.min(knownDiscovery, depth) : depth;
    if (knownDiscovery === undefined || depth < knownDiscovery) {
        this.discoveryDepths.set(url, depth);
    }
    // If we already have a buffered record, only update if the new one is more "complete" (has status)
    // or if the depth is better.
    if (existing) {
        const isStatusUpdate = status !== 0 && existing.http_status === 0;
        const isBetterDepth = finalDepth < existing.depth;
        // No status gain, no depth gain, no extra fields: nothing to merge.
        if (!isStatusUpdate && !isBetterDepth && Object.keys(data).length === 0) {
            return;
        }
        this.pageBuffer.set(url, {
            ...existing,
            depth: finalDepth,
            http_status: status !== 0 ? status : existing.http_status,
            ...data
        });
    }
    else {
        this.pageBuffer.set(url, {
            site_id: this.siteId,
            normalized_url: url,
            depth: finalDepth,
            http_status: status,
            last_seen_snapshot_id: this.snapshotId,
            ...data
        });
    }
    // Flush once the buffer reaches the batch size (50 rows).
    if (this.pageBuffer.size >= 50) {
        this.flushPages();
    }
}
303
+ flushPages() {
304
+ if (this.pageBuffer.size === 0)
305
+ return;
306
+ this.pageRepo.upsertMany(Array.from(this.pageBuffer.values()));
307
+ this.pageBuffer.clear();
308
+ }
309
+ bufferEdge(sourceUrl, targetUrl, weight = 1.0, rel = 'internal') {
310
+ this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
311
+ this.edgesFound += 1;
312
+ this.emitProgress();
313
+ if (this.edgeBuffer.length >= 100) {
314
+ this.flushEdges();
315
+ }
316
+ }
317
+ emitProgress(force = false) {
318
+ const now = Date.now();
319
+ if (!force && now - this.lastProgressEmitAt < 200)
320
+ return;
321
+ this.lastProgressEmitAt = now;
322
+ this.context.emit({
323
+ type: 'crawl:progress',
324
+ pagesCrawled: this.pagesCrawled,
325
+ queued: this.queue.length,
326
+ active: this.active,
327
+ nodesFound: this.uniqueQueue.size,
328
+ edgesFound: this.edgesFound,
329
+ phase: this.progressPhase
330
+ });
331
+ }
332
flushEdges() {
    // Persist buffered edges. Pages must be flushed first so every edge
    // endpoint has a row (and therefore an id) to resolve against.
    if (this.edgeBuffer.length === 0)
        return;
    // To resolve URLs to IDs, we need to make sure pages are flushed first
    this.flushPages();
    const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
    const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
    // When reusing a snapshot, clean up stale edges for pages being re-crawled
    if (this.reusingSnapshot) {
        const sourcePageIds = new Set(this.edgeBuffer.map(e => urlToId.get(e.sourceUrl)).filter((id) => id !== undefined));
        for (const pageId of sourcePageIds) {
            this.edgeRepo.deleteEdgesForPage(this.snapshotId, pageId);
        }
    }
    // Edges whose source or target cannot be resolved to a page id are dropped.
    const edgesToInsert = this.edgeBuffer
        .map(e => ({
        snapshot_id: this.snapshotId,
        source_page_id: urlToId.get(e.sourceUrl),
        target_page_id: urlToId.get(e.targetUrl),
        weight: e.weight,
        rel: e.rel
    }))
        .filter(e => e.source_page_id !== undefined && e.target_page_id !== undefined);
    if (edgesToInsert.length > 0) {
        this.edgeRepo.insertEdges(edgesToInsert);
    }
    this.edgeBuffer = [];
}
360
+ bufferMetrics(url, data) {
361
+ this.metricsBuffer.push({ url, data });
362
+ if (this.metricsBuffer.length >= 50) {
363
+ this.flushMetrics();
364
+ }
365
+ }
366
flushMetrics() {
    // Persist buffered per-page metrics. Pages are flushed first so each
    // buffered URL can be resolved to a page id; entries whose URL has no
    // page row are silently dropped.
    if (this.metricsBuffer.length === 0)
        return;
    this.flushPages();
    const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
    const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
    const metricsList = this.metricsBuffer.map(item => {
        const pageId = urlToId.get(item.url);
        if (!pageId)
            return null;
        // Give the row a stable shape by defaulting every metrics column to
        // null, then overlay whatever was actually computed (item.data).
        return {
            snapshot_id: this.snapshotId,
            page_id: pageId,
            crawl_status: null,
            word_count: null,
            thin_content_score: null,
            external_link_ratio: null,
            pagerank_score: null,
            hub_score: null,
            auth_score: null,
            link_role: null,
            duplicate_cluster_id: null,
            duplicate_type: null,
            cluster_id: null,
            soft404_score: null,
            heading_score: null,
            orphan_score: null,
            orphan_type: null,
            impact_level: null,
            heading_data: null,
            is_cluster_primary: 0,
            ...item.data
        };
    }).filter(m => m !== null);
    if (metricsList.length > 0) {
        this.metricsRepo.insertMany(metricsList);
    }
    this.metricsBuffer = [];
}
405
+ async flushAll() {
406
+ this.flushPages();
407
+ this.flushEdges();
408
+ this.flushMetrics();
409
+ }
410
async fetchPage(url, depth, prevNode) {
    // Fetch one absolute URL, emitting crawl lifecycle events.
    // Returns the fetch result, or null on any fetch error (the error is
    // reported via a 'crawl:error' event rather than thrown). Conditional
    // headers (etag / last-modified) come from the previous crawl graph node
    // when available, enabling 304 responses.
    const startTime = Date.now();
    try {
        this.context.emit({ type: 'crawl:start', url });
        const res = await this.fetcher.fetch(url, {
            maxBytes: this.options.maxBytes,
            crawlDelay: this.robots ? this.robots.getCrawlDelay('crawlith') : undefined,
            etag: prevNode?.etag,
            lastModified: prevNode?.lastModified
        });
        const durationMs = Date.now() - startTime;
        this.context.emit({
            type: 'crawl:success',
            url,
            // Non-numeric statuses (e.g. error markers) are reported as 0.
            status: typeof res.status === 'number' ? res.status : 0,
            durationMs,
            depth
        });
        return res;
    }
    catch (e) {
        this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
        return null;
    }
}
435
+ handleCachedResponse(url, finalUrl, depth, prevNode) {
436
+ const path = url;
437
+ const finalPath = this.toStorageUrl(finalUrl);
438
+ this.bufferPage(finalPath, depth, prevNode.status, {
439
+ html: prevNode.html,
440
+ canonical_url: prevNode.canonical,
441
+ noindex: prevNode.noindex ? 1 : 0,
442
+ nofollow: prevNode.nofollow ? 1 : 0,
443
+ content_hash: prevNode.contentHash,
444
+ simhash: prevNode.simhash,
445
+ etag: prevNode.etag,
446
+ last_modified: prevNode.lastModified
447
+ });
448
+ this.bufferMetrics(finalPath, {
449
+ crawl_status: 'cached',
450
+ word_count: prevNode.wordCount,
451
+ thin_content_score: prevNode.thinContentScore,
452
+ external_link_ratio: prevNode.externalLinkRatio
453
+ });
454
+ // Re-discovery links from previous graph to continue crawling if needed
455
+ const prevLinks = this.options.previousGraph?.getEdges()
456
+ .filter(e => e.source === path)
457
+ .map(e => e.target);
458
+ if (prevLinks) {
459
+ for (const link of prevLinks) {
460
+ const normalizedLink = normalizeUrl(link, this.rootOrigin, this.options);
461
+ if (normalizedLink) {
462
+ const path = this.toStorageUrl(normalizedLink);
463
+ if (path !== url) {
464
+ this.bufferPage(path, depth + 1, 0);
465
+ this.bufferEdge(url, path, 1.0, 'internal');
466
+ if (this.shouldEnqueue(path, depth + 1)) {
467
+ this.addToQueue(path, depth + 1);
468
+ }
469
+ }
470
+ }
471
+ }
472
+ }
473
+ }
474
+ handleRedirects(chain, depth) {
475
+ for (const step of chain) {
476
+ const sourceAbs = normalizeUrl(step.url, this.rootOrigin, this.options);
477
+ const targetAbs = normalizeUrl(step.target, this.rootOrigin, this.options);
478
+ if (sourceAbs && targetAbs) {
479
+ const sourcePath = this.toStorageUrl(sourceAbs);
480
+ const targetPath = this.toStorageUrl(targetAbs);
481
+ const sourceInternal = UrlUtil.isInternal(sourceAbs, this.rootOrigin);
482
+ const targetInternal = UrlUtil.isInternal(targetAbs, this.rootOrigin);
483
+ this.bufferPage(sourcePath, depth, step.status, { is_internal: sourceInternal ? 1 : 0 });
484
+ this.bufferPage(targetPath, depth, 0, { is_internal: targetInternal ? 1 : 0 });
485
+ this.bufferEdge(sourcePath, targetPath, 1.0, targetInternal ? 'internal' : 'external');
486
+ }
487
+ }
488
+ }
489
handleSuccessResponse(res, path, absoluteUrl, depth, isBlocked = false) {
    // Parse a fetched HTML response, persist its content and per-page metrics,
    // and discover outbound links (queueing internal ones for crawling).
    const contentTypeHeader = res.headers['content-type'];
    const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
    // Non-HTML resources are recorded with their status but never parsed.
    if (!contentType || !contentType.toLowerCase().includes('text/html')) {
        this.bufferPage(path, depth, typeof res.status === 'number' ? res.status : 0);
        return;
    }
    const parseResult = this.parser.parse(res.body, absoluteUrl, res.status);
    // Let plugins observe the parsed page before it is persisted.
    if (this.registry) {
        this.registry.runHook('onPageParsed', this.context, {
            url: absoluteUrl,
            status: res.status,
            depth: depth,
            headers: res.headers,
            ...parseResult
        });
    }
    this.bufferPage(path, depth, res.status, {
        html: parseResult.html,
        canonical_url: parseResult.canonical || undefined,
        noindex: parseResult.noindex ? 1 : 0,
        nofollow: parseResult.nofollow ? 1 : 0,
        content_hash: parseResult.contentHash,
        simhash: parseResult.simhash,
        etag: res.etag,
        last_modified: res.lastModified,
        retries: res.retries,
        bytes_received: res.bytesReceived
    });
    // Per-page content/link metrics; a metrics failure must not abort the crawl.
    try {
        const contentAnalysis = analyzeContent(parseResult.html);
        const linkAnalysis = analyzeLinks(parseResult.html, absoluteUrl, this.rootOrigin);
        const thinScore = calculateThinContentScore(contentAnalysis, 0);
        this.bufferMetrics(path, {
            crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
            word_count: contentAnalysis.wordCount,
            thin_content_score: thinScore,
            external_link_ratio: linkAnalysis.externalRatio
        });
    }
    catch (e) {
        this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: absoluteUrl } });
    }
    // Record each outbound link as a page row plus an edge. Self-links are
    // skipped; only internal links are eligible for enqueueing.
    for (const linkItem of parseResult.links) {
        const normalizedLink = normalizeUrl(linkItem.url, absoluteUrl, this.options);
        if (normalizedLink) {
            const targetPath = this.toStorageUrl(normalizedLink);
            if (targetPath !== path) {
                const isInternal = UrlUtil.isInternal(normalizedLink, this.rootOrigin);
                this.bufferPage(targetPath, depth + 1, 0, { is_internal: isInternal ? 1 : 0 });
                this.bufferEdge(path, targetPath, 1.0, isInternal ? 'internal' : 'external');
                if (isInternal && this.shouldEnqueue(targetPath, depth + 1)) {
                    this.addToQueue(targetPath, depth + 1);
                }
            }
        }
    }
}
547
+ async processPage(item, isBlocked = false) {
548
+ const { url, depth } = item;
549
+ if (this.scopeManager.isUrlEligible(url) !== 'allowed') {
550
+ this.bufferPage(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
551
+ return;
552
+ }
553
+ // Convert stored path to absolute URL for fetching.
554
+ // External/subdomain URLs are already absolute (UrlUtil.toPath returns them as-is).
555
+ const fetchUrl = UrlUtil.toAbsolute(url, this.rootOrigin);
556
+ try {
557
+ const prevNode = this.options.previousGraph?.nodes.get(url);
558
+ const res = await this.fetchPage(fetchUrl, depth, prevNode);
559
+ if (!res)
560
+ return;
561
+ const finalUrl = normalizeUrl(res.finalUrl, this.rootOrigin, this.options);
562
+ if (!finalUrl)
563
+ return;
564
+ const fullUrl = finalUrl; // Already absolute
565
+ const finalPath = this.toStorageUrl(finalUrl);
566
+ if (res.status === 304 && prevNode) {
567
+ this.handleCachedResponse(url, finalUrl, depth, prevNode);
568
+ return;
569
+ }
570
+ this.handleRedirects(res.redirectChain, depth);
571
+ const isStringStatus = typeof res.status === 'string';
572
+ if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
573
+ const statusNum = typeof res.status === 'number' ? res.status : 0;
574
+ this.bufferPage(finalPath, depth, statusNum, {
575
+ security_error: isStringStatus ? res.status : undefined,
576
+ retries: res.retries
577
+ });
578
+ this.bufferMetrics(finalPath, {
579
+ crawl_status: isStringStatus ? res.status : 'fetched_error'
580
+ });
581
+ return;
582
+ }
583
+ if (res.status === 200) {
584
+ this.handleSuccessResponse(res, finalPath, fullUrl, depth, isBlocked);
585
+ }
586
+ }
587
+ catch (e) {
588
+ this.context.emit({ type: 'crawl:error', url, error: String(e), depth });
589
+ }
590
+ }
591
+ async run() {
592
+ // 1. Setup fetcher and basic modules
593
+ this.setupModules();
594
+ // 2. Initialize repositories, resolve URL (SSL/WWW), and set up site context
595
+ await this.initialize();
596
+ if (this.options.robots) {
597
+ this.robots = this.options.robots;
598
+ }
599
+ else {
600
+ await this.fetchRobots();
601
+ }
602
+ await this.seedQueue();
603
+ return new Promise((resolve) => {
604
+ const checkDone = async () => {
605
+ if (this.queue.length === 0 && this.active === 0 && this.pendingSitemaps === 0) {
606
+ this.progressPhase = 'finalizing';
607
+ this.emitProgress(true);
608
+ await this.flushAll();
609
+ this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
610
+ limit_reached: this.reachedLimit ? 1 : 0
611
+ });
612
+ this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
613
+ if (this.reusingSnapshot) {
614
+ this.snapshotRepo.touchSnapshot(this.snapshotId);
615
+ }
616
+ resolve(this.snapshotId);
617
+ return true;
618
+ }
619
+ return false;
620
+ };
621
+ const next = async () => {
622
+ if (await checkDone())
623
+ return;
624
+ if (this.pagesCrawled >= this.options.limit) {
625
+ this.reachedLimit = true;
626
+ this.progressPhase = 'limit reached';
627
+ this.emitProgress();
628
+ if (this.active === 0) {
629
+ this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
630
+ this.progressPhase = 'finalizing';
631
+ this.emitProgress(true);
632
+ await this.flushAll();
633
+ this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
634
+ limit_reached: 1
635
+ });
636
+ this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
637
+ if (this.reusingSnapshot) {
638
+ this.snapshotRepo.touchSnapshot(this.snapshotId);
639
+ }
640
+ resolve(this.snapshotId);
641
+ }
642
+ return;
643
+ }
644
+ while (this.queue.length > 0 && this.active < this.concurrency && this.pagesCrawled < this.options.limit) {
645
+ const item = this.queue.shift();
646
+ if (this.visited.has(item.url))
647
+ continue;
648
+ // Robust robots check: reconstruct absolute URL since robots-parser needs full URLs,
649
+ // not root-relative paths. Also check /path/ variant in case robots.txt uses trailing slash.
650
+ const absUrlForRobots = UrlUtil.toAbsolute(item.url, this.rootOrigin);
651
+ const isBlocked = this.robots && (!this.robots.isAllowed(absUrlForRobots, 'crawlith') ||
652
+ (!absUrlForRobots.endsWith('/') && !this.robots.isAllowed(absUrlForRobots + '/', 'crawlith')));
653
+ if (isBlocked) {
654
+ if (this.options.debug) {
655
+ console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
656
+ }
657
+ // Tag as blocked for reporting
658
+ this.bufferMetrics(item.url, {
659
+ crawl_status: 'blocked_by_robots'
660
+ });
661
+ this.bufferPage(item.url, item.depth, 0);
662
+ if (!this.options.ignoreRobots) {
663
+ this.visited.add(item.url);
664
+ this.pagesCrawled++;
665
+ continue;
666
+ }
667
+ }
668
+ this.active++;
669
+ this.pagesCrawled++;
670
+ this.visited.add(item.url);
671
+ this.limitConcurrency(() => this.processPage(item, isBlocked)).finally(() => {
672
+ this.active--;
673
+ this.emitProgress();
674
+ next();
675
+ });
676
+ }
677
+ this.emitProgress();
678
+ await checkDone();
679
+ };
680
+ next();
681
+ });
682
+ }
683
+ }
@@ -1,5 +1,8 @@
1
1
  /**
2
2
  * Extracts all links from an HTML document.
3
3
  * Returns absolute URLs.
4
+ * @param html The HTML content string
5
+ * @param baseUrl The base URL to resolve relative links against
6
+ * @param onError Optional callback for handling extraction errors
4
7
  */
5
- export declare function extractLinks(html: string, baseUrl: string): string[];
8
+ export declare function extractLinks(html: string, baseUrl: string, onError?: (error: unknown) => void): string[];