@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237):
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,11 +1,11 @@
1
- import chalk from 'chalk';
1
+ import chalk from '../utils/chalk.js';
2
2
  import pLimit from 'p-limit';
3
3
  import robotsParser from 'robots-parser';
4
4
  import { Fetcher } from './fetcher.js';
5
5
  import { Parser } from './parser.js';
6
6
  import { Sitemap } from './sitemap.js';
7
- import { normalizeUrl } from './normalize.js';
8
- import { TrapDetector } from './trap.js';
7
+ import { normalizeUrl, UrlUtil } from './normalize.js';
8
+ import { UrlResolver } from './resolver.js';
9
9
  import { ScopeManager } from '../core/scope/scopeManager.js';
10
10
  import { getDb } from '../db/index.js';
11
11
  import { SiteRepository } from '../db/repositories/SiteRepository.js';
@@ -15,6 +15,7 @@ import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
15
15
  import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
16
16
  import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
17
17
  import { analyzeLinks } from '../analysis/links.js';
18
+ import { DEFAULTS } from '../constants.js';
18
19
  // Fallback context for backward compatibility or when no context is provided
19
20
  const nullContext = {
20
21
  emit: (event) => {
@@ -32,6 +33,7 @@ export class Crawler {
32
33
  startUrl;
33
34
  options;
34
35
  context;
36
+ registry;
35
37
  visited;
36
38
  uniqueQueue;
37
39
  queue;
@@ -50,6 +52,8 @@ export class Crawler {
50
52
  // Site/Snapshot info
51
53
  siteId = null;
52
54
  snapshotId = null;
55
+ reusingSnapshot = false;
56
+ runType = 'completed';
53
57
  rootOrigin = '';
54
58
  // Discovery tracking
55
59
  discoveryDepths = new Map();
@@ -57,27 +61,34 @@ export class Crawler {
57
61
  pageBuffer = new Map();
58
62
  edgeBuffer = [];
59
63
  metricsBuffer = [];
64
+ pendingSitemaps = 0;
65
+ edgesFound = 0;
66
+ lastProgressEmitAt = 0;
67
+ progressPhase = 'crawling';
60
68
  // Modules
61
69
  scopeManager = null;
62
70
  fetcher = null;
63
71
  parser = null;
64
72
  sitemapFetcher = null;
65
- trapDetector = null;
66
73
  robots = null;
67
74
  constructor(startUrl, options, context) {
68
75
  this.startUrl = startUrl;
69
76
  this.options = options;
70
77
  this.context = context || nullContext;
78
+ this.registry = options.registry;
71
79
  this.visited = new Set();
72
80
  this.uniqueQueue = new Set();
73
81
  this.queue = [];
74
82
  this.active = 0;
75
83
  this.pagesCrawled = 0;
76
84
  this.reachedLimit = false;
77
- this.maxDepthInCrawl = Math.min(options.depth, 10);
78
- this.concurrency = Math.min(options.concurrency || 2, 10);
85
+ this.maxDepthInCrawl = Math.min(options.depth || DEFAULTS.MAX_DEPTH, DEFAULTS.MAX_DEPTH_LIMIT);
86
+ this.concurrency = Math.min(options.concurrency || DEFAULTS.CONCURRENCY, DEFAULTS.CONCURRENCY_LIMIT);
79
87
  this.limitConcurrency = pLimit(this.concurrency);
80
88
  }
89
+ toStorageUrl(url) {
90
+ return UrlUtil.isInternal(url, this.rootOrigin) ? UrlUtil.toPath(url, this.rootOrigin) : url;
91
+ }
81
92
  async initialize() {
82
93
  const db = getDb();
83
94
  this.siteRepo = new SiteRepository(db);
@@ -85,41 +96,64 @@ export class Crawler {
85
96
  this.pageRepo = new PageRepository(db);
86
97
  this.edgeRepo = new EdgeRepository(db);
87
98
  this.metricsRepo = new MetricsRepository(db);
88
- const rootUrl = normalizeUrl(this.startUrl, '', { stripQuery: this.options.stripQuery });
99
+ // Use resolver to find canonical origin and SSL
100
+ const resolver = new UrlResolver();
101
+ const tempFetcher = new Fetcher({ userAgent: this.options.userAgent, rate: this.options.rate });
102
+ const resolved = await resolver.resolve(this.startUrl, tempFetcher);
103
+ this.rootOrigin = resolved.url;
104
+ // Use the resolved absolute URL as the base — NOT this.startUrl which may be
105
+ // a bare domain (e.g. 'callforpaper.org') that would be treated as a relative
106
+ // path when passed to normalizeUrl, producing '/callforpaper.org'.
107
+ const rootUrl = normalizeUrl(this.rootOrigin, '', { stripQuery: this.options.stripQuery });
89
108
  if (!rootUrl)
90
109
  throw new Error('Invalid start URL');
91
- const urlObj = new URL(rootUrl);
110
+ const urlObj = new URL(this.rootOrigin);
92
111
  const domain = urlObj.hostname.replace('www.', '');
93
112
  const site = this.siteRepo.firstOrCreateSite(domain);
94
113
  this.siteId = site.id;
95
- const type = this.options.snapshotType || (this.options.previousGraph ? 'incremental' : 'full');
96
- this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, type);
114
+ // Persist the resolved preferred URL and SSL status
115
+ this.siteRepo.updateSitePreference(this.siteId, {
116
+ preferred_url: this.rootOrigin,
117
+ ssl: this.rootOrigin.startsWith('https') ? 1 : 0
118
+ });
97
119
  this.rootOrigin = urlObj.origin;
98
- this.startUrl = rootUrl;
99
- // Seed discovery depth for root
100
- this.discoveryDepths.set(this.startUrl, 0);
101
- }
102
- setupModules() {
120
+ // Keep storage path-first for internal URLs and reconcile any legacy absolute rows.
121
+ this.pageRepo.reconcileInternalUrls(this.siteId, this.rootOrigin);
122
+ this.startUrl = this.toStorageUrl(rootUrl);
123
+ // Now that rootOrigin is resolved, initialize ScopeManager with the correct absolute origin
103
124
  this.scopeManager = new ScopeManager({
104
125
  allowedDomains: this.options.allowedDomains || [],
105
126
  deniedDomains: this.options.deniedDomains || [],
106
127
  includeSubdomains: this.options.includeSubdomains || false,
107
- rootUrl: this.startUrl
128
+ rootUrl: this.rootOrigin
108
129
  });
130
+ // Update fetcher with the now-initialized scopeManager
131
+ if (this.fetcher) {
132
+ this.fetcher.scopeManager = this.scopeManager;
133
+ }
134
+ // Every scan now creates a new snapshot (no reuse)
135
+ const runType = this.options.snapshotRunType || (this.options.previousGraph ? 'incremental' : 'completed');
136
+ this.snapshotId = this.snapshotRepo.createSnapshot(this.siteId, runType);
137
+ this.runType = runType;
138
+ // Expose snapshot context for plugins that persist per-snapshot data.
139
+ this.context.snapshotId = this.snapshotId;
140
+ // Seed discovery depth for root
141
+ this.discoveryDepths.set(this.startUrl, 0);
142
+ }
143
+ setupModules() {
109
144
  this.fetcher = new Fetcher({
110
145
  rate: this.options.rate,
111
146
  proxyUrl: this.options.proxyUrl,
112
- scopeManager: this.scopeManager,
147
+ scopeManager: this.scopeManager ?? undefined,
113
148
  maxRedirects: this.options.maxRedirects,
114
149
  userAgent: this.options.userAgent
115
150
  });
116
151
  this.parser = new Parser();
117
- this.sitemapFetcher = new Sitemap(this.context);
118
- this.trapDetector = new TrapDetector();
152
+ this.sitemapFetcher = new Sitemap(this.context, this.fetcher);
119
153
  }
120
154
  async fetchRobots() {
155
+ const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
121
156
  try {
122
- const robotsUrl = new URL('/robots.txt', this.rootOrigin).toString();
123
157
  const res = await this.fetcher.fetch(robotsUrl, { maxBytes: 500000 });
124
158
  if (res && typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
125
159
  this.robots = robotsParser(robotsUrl, res.body);
@@ -139,20 +173,22 @@ export class Crawler {
139
173
  return false;
140
174
  if (this.scopeManager.isUrlEligible(url) !== 'allowed')
141
175
  return false;
142
- if (this.options.detectTraps) {
143
- const trap = this.trapDetector.checkTrap(url, depth);
144
- if (trap.risk > 0.8)
176
+ if (this.registry) {
177
+ const allowed = this.registry.runSyncBailHook('shouldEnqueueUrl', this.context, url, depth);
178
+ if (allowed === false)
145
179
  return false;
146
180
  }
147
181
  return true;
148
182
  }
149
- addToQueue(u, d) {
183
+ addToQueue(u, d, data = {}) {
150
184
  if (this.scopeManager.isUrlEligible(u) !== 'allowed')
151
185
  return;
152
186
  if (!this.uniqueQueue.has(u)) {
153
187
  this.uniqueQueue.add(u);
154
188
  this.queue.push({ url: u, depth: d });
155
189
  this.context.emit({ type: 'queue:enqueue', url: u, depth: d });
190
+ this.emitProgress();
191
+ this.bufferPage(u, d, 0, data);
156
192
  const currentDiscovery = this.discoveryDepths.get(u);
157
193
  if (currentDiscovery === undefined || d < currentDiscovery) {
158
194
  this.discoveryDepths.set(u, d);
@@ -160,26 +196,72 @@ export class Crawler {
160
196
  }
161
197
  }
162
198
  async seedQueue() {
163
- // Seed from Sitemap
164
- if (this.options.sitemap) {
165
- try {
166
- const sitemapUrl = this.options.sitemap === 'true' ? new URL('/sitemap.xml', this.rootOrigin).toString() : this.options.sitemap;
167
- if (sitemapUrl.startsWith('http')) {
168
- this.context.emit({ type: 'info', message: 'Fetching sitemap', context: { url: sitemapUrl } });
169
- const sitemapUrls = await this.sitemapFetcher.fetch(sitemapUrl);
170
- for (const u of sitemapUrls) {
171
- const normalized = normalizeUrl(u, '', this.options);
172
- if (normalized)
173
- this.addToQueue(normalized, 0);
174
- }
175
- }
199
+ // Seed from startUrl first to ensure it's prioritized in the queue
200
+ this.addToQueue(this.startUrl, 0);
201
+ const sitemapsToFetch = new Set();
202
+ // 1. Explicitly configured sitemap
203
+ if (this.options.sitemap && this.runType !== 'single') {
204
+ const explicitUrl = this.options.sitemap === 'true' || this.options.sitemap === true
205
+ ? new URL('/sitemap.xml', this.rootOrigin).toString()
206
+ : this.options.sitemap;
207
+ if (typeof explicitUrl === 'string' && explicitUrl.startsWith('http')) {
208
+ sitemapsToFetch.add(explicitUrl);
176
209
  }
177
- catch (e) {
178
- this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: e });
210
+ }
211
+ // 2. Discover sitemaps from robots.txt (unless explicitly disabled)
212
+ // Only auto-fetch on the FIRST real crawl (full/incremental).
213
+ // page --live reuses snapshots and should NOT trigger sitemap fetch.
214
+ const isFirstFullCrawl = this.runType !== 'single' && !this.snapshotRepo?.hasFullCrawl(this.siteId);
215
+ if (this.options.sitemap !== false && (this.options.sitemap || isFirstFullCrawl) && this.robots && this.runType !== 'single') {
216
+ const robotsSitemaps = this.robots.getSitemaps();
217
+ for (const s of robotsSitemaps) {
218
+ if (s)
219
+ sitemapsToFetch.add(s);
220
+ }
221
+ }
222
+ // Process all discovered sitemaps in background
223
+ if (sitemapsToFetch.size > 0) {
224
+ for (const sitemapUrl of sitemapsToFetch) {
225
+ this.pendingSitemaps++;
226
+ // KICK OFF BACKGROUND TASK (Un-awaited)
227
+ (async () => {
228
+ try {
229
+ this.context.emit({ type: 'debug', message: 'Fetching sitemap in background', context: { url: sitemapUrl } });
230
+ const sitemapUrls = await this.sitemapFetcher.fetch(sitemapUrl);
231
+ if (sitemapUrls.length > 0) {
232
+ this.context.emit({ type: 'debug', message: `Mapping ${sitemapUrls.length} URLs from sitemap... (Background)` });
233
+ const sitemapEntries = sitemapUrls.map(u => {
234
+ const normalized = normalizeUrl(u, this.rootOrigin, this.options);
235
+ if (!normalized)
236
+ return null;
237
+ const path = this.toStorageUrl(normalized);
238
+ return {
239
+ site_id: this.siteId,
240
+ normalized_url: path,
241
+ first_seen_snapshot_id: this.snapshotId,
242
+ last_seen_snapshot_id: this.snapshotId,
243
+ discovered_via_sitemap: 1,
244
+ depth: 0,
245
+ http_status: 0
246
+ };
247
+ }).filter((p) => p !== null);
248
+ // Bulk register to DB
249
+ this.pageRepo.upsertMany(sitemapEntries);
250
+ // Add to queue for Actual Crawling
251
+ for (const entry of sitemapEntries) {
252
+ this.addToQueue(entry.normalized_url, 0, { discovered_via_sitemap: 1 });
253
+ }
254
+ }
255
+ }
256
+ catch (e) {
257
+ this.context.emit({ type: 'warn', message: 'Sitemap fetch failed', context: { url: sitemapUrl, error: String(e) } });
258
+ }
259
+ finally {
260
+ this.pendingSitemaps--;
261
+ }
262
+ })();
179
263
  }
180
264
  }
181
- // Seed from startUrl
182
- this.addToQueue(this.startUrl, 0);
183
265
  }
184
266
  bufferPage(url, depth, status, data = {}) {
185
267
  const existing = this.pageBuffer.get(url);
@@ -226,10 +308,27 @@ export class Crawler {
226
308
  }
227
309
  bufferEdge(sourceUrl, targetUrl, weight = 1.0, rel = 'internal') {
228
310
  this.edgeBuffer.push({ sourceUrl, targetUrl, weight, rel });
311
+ this.edgesFound += 1;
312
+ this.emitProgress();
229
313
  if (this.edgeBuffer.length >= 100) {
230
314
  this.flushEdges();
231
315
  }
232
316
  }
317
+ emitProgress(force = false) {
318
+ const now = Date.now();
319
+ if (!force && now - this.lastProgressEmitAt < 200)
320
+ return;
321
+ this.lastProgressEmitAt = now;
322
+ this.context.emit({
323
+ type: 'crawl:progress',
324
+ pagesCrawled: this.pagesCrawled,
325
+ queued: this.queue.length,
326
+ active: this.active,
327
+ nodesFound: this.uniqueQueue.size,
328
+ edgesFound: this.edgesFound,
329
+ phase: this.progressPhase
330
+ });
331
+ }
233
332
  flushEdges() {
234
333
  if (this.edgeBuffer.length === 0)
235
334
  return;
@@ -237,6 +336,13 @@ export class Crawler {
237
336
  this.flushPages();
238
337
  const identities = this.pageRepo.getPagesIdentityBySnapshot(this.snapshotId);
239
338
  const urlToId = new Map(identities.map(p => [p.normalized_url, p.id]));
339
+ // When reusing a snapshot, clean up stale edges for pages being re-crawled
340
+ if (this.reusingSnapshot) {
341
+ const sourcePageIds = new Set(this.edgeBuffer.map(e => urlToId.get(e.sourceUrl)).filter((id) => id !== undefined));
342
+ for (const pageId of sourcePageIds) {
343
+ this.edgeRepo.deleteEdgesForPage(this.snapshotId, pageId);
344
+ }
345
+ }
240
346
  const edgesToInsert = this.edgeBuffer
241
347
  .map(e => ({
242
348
  snapshot_id: this.snapshotId,
@@ -270,18 +376,23 @@ export class Crawler {
270
376
  return {
271
377
  snapshot_id: this.snapshotId,
272
378
  page_id: pageId,
273
- authority_score: null,
274
- hub_score: null,
275
- pagerank: null,
276
- pagerank_score: null,
277
- link_role: null,
278
379
  crawl_status: null,
279
380
  word_count: null,
280
381
  thin_content_score: null,
281
382
  external_link_ratio: null,
282
- orphan_score: null,
383
+ pagerank_score: null,
384
+ hub_score: null,
385
+ auth_score: null,
386
+ link_role: null,
283
387
  duplicate_cluster_id: null,
284
388
  duplicate_type: null,
389
+ cluster_id: null,
390
+ soft404_score: null,
391
+ heading_score: null,
392
+ orphan_score: null,
393
+ orphan_type: null,
394
+ impact_level: null,
395
+ heading_data: null,
285
396
  is_cluster_primary: 0,
286
397
  ...item.data
287
398
  };
@@ -322,31 +433,39 @@ export class Crawler {
322
433
  }
323
434
  }
324
435
  handleCachedResponse(url, finalUrl, depth, prevNode) {
325
- this.bufferPage(finalUrl, depth, 200, {
436
+ const path = url;
437
+ const finalPath = this.toStorageUrl(finalUrl);
438
+ this.bufferPage(finalPath, depth, prevNode.status, {
326
439
  html: prevNode.html,
327
440
  canonical_url: prevNode.canonical,
441
+ noindex: prevNode.noindex ? 1 : 0,
442
+ nofollow: prevNode.nofollow ? 1 : 0,
328
443
  content_hash: prevNode.contentHash,
329
444
  simhash: prevNode.simhash,
330
445
  etag: prevNode.etag,
331
- last_modified: prevNode.lastModified,
332
- noindex: prevNode.noindex ? 1 : 0,
333
- nofollow: prevNode.nofollow ? 1 : 0
446
+ last_modified: prevNode.lastModified
334
447
  });
335
- this.bufferMetrics(finalUrl, {
336
- crawl_status: 'cached'
448
+ this.bufferMetrics(finalPath, {
449
+ crawl_status: 'cached',
450
+ word_count: prevNode.wordCount,
451
+ thin_content_score: prevNode.thinContentScore,
452
+ external_link_ratio: prevNode.externalLinkRatio
337
453
  });
338
454
  // Re-discovery links from previous graph to continue crawling if needed
339
455
  const prevLinks = this.options.previousGraph?.getEdges()
340
- .filter(e => e.source === url)
456
+ .filter(e => e.source === path)
341
457
  .map(e => e.target);
342
458
  if (prevLinks) {
343
459
  for (const link of prevLinks) {
344
- const normalizedLink = normalizeUrl(link, '', this.options);
345
- if (normalizedLink && normalizedLink !== finalUrl) {
346
- this.bufferPage(normalizedLink, depth + 1, 0);
347
- this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
348
- if (this.shouldEnqueue(normalizedLink, depth + 1)) {
349
- this.addToQueue(normalizedLink, depth + 1);
460
+ const normalizedLink = normalizeUrl(link, this.rootOrigin, this.options);
461
+ if (normalizedLink) {
462
+ const path = this.toStorageUrl(normalizedLink);
463
+ if (path !== url) {
464
+ this.bufferPage(path, depth + 1, 0);
465
+ this.bufferEdge(url, path, 1.0, 'internal');
466
+ if (this.shouldEnqueue(path, depth + 1)) {
467
+ this.addToQueue(path, depth + 1);
468
+ }
350
469
  }
351
470
  }
352
471
  }
@@ -354,40 +473,53 @@ export class Crawler {
354
473
  }
355
474
  handleRedirects(chain, depth) {
356
475
  for (const step of chain) {
357
- const source = normalizeUrl(step.url, '', this.options);
358
- const target = normalizeUrl(step.target, '', this.options);
359
- if (source && target) {
360
- this.bufferPage(source, depth, step.status);
361
- this.bufferPage(target, depth, 0);
362
- this.bufferEdge(source, target);
476
+ const sourceAbs = normalizeUrl(step.url, this.rootOrigin, this.options);
477
+ const targetAbs = normalizeUrl(step.target, this.rootOrigin, this.options);
478
+ if (sourceAbs && targetAbs) {
479
+ const sourcePath = this.toStorageUrl(sourceAbs);
480
+ const targetPath = this.toStorageUrl(targetAbs);
481
+ const sourceInternal = UrlUtil.isInternal(sourceAbs, this.rootOrigin);
482
+ const targetInternal = UrlUtil.isInternal(targetAbs, this.rootOrigin);
483
+ this.bufferPage(sourcePath, depth, step.status, { is_internal: sourceInternal ? 1 : 0 });
484
+ this.bufferPage(targetPath, depth, 0, { is_internal: targetInternal ? 1 : 0 });
485
+ this.bufferEdge(sourcePath, targetPath, 1.0, targetInternal ? 'internal' : 'external');
363
486
  }
364
487
  }
365
488
  }
366
- handleSuccessResponse(res, finalUrl, depth, isBlocked = false) {
489
+ handleSuccessResponse(res, path, absoluteUrl, depth, isBlocked = false) {
367
490
  const contentTypeHeader = res.headers['content-type'];
368
491
  const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
369
492
  if (!contentType || !contentType.toLowerCase().includes('text/html')) {
370
- this.bufferPage(finalUrl, depth, typeof res.status === 'number' ? res.status : 0);
493
+ this.bufferPage(path, depth, typeof res.status === 'number' ? res.status : 0);
371
494
  return;
372
495
  }
373
- const parseResult = this.parser.parse(res.body, finalUrl, res.status);
374
- this.bufferPage(finalUrl, depth, res.status, {
496
+ const parseResult = this.parser.parse(res.body, absoluteUrl, res.status);
497
+ if (this.registry) {
498
+ this.registry.runHook('onPageParsed', this.context, {
499
+ url: absoluteUrl,
500
+ status: res.status,
501
+ depth: depth,
502
+ headers: res.headers,
503
+ ...parseResult
504
+ });
505
+ }
506
+ this.bufferPage(path, depth, res.status, {
375
507
  html: parseResult.html,
376
508
  canonical_url: parseResult.canonical || undefined,
377
509
  noindex: parseResult.noindex ? 1 : 0,
378
510
  nofollow: parseResult.nofollow ? 1 : 0,
379
511
  content_hash: parseResult.contentHash,
380
512
  simhash: parseResult.simhash,
381
- soft404_score: parseResult.soft404Score,
382
513
  etag: res.etag,
383
514
  last_modified: res.lastModified,
384
- retries: res.retries
515
+ retries: res.retries,
516
+ bytes_received: res.bytesReceived
385
517
  });
386
518
  try {
387
519
  const contentAnalysis = analyzeContent(parseResult.html);
388
- const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, this.rootOrigin);
520
+ const linkAnalysis = analyzeLinks(parseResult.html, absoluteUrl, this.rootOrigin);
389
521
  const thinScore = calculateThinContentScore(contentAnalysis, 0);
390
- this.bufferMetrics(finalUrl, {
522
+ this.bufferMetrics(path, {
391
523
  crawl_status: isBlocked ? 'blocked_by_robots' : 'fetched',
392
524
  word_count: contentAnalysis.wordCount,
393
525
  thin_content_score: thinScore,
@@ -395,15 +527,19 @@ export class Crawler {
395
527
  });
396
528
  }
397
529
  catch (e) {
398
- this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: finalUrl } });
530
+ this.context.emit({ type: 'error', message: 'Error calculating per-page metrics', error: e, context: { url: absoluteUrl } });
399
531
  }
400
532
  for (const linkItem of parseResult.links) {
401
- const normalizedLink = normalizeUrl(linkItem.url, '', this.options);
402
- if (normalizedLink && normalizedLink !== finalUrl) {
403
- this.bufferPage(normalizedLink, depth + 1, 0);
404
- this.bufferEdge(finalUrl, normalizedLink, 1.0, 'internal');
405
- if (this.shouldEnqueue(normalizedLink, depth + 1)) {
406
- this.addToQueue(normalizedLink, depth + 1);
533
+ const normalizedLink = normalizeUrl(linkItem.url, absoluteUrl, this.options);
534
+ if (normalizedLink) {
535
+ const targetPath = this.toStorageUrl(normalizedLink);
536
+ if (targetPath !== path) {
537
+ const isInternal = UrlUtil.isInternal(normalizedLink, this.rootOrigin);
538
+ this.bufferPage(targetPath, depth + 1, 0, { is_internal: isInternal ? 1 : 0 });
539
+ this.bufferEdge(path, targetPath, 1.0, isInternal ? 'internal' : 'external');
540
+ if (isInternal && this.shouldEnqueue(targetPath, depth + 1)) {
541
+ this.addToQueue(targetPath, depth + 1);
542
+ }
407
543
  }
408
544
  }
409
545
  }
@@ -414,14 +550,19 @@ export class Crawler {
414
550
  this.bufferPage(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
415
551
  return;
416
552
  }
553
+ // Convert stored path to absolute URL for fetching.
554
+ // External/subdomain URLs are already absolute (UrlUtil.toPath returns them as-is).
555
+ const fetchUrl = UrlUtil.toAbsolute(url, this.rootOrigin);
417
556
  try {
418
557
  const prevNode = this.options.previousGraph?.nodes.get(url);
419
- const res = await this.fetchPage(url, depth, prevNode);
558
+ const res = await this.fetchPage(fetchUrl, depth, prevNode);
420
559
  if (!res)
421
560
  return;
422
- const finalUrl = normalizeUrl(res.finalUrl, '', this.options);
561
+ const finalUrl = normalizeUrl(res.finalUrl, this.rootOrigin, this.options);
423
562
  if (!finalUrl)
424
563
  return;
564
+ const fullUrl = finalUrl; // Already absolute
565
+ const finalPath = this.toStorageUrl(finalUrl);
425
566
  if (res.status === 304 && prevNode) {
426
567
  this.handleCachedResponse(url, finalUrl, depth, prevNode);
427
568
  return;
@@ -430,17 +571,17 @@ export class Crawler {
430
571
  const isStringStatus = typeof res.status === 'string';
431
572
  if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
432
573
  const statusNum = typeof res.status === 'number' ? res.status : 0;
433
- this.bufferPage(finalUrl, depth, statusNum, {
574
+ this.bufferPage(finalPath, depth, statusNum, {
434
575
  security_error: isStringStatus ? res.status : undefined,
435
576
  retries: res.retries
436
577
  });
437
- this.bufferMetrics(finalUrl, {
578
+ this.bufferMetrics(finalPath, {
438
579
  crawl_status: isStringStatus ? res.status : 'fetched_error'
439
580
  });
440
581
  return;
441
582
  }
442
583
  if (res.status === 200) {
443
- this.handleSuccessResponse(res, finalUrl, depth, isBlocked);
584
+ this.handleSuccessResponse(res, finalPath, fullUrl, depth, isBlocked);
444
585
  }
445
586
  }
446
587
  catch (e) {
@@ -448,17 +589,30 @@ export class Crawler {
448
589
  }
449
590
  }
450
591
  async run() {
451
- await this.initialize();
592
+ // 1. Setup fetcher and basic modules
452
593
  this.setupModules();
453
- await this.fetchRobots();
594
+ // 2. Initialize repositories, resolve URL (SSL/WWW), and set up site context
595
+ await this.initialize();
596
+ if (this.options.robots) {
597
+ this.robots = this.options.robots;
598
+ }
599
+ else {
600
+ await this.fetchRobots();
601
+ }
454
602
  await this.seedQueue();
455
603
  return new Promise((resolve) => {
456
604
  const checkDone = async () => {
457
- if (this.queue.length === 0 && this.active === 0) {
605
+ if (this.queue.length === 0 && this.active === 0 && this.pendingSitemaps === 0) {
606
+ this.progressPhase = 'finalizing';
607
+ this.emitProgress(true);
458
608
  await this.flushAll();
459
609
  this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
460
610
  limit_reached: this.reachedLimit ? 1 : 0
461
611
  });
612
+ this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
613
+ if (this.reusingSnapshot) {
614
+ this.snapshotRepo.touchSnapshot(this.snapshotId);
615
+ }
462
616
  resolve(this.snapshotId);
463
617
  return true;
464
618
  }
@@ -469,12 +623,20 @@ export class Crawler {
469
623
  return;
470
624
  if (this.pagesCrawled >= this.options.limit) {
471
625
  this.reachedLimit = true;
626
+ this.progressPhase = 'limit reached';
627
+ this.emitProgress();
472
628
  if (this.active === 0) {
629
+ this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
630
+ this.progressPhase = 'finalizing';
631
+ this.emitProgress(true);
473
632
  await this.flushAll();
474
633
  this.snapshotRepo.updateSnapshotStatus(this.snapshotId, 'completed', {
475
634
  limit_reached: 1
476
635
  });
477
- this.context.emit({ type: 'crawl:limit-reached', limit: this.options.limit });
636
+ this.snapshotRepo.pruneSnapshots(this.siteId, DEFAULTS.MAX_SNAPSHOTS, DEFAULTS.MAX_SINGLE_SNAPSHOTS, this.snapshotId);
637
+ if (this.reusingSnapshot) {
638
+ this.snapshotRepo.touchSnapshot(this.snapshotId);
639
+ }
478
640
  resolve(this.snapshotId);
479
641
  }
480
642
  return;
@@ -483,10 +645,11 @@ export class Crawler {
483
645
  const item = this.queue.shift();
484
646
  if (this.visited.has(item.url))
485
647
  continue;
486
- // Robust robots check: if path doesn't end in /, check both /path and /path/
487
- // to handle cases where normalization stripped a slash that robots.txt relies on.
488
- const isBlocked = this.robots && (!this.robots.isAllowed(item.url, 'crawlith') ||
489
- (!item.url.endsWith('/') && !this.robots.isAllowed(item.url + '/', 'crawlith')));
648
+ // Robust robots check: reconstruct absolute URL since robots-parser needs full URLs,
649
+ // not root-relative paths. Also check /path/ variant in case robots.txt uses trailing slash.
650
+ const absUrlForRobots = UrlUtil.toAbsolute(item.url, this.rootOrigin);
651
+ const isBlocked = this.robots && (!this.robots.isAllowed(absUrlForRobots, 'crawlith') ||
652
+ (!absUrlForRobots.endsWith('/') && !this.robots.isAllowed(absUrlForRobots + '/', 'crawlith')));
490
653
  if (isBlocked) {
491
654
  if (this.options.debug) {
492
655
  console.log(`${chalk.yellow('⊘ Robots')} ${chalk.gray(item.url)}`);
@@ -507,9 +670,11 @@ export class Crawler {
507
670
  this.visited.add(item.url);
508
671
  this.limitConcurrency(() => this.processPage(item, isBlocked)).finally(() => {
509
672
  this.active--;
673
+ this.emitProgress();
510
674
  next();
511
675
  });
512
676
  }
677
+ this.emitProgress();
513
678
  await checkDone();
514
679
  };
515
680
  next();
@@ -23,7 +23,7 @@ export interface FetchOptions {
23
23
  crawlDelay?: number;
24
24
  }
25
25
  export declare class Fetcher {
26
- private userAgent;
26
+ userAgent: string;
27
27
  private rateLimiter;
28
28
  private proxyAdapter;
29
29
  private secureDispatcher;
@@ -6,16 +6,16 @@ import { RetryPolicy } from '../core/network/retryPolicy.js';
6
6
  import { ResponseLimiter } from '../core/network/responseLimiter.js';
7
7
  import { RedirectController } from '../core/network/redirectController.js';
8
8
  import { ProxyAdapter } from '../core/network/proxyAdapter.js';
9
- import { version } from '../utils/version.js';
9
+ import { DEFAULTS } from '../constants.js';
10
10
  export class Fetcher {
11
- userAgent = 'crawlith/1.0';
11
+ userAgent = DEFAULTS.USER_AGENT;
12
12
  rateLimiter;
13
13
  proxyAdapter;
14
14
  secureDispatcher;
15
15
  scopeManager;
16
16
  maxRedirects;
17
17
  constructor(options = {}) {
18
- this.rateLimiter = new RateLimiter(options.rate || 2);
18
+ this.rateLimiter = new RateLimiter(options.rate || DEFAULTS.RATE_LIMIT);
19
19
  this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
20
20
  if (this.proxyAdapter.dispatcher) {
21
21
  this.secureDispatcher = this.proxyAdapter.dispatcher;
@@ -24,11 +24,11 @@ export class Fetcher {
24
24
  this.secureDispatcher = IPGuard.getSecureDispatcher();
25
25
  }
26
26
  this.scopeManager = options.scopeManager;
27
- this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
28
- this.userAgent = options.userAgent || `crawlith/${version}`;
27
+ this.maxRedirects = Math.min(options.maxRedirects ?? DEFAULTS.MAX_REDIRECTS, DEFAULTS.MAX_REDIRECTS_LIMIT);
28
+ this.userAgent = options.userAgent || DEFAULTS.USER_AGENT;
29
29
  }
30
30
  async fetch(url, options = {}) {
31
- const maxBytes = options.maxBytes || 2000000;
31
+ const maxBytes = options.maxBytes || DEFAULTS.MAX_BYTES;
32
32
  const redirectChain = [];
33
33
  const redirectController = new RedirectController(this.maxRedirects, url);
34
34
  let currentUrl = url;