@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,3 +1,5 @@
1
+ import * as dns from 'dns';
2
+ import { Agent } from 'undici';
1
3
  export declare class IPGuard {
2
4
  /**
3
5
  * Checks if an IP address is internal/private
@@ -7,5 +9,14 @@ export declare class IPGuard {
7
9
  * Resolves a hostname and validates all result IPs
8
10
  */
9
11
  static validateHost(host: string): Promise<boolean>;
12
+ /**
13
+ * Custom lookup function for undici that validates the resolved IP.
14
+ * Prevents DNS Rebinding attacks by checking the IP immediately before connection.
15
+ */
16
+ static secureLookup(hostname: string, options: dns.LookupOneOptions | dns.LookupAllOptions, callback: (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => void): void;
17
+ /**
18
+ * Returns an undici Agent configured with secure DNS lookup.
19
+ */
20
+ static getSecureDispatcher(): Agent;
10
21
  private static expandIPv6;
11
22
  }
@@ -1,6 +1,7 @@
1
1
  import * as dns from 'dns';
2
2
  import * as net from 'net';
3
3
  import { promisify } from 'util';
4
+ import { Agent } from 'undici';
4
5
  const resolve4 = promisify(dns.resolve4);
5
6
  const resolve6 = promisify(dns.resolve6);
6
7
  export class IPGuard {
@@ -43,6 +44,14 @@ export class IPGuard {
43
44
  // fe80::/10 (Link Local)
44
45
  if ((firstWord & 0xffc0) === 0xfe80)
45
46
  return true;
47
+ // IPv4-mapped IPv6: ::ffff:0:0/96
48
+ if (expanded.startsWith('0000:0000:0000:0000:0000:ffff:')) {
49
+ const parts = expanded.split(':');
50
+ const p7 = parseInt(parts[6], 16);
51
+ const p8 = parseInt(parts[7], 16);
52
+ const ip4 = `${(p7 >> 8) & 255}.${p7 & 255}.${(p8 >> 8) & 255}.${p8 & 255}`;
53
+ return IPGuard.isInternal(ip4);
54
+ }
46
55
  return false;
47
56
  }
48
57
  return true; // Unknown format, block it for safety
@@ -67,12 +76,71 @@ export class IPGuard {
67
76
  return false;
68
77
  }
69
78
  }
79
+ /**
80
+ * Custom lookup function for undici that validates the resolved IP.
81
+ * Prevents DNS Rebinding attacks by checking the IP immediately before connection.
82
+ */
83
+ static secureLookup(hostname, options, callback) {
84
+ dns.lookup(hostname, options, (err, address, family) => {
85
+ if (err) {
86
+ return callback(err, address, family);
87
+ }
88
+ const checkIP = (ip) => {
89
+ if (IPGuard.isInternal(ip)) {
90
+ return new Error(`Blocked internal IP: ${ip}`);
91
+ }
92
+ return null;
93
+ };
94
+ if (typeof address === 'string') {
95
+ const error = checkIP(address);
96
+ if (error) {
97
+ // Return a custom error that undici will propagate
98
+ const blockedError = new Error(`Blocked internal IP: ${address}`);
99
+ blockedError.code = 'EBLOCKED';
100
+ return callback(blockedError, address, family);
101
+ }
102
+ }
103
+ else if (Array.isArray(address)) {
104
+ // Handle array of addresses (if options.all is true)
105
+ for (const addr of address) {
106
+ const error = checkIP(addr.address);
107
+ if (error) {
108
+ const blockedError = new Error(`Blocked internal IP: ${addr.address}`);
109
+ blockedError.code = 'EBLOCKED';
110
+ return callback(blockedError, address, family);
111
+ }
112
+ }
113
+ }
114
+ callback(null, address, family);
115
+ });
116
+ }
117
+ /**
118
+ * Returns an undici Agent configured with secure DNS lookup.
119
+ */
120
+ static getSecureDispatcher() {
121
+ return new Agent({
122
+ connect: {
123
+ lookup: IPGuard.secureLookup
124
+ }
125
+ });
126
+ }
70
127
  static expandIPv6(ip) {
71
128
  if (ip === '::')
72
129
  return '0000:0000:0000:0000:0000:0000:0000:0000';
73
- let full = ip;
74
- if (ip.includes('::')) {
75
- const parts = ip.split('::');
130
+ let normalizedIp = ip;
131
+ if (ip.includes('.')) {
132
+ const lastColonIndex = ip.lastIndexOf(':');
133
+ const lastPart = ip.substring(lastColonIndex + 1);
134
+ if (net.isIPv4(lastPart)) {
135
+ const parts = lastPart.split('.').map(Number);
136
+ const hex1 = ((parts[0] << 8) | parts[1]).toString(16);
137
+ const hex2 = ((parts[2] << 8) | parts[3]).toString(16);
138
+ normalizedIp = ip.substring(0, lastColonIndex + 1) + hex1 + ':' + hex2;
139
+ }
140
+ }
141
+ let full = normalizedIp;
142
+ if (normalizedIp.includes('::')) {
143
+ const parts = normalizedIp.split('::');
76
144
  const left = parts[0].split(':').filter(x => x !== '');
77
145
  const right = parts[1].split(':').filter(x => x !== '');
78
146
  const missing = 8 - (left.length + right.length);
@@ -1,22 +1,4 @@
1
- import { Graph } from '../graph/graph.js';
2
- export interface CrawlOptions {
3
- limit: number;
4
- depth: number;
5
- concurrency?: number;
6
- ignoreRobots?: boolean;
7
- stripQuery?: boolean;
8
- previousGraph?: Graph;
9
- sitemap?: string;
10
- debug?: boolean;
11
- detectSoft404?: boolean;
12
- detectTraps?: boolean;
13
- rate?: number;
14
- maxBytes?: number;
15
- allowedDomains?: string[];
16
- deniedDomains?: string[];
17
- includeSubdomains?: boolean;
18
- proxyUrl?: string;
19
- maxRedirects?: number;
20
- userAgent?: string;
21
- }
22
- export declare function crawl(startUrl: string, options: CrawlOptions): Promise<number>;
1
+ import { type CrawlOptions } from './crawler.js';
2
+ import { EngineContext } from '../events.js';
3
+ export type { CrawlOptions };
4
+ export declare function crawl(startUrl: string, options: CrawlOptions, context?: EngineContext): Promise<number>;
@@ -1,336 +1,5 @@
1
- import { request } from 'undici';
2
- import pLimit from 'p-limit';
3
- import chalk from 'chalk';
4
- import robotsParser from 'robots-parser';
5
- import { Fetcher } from './fetcher.js';
6
- import { Parser } from './parser.js';
7
- import { Sitemap } from './sitemap.js';
8
- import { normalizeUrl } from './normalize.js';
9
- import { TrapDetector } from './trap.js';
10
- import { ScopeManager } from '../core/scope/scopeManager.js';
11
- import { getDb } from '../db/index.js';
12
- import { SiteRepository } from '../db/repositories/SiteRepository.js';
13
- import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
14
- import { PageRepository } from '../db/repositories/PageRepository.js';
15
- import { EdgeRepository } from '../db/repositories/EdgeRepository.js';
16
- import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
17
- import { analyzeContent, calculateThinContentScore } from '../analysis/content.js';
18
- import { analyzeLinks } from '../analysis/links.js';
19
- export async function crawl(startUrl, options) {
20
- const visited = new Set();
21
- const concurrency = Math.min(options.concurrency || 2, 10);
22
- const limitConcurrency = pLimit(concurrency);
23
- const trapDetector = new TrapDetector();
24
- const db = getDb();
25
- const siteRepo = new SiteRepository(db);
26
- const snapshotRepo = new SnapshotRepository(db);
27
- const pageRepo = new PageRepository(db);
28
- const edgeRepo = new EdgeRepository(db);
29
- const metricsRepo = new MetricsRepository(db);
30
- const rootUrl = normalizeUrl(startUrl, '', { stripQuery: options.stripQuery });
31
- if (!rootUrl)
32
- throw new Error('Invalid start URL');
33
- const urlObj = new URL(rootUrl);
34
- const domain = urlObj.hostname.replace('www.', '');
35
- const site = siteRepo.firstOrCreateSite(domain);
36
- const siteId = site.id;
37
- const snapshotId = snapshotRepo.createSnapshot(siteId, options.previousGraph ? 'incremental' : 'full');
38
- const rootOrigin = urlObj.origin;
39
- // DB Helper
40
- const savePageToDb = (url, depth, status, data = {}) => {
41
- try {
42
- const existing = pageRepo.getPage(siteId, url);
43
- const isSameSnapshot = existing?.last_seen_snapshot_id === snapshotId;
44
- return pageRepo.upsertAndGetId({
45
- site_id: siteId,
46
- normalized_url: url,
47
- depth: isSameSnapshot ? existing.depth : depth,
48
- http_status: status,
49
- first_seen_snapshot_id: existing ? existing.first_seen_snapshot_id : snapshotId,
50
- last_seen_snapshot_id: snapshotId,
51
- canonical_url: data.canonical !== undefined ? data.canonical : existing?.canonical_url,
52
- content_hash: data.contentHash !== undefined ? data.contentHash : existing?.content_hash,
53
- simhash: data.simhash !== undefined ? data.simhash : existing?.simhash,
54
- etag: data.etag !== undefined ? data.etag : existing?.etag,
55
- last_modified: data.lastModified !== undefined ? data.lastModified : existing?.last_modified,
56
- html: data.html !== undefined ? data.html : existing?.html,
57
- soft404_score: data.soft404Score !== undefined ? data.soft404Score : existing?.soft404_score,
58
- noindex: data.noindex !== undefined ? (data.noindex ? 1 : 0) : existing?.noindex,
59
- nofollow: data.nofollow !== undefined ? (data.nofollow ? 1 : 0) : existing?.nofollow,
60
- security_error: data.securityError !== undefined ? data.securityError : existing?.security_error,
61
- retries: data.retries !== undefined ? data.retries : existing?.retries
62
- });
63
- }
64
- catch (e) {
65
- console.error(`Failed to save page ${url}:`, e);
66
- return null;
67
- }
68
- };
69
- const saveEdgeToDb = (sourceUrl, targetUrl, weight = 1.0, rel = 'internal') => {
70
- try {
71
- const sourceId = pageRepo.getIdByUrl(siteId, sourceUrl);
72
- const targetId = pageRepo.getIdByUrl(siteId, targetUrl);
73
- if (sourceId && targetId) {
74
- edgeRepo.insertEdge(snapshotId, sourceId, targetId, weight, rel);
75
- }
76
- }
77
- catch (e) {
78
- console.error(`Failed to save edge ${sourceUrl} -> ${targetUrl}:`, e);
79
- }
80
- };
81
- // Initialize Modules
82
- const scopeManager = new ScopeManager({
83
- allowedDomains: options.allowedDomains || [],
84
- deniedDomains: options.deniedDomains || [],
85
- includeSubdomains: options.includeSubdomains || false,
86
- rootUrl: startUrl
87
- });
88
- const fetcher = new Fetcher({
89
- rate: options.rate,
90
- proxyUrl: options.proxyUrl,
91
- scopeManager,
92
- maxRedirects: options.maxRedirects,
93
- userAgent: options.userAgent
94
- });
95
- const parser = new Parser();
96
- const sitemapFetcher = new Sitemap();
97
- // Handle robots.txt
98
- let robots = null;
99
- if (!options.ignoreRobots) {
100
- try {
101
- const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
102
- const res = await request(robotsUrl, {
103
- maxRedirections: 3,
104
- headers: { 'User-Agent': 'crawlith/1.0' },
105
- headersTimeout: 5000,
106
- bodyTimeout: 5000
107
- });
108
- if (res.statusCode >= 200 && res.statusCode < 300) {
109
- const txt = await res.body.text();
110
- robots = robotsParser(robotsUrl, txt);
111
- }
112
- else {
113
- await res.body.dump();
114
- }
115
- }
116
- catch {
117
- console.warn('Failed to fetch robots.txt, proceeding...');
118
- }
119
- }
120
- // Queue Setup
121
- const queue = [];
122
- const uniqueQueue = new Set();
123
- const addToQueue = (u, d) => {
124
- if (scopeManager.isUrlEligible(u) !== 'allowed')
125
- return;
126
- if (!uniqueQueue.has(u)) {
127
- uniqueQueue.add(u);
128
- queue.push({ url: u, depth: d });
129
- }
130
- };
131
- // Seed from Sitemap
132
- if (options.sitemap) {
133
- try {
134
- const sitemapUrl = options.sitemap === 'true' ? new URL('/sitemap.xml', rootOrigin).toString() : options.sitemap;
135
- if (sitemapUrl.startsWith('http')) {
136
- console.log(`Fetching sitemap: ${sitemapUrl}`);
137
- const sitemapUrls = await sitemapFetcher.fetch(sitemapUrl);
138
- for (const u of sitemapUrls) {
139
- const normalized = normalizeUrl(u, '', options);
140
- if (normalized)
141
- addToQueue(normalized, 0);
142
- }
143
- }
144
- }
145
- catch (e) {
146
- console.warn('Sitemap fetch failed', e);
147
- }
148
- }
149
- // Seed from startUrl
150
- addToQueue(rootUrl, 0);
151
- let pagesCrawled = 0;
152
- let active = 0;
153
- let reachedLimit = false;
154
- const maxDepthInCrawl = Math.min(options.depth, 10);
155
- const shouldEnqueue = (url, depth) => {
156
- if (visited.has(url))
157
- return false;
158
- if (uniqueQueue.has(url))
159
- return false;
160
- if (depth > maxDepthInCrawl)
161
- return false;
162
- if (scopeManager.isUrlEligible(url) !== 'allowed')
163
- return false;
164
- if (options.detectTraps) {
165
- const trap = trapDetector.checkTrap(url, depth);
166
- if (trap.risk > 0.8)
167
- return false;
168
- }
169
- return true;
170
- };
171
- return new Promise((resolve) => {
172
- const checkDone = () => {
173
- if (queue.length === 0 && active === 0) {
174
- snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
175
- limit_reached: reachedLimit ? 1 : 0
176
- });
177
- resolve(snapshotId);
178
- return true;
179
- }
180
- return false;
181
- };
182
- const next = () => {
183
- if (checkDone())
184
- return;
185
- if (pagesCrawled >= options.limit) {
186
- reachedLimit = true;
187
- if (active === 0) {
188
- snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
189
- limit_reached: 1
190
- });
191
- resolve(snapshotId);
192
- }
193
- return;
194
- }
195
- while (queue.length > 0 && active < concurrency && pagesCrawled < options.limit) {
196
- const item = queue.shift();
197
- if (visited.has(item.url))
198
- continue;
199
- if (robots && !robots.isAllowed(item.url, 'crawlith'))
200
- continue;
201
- active++;
202
- pagesCrawled++;
203
- visited.add(item.url);
204
- limitConcurrency(() => processPage(item)).finally(() => {
205
- active--;
206
- next();
207
- });
208
- }
209
- };
210
- const processPage = async (item) => {
211
- const { url, depth } = item;
212
- if (scopeManager.isUrlEligible(url) !== 'allowed') {
213
- savePageToDb(url, depth, 0, { securityError: 'blocked_by_domain_filter' });
214
- return;
215
- }
216
- const existingInDb = pageRepo.getPage(siteId, url);
217
- savePageToDb(url, depth, 0);
218
- try {
219
- const res = await fetcher.fetch(url, {
220
- etag: existingInDb?.etag || undefined,
221
- lastModified: existingInDb?.last_modified || undefined,
222
- maxBytes: options.maxBytes,
223
- crawlDelay: robots ? robots.getCrawlDelay('crawlith') : undefined
224
- });
225
- if (options.debug) {
226
- console.log(`${chalk.gray(`[D:${depth}]`)} ${res.status} ${chalk.blue(url)}`);
227
- }
228
- if (res.status === 304) {
229
- savePageToDb(url, depth, 304);
230
- metricsRepo.insertMetrics({
231
- snapshot_id: snapshotId,
232
- page_id: existingInDb.id,
233
- authority_score: null,
234
- hub_score: null,
235
- pagerank: null,
236
- pagerank_score: null,
237
- link_role: null,
238
- crawl_status: 'cached',
239
- word_count: null,
240
- thin_content_score: null,
241
- external_link_ratio: null,
242
- orphan_score: null,
243
- duplicate_cluster_id: null,
244
- duplicate_type: null,
245
- is_cluster_primary: 0
246
- });
247
- return;
248
- }
249
- const chain = res.redirectChain;
250
- for (const step of chain) {
251
- const source = normalizeUrl(step.url, '', options);
252
- const target = normalizeUrl(step.target, '', options);
253
- if (source && target) {
254
- savePageToDb(source, depth, step.status);
255
- savePageToDb(target, depth, 0);
256
- saveEdgeToDb(source, target);
257
- }
258
- }
259
- const finalUrl = normalizeUrl(res.finalUrl, '', options);
260
- if (!finalUrl)
261
- return;
262
- const isStringStatus = typeof res.status === 'string';
263
- if (isStringStatus || (typeof res.status === 'number' && res.status >= 300)) {
264
- savePageToDb(finalUrl, depth, typeof res.status === 'number' ? res.status : 0, {
265
- securityError: isStringStatus ? res.status : undefined,
266
- retries: res.retries
267
- });
268
- return;
269
- }
270
- if (res.status === 200) {
271
- const contentTypeHeader = res.headers['content-type'];
272
- const contentType = Array.isArray(contentTypeHeader) ? contentTypeHeader[0] : (contentTypeHeader || '');
273
- if (!contentType || !contentType.toLowerCase().includes('text/html')) {
274
- savePageToDb(finalUrl, depth, res.status);
275
- return;
276
- }
277
- savePageToDb(finalUrl, depth, res.status);
278
- const parseResult = parser.parse(res.body, finalUrl, res.status);
279
- const pageId = savePageToDb(finalUrl, depth, res.status, {
280
- html: parseResult.html,
281
- canonical: parseResult.canonical || undefined,
282
- noindex: parseResult.noindex,
283
- nofollow: parseResult.nofollow,
284
- contentHash: parseResult.contentHash,
285
- simhash: parseResult.simhash,
286
- soft404Score: parseResult.soft404Score,
287
- etag: res.etag,
288
- lastModified: res.lastModified,
289
- retries: res.retries
290
- });
291
- if (pageId) {
292
- try {
293
- const contentAnalysis = analyzeContent(parseResult.html);
294
- const linkAnalysis = analyzeLinks(parseResult.html, finalUrl, rootOrigin);
295
- const thinScore = calculateThinContentScore(contentAnalysis, 0);
296
- metricsRepo.insertMetrics({
297
- snapshot_id: snapshotId,
298
- page_id: pageId,
299
- authority_score: null,
300
- hub_score: null,
301
- pagerank: null,
302
- pagerank_score: null,
303
- link_role: null,
304
- crawl_status: 'fetched',
305
- word_count: contentAnalysis.wordCount,
306
- thin_content_score: thinScore,
307
- external_link_ratio: linkAnalysis.externalRatio,
308
- orphan_score: null,
309
- duplicate_cluster_id: null,
310
- duplicate_type: null,
311
- is_cluster_primary: 0
312
- });
313
- }
314
- catch (e) {
315
- console.error(`Error calculating per-page metrics for ${finalUrl}:`, e);
316
- }
317
- }
318
- for (const linkItem of parseResult.links) {
319
- const normalizedLink = normalizeUrl(linkItem.url, '', options);
320
- if (normalizedLink && normalizedLink !== finalUrl) {
321
- savePageToDb(normalizedLink, depth + 1, 0);
322
- saveEdgeToDb(finalUrl, normalizedLink, 1.0, 'internal');
323
- if (shouldEnqueue(normalizedLink, depth + 1)) {
324
- addToQueue(normalizedLink, depth + 1);
325
- }
326
- }
327
- }
328
- }
329
- }
330
- catch (e) {
331
- console.error(`Error processing ${url}:`, e);
332
- }
333
- };
334
- next();
335
- });
1
+ import { Crawler } from './crawler.js';
2
+ export async function crawl(startUrl, options, context) {
3
+ const crawler = new Crawler(startUrl, options, context);
4
+ return crawler.run();
336
5
  }
@@ -0,0 +1,87 @@
1
+ import { Graph } from '../graph/graph.js';
2
+ import { EngineContext } from '../events.js';
3
+ import { PluginRegistry } from '../plugin-system/plugin-registry.js';
/**
 * Options accepted by `crawl()` / `Crawler`.
 *
 * NOTE(review): the 0.1.2 implementation (crawler.js) is not shown here; the
 * behavioural notes below describe how the 0.1.0 `crawl()` used each field
 * and should be confirmed against the current code.
 */
export interface CrawlOptions {
    // --- Crawl budget ---
    /** Page budget; 0.1.0 stopped scheduling once this many pages had been started. */
    limit: number;
    /** Maximum link depth; 0.1.0 capped the effective value at 10. */
    depth: number;
    /** Parallel page fetches; 0.1.0 defaulted to 2 and capped at 10. */
    concurrency?: number;

    // --- Discovery ---
    /** When set, 0.1.0 skipped fetching and honouring robots.txt. */
    ignoreRobots?: boolean;
    /** Forwarded to URL normalisation. */
    stripQuery?: boolean;
    /** In 0.1.0 the presence of a previous graph made the snapshot 'incremental'. */
    previousGraph?: Graph;
    /**
     * Sitemap seed. Widened from `string` to `string | boolean` in this release.
     * 0.1.0 treated the string 'true' as "<root origin>/sitemap.xml".
     */
    sitemap?: string | boolean;
    /** Not referenced by the 0.1.0 crawl body shown in this diff — TODO confirm usage. */
    detectSoft404?: boolean;
    /** When set, 0.1.0 did not enqueue URLs whose trap risk exceeded 0.8. */
    detectTraps?: boolean;

    // --- Scope (passed to ScopeManager in 0.1.0) ---
    allowedDomains?: Array<string>;
    deniedDomains?: Array<string>;
    includeSubdomains?: boolean;

    // --- Fetching (passed to the Fetcher in 0.1.0) ---
    rate?: number;
    maxBytes?: number;
    proxyUrl?: string;
    maxRedirects?: number;
    userAgent?: string;

    // --- Diagnostics ---
    /** When set, 0.1.0 logged depth, status and URL for each fetched page. */
    debug?: boolean;

    // --- Added in this release; semantics not visible here — TODO confirm ---
    snapshotRunType?: 'completed' | 'incremental' | 'single';
    registry?: PluginRegistry;
    plugins?: any[];
    robots?: any;
}
/**
 * Class-based crawler that `crawl()` now delegates to
 * (`new Crawler(startUrl, options, context).run()`).
 *
 * NOTE(review): only the declaration is visible here; the grouping comments
 * below are inferred from member names and should be confirmed against
 * crawler.js.
 */
export declare class Crawler {
    // Constructor inputs
    private startUrl;
    private options;
    private context;
    private registry?;

    // Queue and scheduling state
    private visited;
    private uniqueQueue;
    private queue;
    private active;
    private pagesCrawled;
    private reachedLimit;
    private maxDepthInCrawl;
    private concurrency;
    private limitConcurrency;

    // Persistence handles
    private siteRepo;
    private snapshotRepo;
    private pageRepo;
    private edgeRepo;
    private metricsRepo;

    // Identity of the current run
    private siteId;
    private snapshotId;
    private reusingSnapshot;
    private runType;
    private rootOrigin;

    // Buffers and progress bookkeeping
    private discoveryDepths;
    private pageBuffer;
    private edgeBuffer;
    private metricsBuffer;
    private pendingSitemaps;
    private edgesFound;
    private lastProgressEmitAt;
    private progressPhase;

    // Collaborating modules
    private scopeManager;
    private fetcher;
    private parser;
    private sitemapFetcher;
    private robots;

    constructor(startUrl: string, options: CrawlOptions, context?: EngineContext);

    // Setup
    private toStorageUrl;
    initialize(): Promise<void>;
    setupModules(): void;
    private fetchRobots;

    // Queue management
    shouldEnqueue(url: string, depth: number): boolean;
    addToQueue(u: string, d: number, data?: any): void;
    seedQueue(): Promise<void>;

    // Buffered writes
    private bufferPage;
    private flushPages;
    private bufferEdge;
    private emitProgress;
    private flushEdges;
    private bufferMetrics;
    private flushMetrics;
    flushAll(): Promise<void>;

    // Per-page pipeline
    private fetchPage;
    private handleCachedResponse;
    private handleRedirects;
    private handleSuccessResponse;
    private processPage;

    /**
     * Runs the crawl. `crawl()` returns this promise's value directly;
     * presumably the snapshot id, as in 0.1.0 — TODO confirm.
     */
    run(): Promise<number>;
}