sentinel-scanner 2.4.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63):
  1. package/.cspell.json +19 -51
  2. package/.github/ISSUE_TEMPLATE/config.yml +1 -1
  3. package/.github/PULL_REQUEST_TEMPLATE.md +2 -2
  4. package/.github/workflows/stale.yaml +20 -0
  5. package/.github/workflows/webapp-scanner.yml +31 -19
  6. package/.github/workflows/welcome.yaml +9 -55
  7. package/.husky/pre-commit +35 -0
  8. package/.vscode/extensions.json +7 -0
  9. package/.vscode/launch.json +20 -0
  10. package/.vscode/settings.json +32 -0
  11. package/.vscode/tasks.json +24 -0
  12. package/CHANGELOG.md +7 -3
  13. package/CODE_OF_CONDUCT.md +4 -1
  14. package/CONTRIBUTING.md +2 -2
  15. package/README.md +5 -0
  16. package/api-extractor.json +30 -30
  17. package/biome.json +6 -32
  18. package/build/index.d.ts +0 -147
  19. package/build/index.js +111 -2633
  20. package/package.json +69 -102
  21. package/scripts/build.ts +68 -78
  22. package/scripts/test.ts +55 -0
  23. package/src/__tests__/spider.test.ts +44 -0
  24. package/src/commands/spider.ts +61 -126
  25. package/src/index.ts +23 -26
  26. package/src/spider/index.ts +345 -0
  27. package/src/spider/types/index.ts +21 -0
  28. package/src/spider/types/schema.ts +54 -0
  29. package/src/utils/index.ts +199 -3
  30. package/tsconfig.json +19 -18
  31. package/.github/assets/header.png +0 -0
  32. package/.github/dependabot.yml +0 -11
  33. package/.github/workflows/pr.yaml +0 -64
  34. package/.nsprc +0 -3
  35. package/build/bin.js +0 -2679
  36. package/build/xhr-sync-worker.js +0 -59
  37. package/docs/CNAME +0 -1
  38. package/docs/disclaimer.md +0 -68
  39. package/docs/headers/details.md +0 -114
  40. package/docs/headers/index.md +0 -73
  41. package/docs/index.md +0 -82
  42. package/docs/ports/index.md +0 -86
  43. package/docs/scoring.md +0 -91
  44. package/docs/spider/index.md +0 -61
  45. package/docs/sql-injection/details.md +0 -109
  46. package/docs/sql-injection/index.md +0 -73
  47. package/docs/xss/details.md +0 -92
  48. package/docs/xss/index.md +0 -73
  49. package/scripts/extras/document-shim.js +0 -4
  50. package/src/bin.ts +0 -29
  51. package/src/commands/header.ts +0 -150
  52. package/src/commands/ports.ts +0 -175
  53. package/src/commands/sqli.ts +0 -150
  54. package/src/commands/xss.ts +0 -149
  55. package/src/modules/headers/headers.ts +0 -161
  56. package/src/modules/headers/index.ts +0 -179
  57. package/src/modules/ports/index.ts +0 -311
  58. package/src/modules/spider/index.ts +0 -178
  59. package/src/modules/sqli/index.ts +0 -486
  60. package/src/modules/sqli/payloads.json +0 -156
  61. package/src/modules/xss/index.ts +0 -401
  62. package/src/modules/xss/payloads.json +0 -2692
  63. package/src/utils/types.ts +0 -7
package/src/modules/headers/index.ts (file removed)
@@ -1,179 +0,0 @@
1
- import type { Vulnerability } from "../../index.js";
2
- import { generateCVSS } from "../../utils/index.js";
3
- import { createLogger } from "../../utils/index.js";
4
- import { informationLeakChecks, securityChecks } from "./headers.js";
5
-
6
/**
 * CVSS base metrics spelled out by name, one short code per metric.
 *
 * The codes are the same ones that appear in the vector string
 * "AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N". Nothing in the visible part of this
 * module reads the object.
 */
const DEFAULT_CVSS_BASE = {
  attackVector: "N",
  attackComplexity: "L",
  privilegesRequired: "N",
  userInteraction: "N",
  scope: "U",
  confidentialityImpact: "N",
  integrityImpact: "N",
  availabilityImpact: "N",
};
16
-
17
/** Constructor options accepted by the header scanner. */
export type HeaderScannerOptions = {
  /** URLs whose response headers will be inspected. */
  spiderResults: string[];
  /** Number of fetch attempts per URL; the scanner falls back to 3 when omitted. */
  retries?: number;
  /** Timeout setting in the scanner's own unit; it falls back to 5000 when omitted. */
  timeout?: number;
  /** Concurrency setting; the scanner falls back to 10 when omitted. */
  concurrency?: number;
};
23
-
24
/** One header rule: which header to look at and how to judge its value. */
export type HeadersData = {
  /** Header name as passed to `Headers.get`. */
  name: string;
  /** Text inserted into the finding's description. */
  description: string;
  /** Remediation text appended to the finding's description. */
  recommendation: string;
  /** Returns `true` when the header value is acceptable. */
  check: (value: string) => boolean;
};
30
-
31
/**
 * Fetches every crawled URL and checks its response headers against two rule
 * sets: security headers that must be present with an acceptable value, and
 * informational headers whose value must pass the rule's `check`.
 *
 * Findings accumulate on the instance and are returned by {@link scan}.
 */
export default class HeaderScanner {
  /** CVSS vector attached to every finding this scanner produces. */
  private static readonly FINDING_VECTOR =
    "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N";

  private securityHeaders: HeadersData[];
  private informationalHeaders: HeadersData[];
  private spiderResults: Array<string>;
  private logger = createLogger("Header Scanner");
  private retries = 3;
  private timeout = 5000;
  private concurrency = 10;
  private vulnerabilities: Array<Vulnerability> = [];

  /**
   * @param options URLs to scan plus optional tuning values. Any tuning value
   *   that is missing, non-finite or below 1 keeps the built-in default.
   */
  constructor(options: HeaderScannerOptions) {
    this.spiderResults = options.spiderResults;

    this.retries = HeaderScanner.positiveIntOr(options.retries, this.retries);
    this.timeout = HeaderScanner.positiveIntOr(options.timeout, this.timeout);
    this.concurrency = HeaderScanner.positiveIntOr(
      options.concurrency,
      this.concurrency,
    );

    this.securityHeaders = securityChecks;
    this.informationalHeaders = informationLeakChecks;
  }

  /**
   * Returns `value` floored to an integer when it is a finite number >= 1,
   * otherwise `fallback`. A chunk size of 0 would make `chunkArray` loop
   * forever and a retry count of 0 would skip the request entirely.
   */
  private static positiveIntOr(
    value: number | undefined,
    fallback: number,
  ): number {
    return value !== undefined && Number.isFinite(value) && value >= 1
      ? Math.floor(value)
      : fallback;
  }

  /**
   * Runs `fn` up to `retries` times and returns the first successful result.
   *
   * @throws the error from the last failed attempt.
   */
  private withRetries = async <T>(
    fn: () => Promise<T>,
    retries: number,
  ): Promise<T> => {
    let lastError: unknown;
    for (let attempt = 0; attempt < retries; attempt++) {
      try {
        return await fn();
      } catch (error) {
        lastError = error;
      }
    }
    throw lastError ?? new Error("withRetries was called with no attempts");
  };

  /**
   * Requests `url` and returns its response headers. The request is aborted
   * once `this.timeout` milliseconds have passed.
   */
  private getHeaders = async (url: string): Promise<Headers> => {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeout);
    try {
      const response = await fetch(url, { signal: controller.signal });
      return response.headers;
    } finally {
      // Always release the timer so a finished request cannot keep it pending.
      clearTimeout(timer);
    }
  };

  /** Splits `array` into consecutive slices of at most `chunkSize` items. */
  private chunkArray = <T>(
    array: Array<T>,
    chunkSize: number,
  ): Array<Array<T>> => {
    const chunks: Array<Array<T>> = [];
    for (let i = 0; i < array.length; i += chunkSize) {
      chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
  };

  /** Scores a finding with the shared vector and stores it. */
  private report = (url: string, description: string): void => {
    const { baseScore: score, severity: level } = generateCVSS(
      HeaderScanner.FINDING_VECTOR,
    );
    this.vulnerabilities.push({
      type: level,
      severity: score,
      url,
      description,
    });
  };

  /**
   * Applies both rule sets to the headers of one response and records a
   * finding for every rule that fails.
   */
  private checkHeaders = (headers: Headers, url: string): void => {
    for (const header of this.securityHeaders) {
      // `get` returns null for an absent header; an empty value is treated
      // the same way, so both cases are reported as missing.
      const value = headers.get(header.name);
      if (!value) {
        this.report(
          url,
          `Header ${header.name} was not found. ${header.description} recommendation: ${header.recommendation}`,
        );
        continue;
      }

      if (!header.check(value)) {
        this.report(
          url,
          `Header ${header.name} was found it had the following value ${value} it ${header.description} recommendation: ${header.recommendation}`,
        );
      }
    }

    for (const infoHeader of this.informationalHeaders) {
      // Informational headers are only a problem when they are present.
      const value = headers.get(infoHeader.name);
      if (!value) {
        continue;
      }

      if (!infoHeader.check(value)) {
        this.report(
          url,
          `Header ${infoHeader.name} was found it ${infoHeader.description} recommendation: ${infoHeader.recommendation}`,
        );
      }
    }
  };

  /**
   * Scans all URLs, `concurrency` at a time. A URL whose headers cannot be
   * fetched after all retries is logged and skipped.
   *
   * @returns every finding recorded on this instance so far.
   */
  async scan(): Promise<Array<Vulnerability>> {
    const chunks = this.chunkArray(this.spiderResults, this.concurrency);

    for (const chunk of chunks) {
      await Promise.all(
        chunk.map(async (url) => {
          try {
            const headers = await this.withRetries(
              () => this.getHeaders(url),
              this.retries,
            );
            this.checkHeaders(headers, url);
          } catch (error) {
            this.logger.error(`Error scanning headers for ${url}: ${error}`);
          }
        }),
      );
    }

    return this.vulnerabilities;
  }
}
package/src/modules/ports/index.ts (file removed)
@@ -1,311 +0,0 @@
1
- import dns from "node:dns";
2
- import net from "node:net";
3
- import { createLogger, generateCVSS } from "../../utils/index.js";
4
- import type { Vulnerability } from "../../utils/types.js"; // Assuming this interface is in types.ts
5
-
6
/** Constructor options accepted by the ports scanner. */
export interface PortsScannerOpts {
  /** Crawled URLs; the scanner probes the hostname of each one. */
  spiderResults: string[];
  /** First port of the range to probe; the scanner starts at 1 when omitted. */
  fromPort?: number;
  /** Last port of the range to probe; the scanner stops at 65535 when omitted. */
  toPort?: number;
  /** Ports left out of the probe; the scanner uses 22, 80 and 443 when omitted. */
  allowList?: number[];
  /** Concurrency setting; the scanner falls back to 30 when omitted. */
  concurrency?: number;
  /** Timeout in milliseconds; the scanner falls back to 10000 when omitted. */
  timeout?: number;
}
14
-
15
/**
 * Probes TCP ports on every hostname found in the crawled URLs and reports
 * each open port as a finding, classified by the banner the service sends.
 */
export default class PortsScanner {
  /**
   * Banner markers and the finding each one produces, checked in order.
   * "HTTPS" is listed before "HTTP" because every banner containing "HTTPS"
   * also contains "HTTP".
   */
  private static readonly SERVICE_SIGNATURES: ReadonlyArray<{
    marker: string;
    vector: string;
    description: string;
  }> = [
    {
      marker: "SSH",
      vector: "CVSS:3.0/AV:N/AC:L/PR:H/UI:N/S:U/C:H/I:H/A:H",
      description: "SSH service detected. Ensure strong credentials.",
    },
    {
      marker: "HTTPS",
      vector: "CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N",
      description:
        "HTTPS service detected. Ensure SSL/TLS is properly configured.",
    },
    {
      marker: "HTTP",
      vector: "CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N",
      description:
        "HTTP service detected. Check for outdated software or misconfigurations.",
    },
    {
      marker: "MySQL",
      vector: "CVSS:3.0/AV:N/AC:L/PR:H/UI:N/S:U/C:H/I:H/A:H",
      description:
        "MySQL service detected. Verify access restrictions and secure configurations.",
    },
    {
      marker: "SMTP",
      vector: "CVSS:3.0/AV:N/AC:L/PR:L/UI:N/S:U/C:L/I:L/A:L",
      description:
        "SMTP service detected. Verify access restrictions and secure configurations.",
    },
    {
      marker: "FTP",
      vector: "CVSS:3.0/AV:N/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H",
      description:
        "FTP service detected. Verify access restrictions and secure configurations.",
    },
  ];

  /**
   * Vector for an unrecognised service. The previous value used "U" for the
   * PR, C, I and A metrics, which is not a valid CVSS 3.0 code for any of
   * them; this is the same no-impact vector the HTTP findings use.
   * NOTE(review): confirm a no-impact score is the intended rating here.
   */
  private static readonly UNKNOWN_SERVICE_VECTOR =
    "CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N";

  private allowList: Array<number> = [22, 80, 443];
  private toScan: Array<number> = [];
  private spiderResults: Array<string> = [];
  private concurrency = 30;
  private timeout = 10000;
  private domain: Set<string> = new Set();
  private logger = createLogger("PortsScanner");

  /**
   * @param opts Crawled URLs plus optional port range, allow list and tuning.
   * @throws Error when `spiderResults` is missing, not an array, empty, or
   *   contains a non-string entry.
   * @throws TypeError when an entry is not a parseable URL.
   */
  constructor(opts: PortsScannerOpts) {
    // Validate before doing any work that depends on the input.
    this.validateSpiderResults(opts.spiderResults);

    this.spiderResults = opts.spiderResults;
    this.allowList = opts.allowList ?? this.allowList;
    this.toScan = this.getPortsToScan(opts.fromPort, opts.toPort);
    this.concurrency = PortsScanner.positiveIntOr(
      opts.concurrency,
      this.concurrency,
    );
    this.timeout = PortsScanner.positiveIntOr(opts.timeout, this.timeout);

    for (const url of this.spiderResults) {
      this.domain.add(this.getDomainFromUrl(url));
    }

    this.logger.info(
      `PortsScanner initialized with ${this.domain.size} domains and ${this.toScan.length} ports to scan`,
    );
  }

  /** Returns `value` floored when it is a finite number >= 1, else `fallback`. */
  private static positiveIntOr(
    value: number | undefined,
    fallback: number,
  ): number {
    return value !== undefined && Number.isFinite(value) && value >= 1
      ? Math.floor(value)
      : fallback;
  }

  /** Rejects anything that is not a non-empty array of strings. */
  private validateSpiderResults(spiderResults: Array<string>) {
    if (!spiderResults) {
      throw new Error("Missing required spiderResults parameter");
    }

    if (!Array.isArray(spiderResults)) {
      throw new Error("spiderResults must be an array");
    }

    if (spiderResults.length === 0) {
      throw new Error("spiderResults array cannot be empty");
    }

    for (const url of spiderResults) {
      if (typeof url !== "string") {
        throw new Error("spiderResults array must contain only strings");
      }
    }
  }

  /**
   * Lists the ports from `fromPort` to `toPort` (inclusive) that are not on
   * the allow list. The range is clamped to 1-65535 so that no value is
   * produced that `socket.connect` would refuse.
   */
  private getPortsToScan(fromPort = 1, toPort = 65535): Array<number> {
    const first = Math.max(1, Math.ceil(fromPort));
    const last = Math.min(65535, Math.floor(toPort));
    const allowSet = new Set(this.allowList);
    const ports: Array<number> = [];

    for (let port = first; port <= last; port++) {
      if (!allowSet.has(port)) {
        ports.push(port);
      }
    }

    this.logger.info(`Scanning ports from ${first} to ${last}`);

    return ports;
  }

  /** Extracts the hostname from a URL string. */
  private getDomainFromUrl(url: string): string {
    return new URL(url).hostname;
  }

  /**
   * Tries to open a TCP connection to `host:port`.
   *
   * @returns `true` when the connection is established, `false` on error or
   *   when `this.timeout` milliseconds pass first. Never rejects.
   */
  private scanPort(host: string, port: number): Promise<boolean> {
    this.logger.info(`Scanning port ${port} on ${host}`);
    return new Promise((resolve) => {
      const socket = new net.Socket();
      let settled = false;

      // Single exit path: the first outcome wins and releases the timer and
      // the socket; later events are ignored.
      const finish = (isOpen: boolean, message: string): void => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        socket.destroy();
        this.logger.info(message);
        resolve(isOpen);
      };

      const timer = setTimeout(
        () => finish(false, `Timeout occurred for ${host}:${port}`),
        this.timeout,
      );

      socket.on("connect", () =>
        finish(true, `Port ${port} is open on ${host}`),
      );
      socket.on("error", (err) =>
        finish(false, `Error for ${host}:${port} - ${err.message}`),
      );

      try {
        socket.connect(port, host);
      } catch (err) {
        // A synchronous throw from connect would otherwise leave the timer
        // pending until it fires.
        finish(false, `Error for ${host}:${port} - ${err}`);
      }
    });
  }

  /** Probes every port in `toScan` on one host, one port at a time. */
  private async scanDomain(domain: string): Promise<Vulnerability[]> {
    const vulnerabilities: Vulnerability[] = [];
    for (const port of this.toScan) {
      try {
        const isOpen = await this.scanPort(domain, port);
        if (!isOpen) continue;

        const vulnerability = await this.generateVulnerability(domain, port);
        this.logger.info(
          `Vulnerability found: ${vulnerability.severity} - ${vulnerability.description} - ${vulnerability.url} - ${vulnerability.type}`,
        );
        vulnerabilities.push(vulnerability);
      } catch (err) {
        this.logger.error(`Error scanning port ${port} on ${domain}: ${err}`);
      }
    }
    return vulnerabilities;
  }

  /**
   * Runs `tasks` with at most `concurrency` of them in flight at once.
   *
   * @returns the results of the tasks that succeeded, in task order. A task
   *   that throws or rejects is logged and left out.
   */
  private async executeWithConcurrency<T>(
    tasks: (() => Promise<T>)[],
    concurrency: number,
  ): Promise<T[]> {
    this.logger.info(
      `Executing ${tasks.length} tasks with concurrency ${concurrency}`,
    );

    const slots: Array<{ value: T } | undefined> = new Array(
      tasks.length,
    ).fill(undefined);
    let nextIndex = 0;

    // Each worker pulls the next unclaimed task until none are left.
    const worker = async (): Promise<void> => {
      while (nextIndex < tasks.length) {
        const index = nextIndex++;
        const task = tasks[index];
        if (!task) continue;
        try {
          slots[index] = { value: await task() };
        } catch (err) {
          this.logger.error(`Task ${index} failed: ${err}`);
        }
      }
    };

    const limit =
      Number.isFinite(concurrency) && concurrency >= 1
        ? Math.floor(concurrency)
        : 1;
    const workerCount = Math.min(limit, tasks.length);
    await Promise.all(Array.from({ length: workerCount }, () => worker()));

    return slots.flatMap((slot) => (slot ? [slot.value] : []));
  }

  /**
   * Connects to `host:port` and collects whatever the service sends.
   *
   * @returns the collected text once the remote end closes, the socket is
   *   idle for `this.timeout` milliseconds, or an error occurs. Never rejects.
   */
  private getBanner(host: string, port: number): Promise<string> {
    return new Promise((resolve) => {
      const socket = new net.Socket();
      let banner = "";

      const finish = (message: string): void => {
        socket.destroy();
        this.logger.info(message);
        resolve(banner);
      };

      socket.setTimeout(this.timeout);

      socket.on("data", (data) => {
        this.logger.info(`Received data from ${host}:${port} - ${data}`);
        banner += data.toString();
      });
      socket.on("end", () => finish(`Banner for ${host}:${port} - ${banner}`));
      socket.on("timeout", () => finish(`Timeout for ${host}:${port}`));
      socket.on("error", () => finish(`Error for ${host}:${port}`));

      socket.connect(port, host);
    });
  }

  /**
   * Builds the finding for an open port from the service banner. The first
   * matching entry of `SERVICE_SIGNATURES` decides the score and description;
   * an unrecognised banner is reported as an unknown service.
   */
  private async generateVulnerability(
    domain: string,
    port: number,
  ): Promise<Vulnerability> {
    const banner = await this.getBanner(domain, port);
    const match = PortsScanner.SERVICE_SIGNATURES.find(({ marker }) =>
      banner.includes(marker),
    );

    const { severity: type, baseScore: severity } = generateCVSS(
      match?.vector ?? PortsScanner.UNKNOWN_SERVICE_VECTOR,
    );
    const description = match
      ? match.description
      : `Unknown service on port ${port}. Investigate further. Banner of the service: ${banner}`;

    return {
      type,
      severity,
      url: `${domain}:${port}`,
      description,
      payloads: undefined,
    };
  }

  /**
   * Scans every collected hostname, up to `concurrency` hosts in parallel.
   *
   * @returns all findings from all hosts as one flat list.
   */
  public async scan(): Promise<Vulnerability[]> {
    this.logger.info("Starting scan");
    const tasks = Array.from(this.domain).map((domain) => () => {
      this.logger.info(`Scanning domain ${domain}`);
      return this.scanDomain(domain);
    });

    const vulnerabilities = await this.executeWithConcurrency(
      tasks,
      this.concurrency,
    );
    return vulnerabilities.flat();
  }
}
package/src/modules/spider/index.ts (file removed)
@@ -1,178 +0,0 @@
1
- import jsdom from "jsdom";
2
- import UserAgent from "user-agents";
3
- import { createLogger } from "../../utils/index.js";
4
-
5
/** Optional tuning values for the spider; each has a default in its constructor. */
export interface SpiderScannerOptions {
  /** Upper bound used by the crawl loop on visited URLs (default 250). */
  depth?: number;
  /** Number of URLs fetched per batch (default 5). */
  concurrency?: number;
  /** Fetch attempts per URL (default 3). */
  retries?: number;
  /** Milliseconds before a fetch is aborted (default 5000). */
  timeout?: number;
}
11
-
12
/**
 * Crawls a site starting from one URL, following links that stay on the same
 * host (ignoring a leading "www.") and skipping static assets.
 */
export default class SpiderScanner {
  /**
   * Path suffixes treated as static assets. Anchored to the end of the URL
   * path so that ".json" or ".jsp" are not mistaken for ".js".
   */
  private static readonly ASSET_EXTENSION =
    /\.(?:jpg|jpeg|png|gif|css|js|svg|woff|woff2|ttf|eot|ico|mp4|webp|pdf)$/i;

  private header: Record<string, string> = {
    "User-Agent": new UserAgent().toString(),
  };
  private url: URL;
  private logger = createLogger("SpiderScanner");

  private depth: number;
  private concurrency: number;
  private retries: number;
  private timeout: number;

  /**
   * @param url Absolute URL where the crawl starts.
   * @param options Optional limits; see {@link SpiderScannerOptions}.
   * @throws Error("Invalid URL") when `url` cannot be parsed.
   */
  constructor(url: string, options: SpiderScannerOptions = {}) {
    const {
      depth = 250,
      concurrency = 5,
      retries = 3,
      timeout = 5000,
    } = options;
    this.depth = depth;
    // A batch size below 1 (or NaN) would make the crawl loop spin forever
    // on empty batches, so fall back to one URL at a time.
    this.concurrency =
      Number.isFinite(concurrency) && concurrency >= 1
        ? Math.floor(concurrency)
        : 1;
    this.retries = retries;
    this.timeout = timeout;

    try {
      this.url = new URL(url);
      this.logger.info(
        `Initialized with URL: ${url}, User-Agent: ${this.header["User-Agent"]}`,
      );
    } catch (error) {
      if (error instanceof TypeError) {
        this.logger.error("Invalid URL");
        throw new Error("Invalid URL");
      }
      this.logger.error(`Unexpected error in constructor: ${error}`);
      throw error;
    }
  }

  /** Drops a leading "www." so both host spellings compare equal. */
  private normalizeDomain(domain: string): string {
    return domain.startsWith("www.") ? domain.slice(4) : domain;
  }

  /**
   * Tells whether `url` is an http(s) URL on the same host as the start URL.
   * Relative input is resolved against the start URL. Unparseable input is
   * logged and treated as external.
   */
  private isInternalLink(url: string): boolean {
    try {
      const parsedUrl = new URL(url, this.url.href);
      if (!["http:", "https:"].includes(parsedUrl.protocol)) {
        return false;
      }
      const baseDomain = this.normalizeDomain(this.url.hostname);
      const parsedDomain = this.normalizeDomain(parsedUrl.hostname);
      return parsedDomain === baseDomain;
    } catch (error) {
      this.logger.warn(`Error parsing URL: ${url} - ${error}`);
      return false;
    }
  }

  /**
   * Turns one `href` found on `pageUrl` into the absolute URL to crawl, or
   * `null` when the link is unparseable, external, or points at an asset.
   * The fragment is removed so "page#a" and "page#b" count as one page.
   */
  private toCrawlableUrl(href: string, pageUrl: string): string | null {
    let target: URL;
    try {
      target = new URL(href, pageUrl);
    } catch (error) {
      this.logger.warn(`Error parsing URL: ${href} - ${error}`);
      return null;
    }

    if (!this.isInternalLink(target.href)) return null;
    if (SpiderScanner.ASSET_EXTENSION.test(target.pathname)) return null;

    target.hash = "";
    return target.href;
  }

  /**
   * Downloads `url`, trying up to `retries` times with a fresh random
   * User-Agent on each attempt.
   *
   * @returns the response body of the first successful (2xx) attempt, or
   *   `null` when every attempt fails or times out.
   */
  private async fetchWithRetries(
    url: string,
    retries: number,
  ): Promise<string | null> {
    for (let attempt = 1; attempt <= retries; attempt++) {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), this.timeout);

      try {
        this.logger.debug(`Fetching URL (Attempt ${attempt}): ${url}`);
        const randomUserAgent = new UserAgent().toString();
        this.logger.info(`Changing User-Agent to: ${randomUserAgent}`);

        // Build the headers per request: several fetches run at once and
        // must not overwrite each other's User-Agent on a shared object.
        const response = await fetch(url, {
          headers: { ...this.header, "User-Agent": randomUserAgent },
          signal: controller.signal,
        });

        if (response.ok) {
          const body = await response.text();
          this.logger.info(`Successfully fetched URL: ${url}`);
          return body;
        }

        this.logger.warn(`Failed to fetch URL (${response.status}): ${url}`);
      } catch (error) {
        const timedOut =
          typeof error === "object" &&
          error !== null &&
          (error as { name?: unknown }).name === "AbortError";
        if (timedOut) {
          this.logger.warn(`Fetch timed out: ${url}`);
        } else {
          this.logger.error(`Error fetching URL: ${url} - ${error}`);
        }
      } finally {
        // Runs on every path, so a failed attempt cannot leave its timer
        // pending; it also keeps the body read inside the timeout.
        clearTimeout(timeoutId);
      }
    }
    return null;
  }

  /**
   * Collects the crawlable links of one page.
   *
   * @param html Page markup.
   * @param pageUrl URL the markup was fetched from; relative links are
   *   resolved against it. Defaults to the start URL.
   * @returns unique absolute URLs, without fragments.
   */
  private extractLinks(html: string, pageUrl: string = this.url.href): string[] {
    const { JSDOM } = jsdom;
    // Passing `url` makes `anchor.href` resolve relative to this page.
    const dom = new JSDOM(html, { url: pageUrl });
    try {
      const anchors = Array.from(
        dom.window.document.querySelectorAll("a[href]"),
      ) as Array<{ href: string }>;

      const pages = new Set<string>();
      for (const anchor of anchors) {
        const target = this.toCrawlableUrl(anchor.href, pageUrl);
        if (target) pages.add(target);
      }

      this.logger.debug(
        `Extracted ${pages.size} internal links from HTML content`,
      );
      return Array.from(pages);
    } finally {
      // Release the window once the links have been read.
      dom.window.close();
    }
  }

  /**
   * Crawls breadth-first in batches of `concurrency` URLs until the queue is
   * empty or `depth` URLs have been visited.
   *
   * @returns the URLs that were fetched successfully.
   */
  public async crawl(): Promise<Array<string>> {
    const visited = new Set<string>();
    const queue = new Set<string>([this.url.href]);
    const resultLinks = new Set<string>();

    const fetchAndExtract = async (currentUrl: string): Promise<void> => {
      if (visited.has(currentUrl)) {
        this.logger.debug(`Skipping already visited URL: ${currentUrl}`);
        return;
      }
      visited.add(currentUrl);
      this.logger.info(`Visiting URL: ${currentUrl}`);

      const html = await this.fetchWithRetries(currentUrl, this.retries);
      if (!html) return;

      for (const link of this.extractLinks(html, currentUrl)) {
        if (!visited.has(link) && queue.size < this.depth) {
          queue.add(link);
          this.logger.debug(`Added to queue: ${link}`);
        }
      }
      resultLinks.add(currentUrl);
    };

    this.logger.info(
      `Starting crawl with depth: ${this.depth}, concurrency: ${this.concurrency}`,
    );
    while (queue.size > 0 && visited.size < this.depth) {
      // Never take more URLs than the remaining budget, so a full batch
      // cannot push the visit count past `depth`.
      const remaining = Math.ceil(this.depth - visited.size);
      const batch = Array.from(queue).slice(
        0,
        Math.min(this.concurrency, remaining),
      );
      for (const url of batch) {
        queue.delete(url);
      }
      await Promise.allSettled(batch.map((url) => fetchAndExtract(url)));
    }

    this.logger.info(
      `Crawling completed. Total pages visited: ${resultLinks.size}`,
    );

    return Array.from(resultLinks);
  }
}
- }