sentinel-scanner 2.4.1 → 2.5.0
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- package/.cspell.json +19 -51
- package/.github/ISSUE_TEMPLATE/config.yml +1 -1
- package/.github/PULL_REQUEST_TEMPLATE.md +2 -2
- package/.github/workflows/stale.yaml +20 -0
- package/.github/workflows/webapp-scanner.yml +31 -19
- package/.github/workflows/welcome.yaml +9 -55
- package/.husky/pre-commit +35 -0
- package/.vscode/extensions.json +7 -0
- package/.vscode/launch.json +20 -0
- package/.vscode/settings.json +32 -0
- package/.vscode/tasks.json +24 -0
- package/CHANGELOG.md +7 -3
- package/CODE_OF_CONDUCT.md +4 -1
- package/CONTRIBUTING.md +2 -2
- package/README.md +5 -0
- package/api-extractor.json +30 -30
- package/biome.json +6 -32
- package/build/index.d.ts +0 -147
- package/build/index.js +111 -2633
- package/package.json +69 -102
- package/scripts/build.ts +68 -78
- package/scripts/test.ts +55 -0
- package/src/__tests__/spider.test.ts +44 -0
- package/src/commands/spider.ts +61 -126
- package/src/index.ts +23 -26
- package/src/spider/index.ts +345 -0
- package/src/spider/types/index.ts +21 -0
- package/src/spider/types/schema.ts +54 -0
- package/src/utils/index.ts +199 -3
- package/tsconfig.json +19 -18
- package/.github/assets/header.png +0 -0
- package/.github/dependabot.yml +0 -11
- package/.github/workflows/pr.yaml +0 -64
- package/.nsprc +0 -3
- package/build/bin.js +0 -2679
- package/build/xhr-sync-worker.js +0 -59
- package/docs/CNAME +0 -1
- package/docs/disclaimer.md +0 -68
- package/docs/headers/details.md +0 -114
- package/docs/headers/index.md +0 -73
- package/docs/index.md +0 -82
- package/docs/ports/index.md +0 -86
- package/docs/scoring.md +0 -91
- package/docs/spider/index.md +0 -61
- package/docs/sql-injection/details.md +0 -109
- package/docs/sql-injection/index.md +0 -73
- package/docs/xss/details.md +0 -92
- package/docs/xss/index.md +0 -73
- package/scripts/extras/document-shim.js +0 -4
- package/src/bin.ts +0 -29
- package/src/commands/header.ts +0 -150
- package/src/commands/ports.ts +0 -175
- package/src/commands/sqli.ts +0 -150
- package/src/commands/xss.ts +0 -149
- package/src/modules/headers/headers.ts +0 -161
- package/src/modules/headers/index.ts +0 -179
- package/src/modules/ports/index.ts +0 -311
- package/src/modules/spider/index.ts +0 -178
- package/src/modules/sqli/index.ts +0 -486
- package/src/modules/sqli/payloads.json +0 -156
- package/src/modules/xss/index.ts +0 -401
- package/src/modules/xss/payloads.json +0 -2692
- package/src/utils/types.ts +0 -7
package/src/commands/spider.ts
CHANGED
@@ -1,17 +1,9 @@
 import fs from "node:fs";
 import path from "node:path";
 import type { ArgumentsCamelCase, CommandModule } from "yargs";
-import
-import {
-
-export type SpiderScannerCLIOptions = {
-  url: string;
-  depth?: number;
-  output?: string;
-  concurrency?: number;
-  timeout?: number;
-  retries?: number;
-};
+import { Spider } from "../spider/index.ts";
+import type { SpiderConstructorOptions } from "../spider/types/index.ts";
+import { createLogger } from "../utils/index.ts";

 const cliLogger = createLogger("CLI");

@@ -21,145 +13,88 @@ export const spiderCommand: CommandModule = {
     "Crawl a website and get an array of URLs which are internal to the website",
   builder: (yargs) => {
     return yargs
-      .option("
-        alias: "
+      .option("seed", {
+        alias: "s",
+        describe: "The seed URL to start crawling",
         type: "string",
-        description: "The URL of the website to scan",
         demandOption: true,
-        coerce: (
+        coerce: (arg) => {
           try {
-            new URL(
+            new URL(arg);

-            return
+            return arg;
           } catch (error) {
-
+            cliLogger.error(error instanceof Error ? error.message : error);
+            process.exit(1);
           }
         },
       })
-      .option("
+      .option("maxDepth", {
         alias: "d",
+        describe: "The maximum depth to crawl",
         type: "number",
-        description: "The maximum depth to crawl",
         default: 250,
-        coerce: (depth) => {
-          if (depth < 0) {
-            throw new Error("Depth must be a positive number");
-          }
-
-          if (depth > 250) {
-            throw new Error("Depth must be less than 250");
-          }
-
-          return depth;
-        },
       })
-      .option("
-        alias: "
-
-
-
-        coerce: (output) => {
-          try {
-            // Should throw an error if the path is invalid
-            // Should Be A JSON File
-            const resolvedPath = path.resolve(output);
-            const parsedPath = path.parse(resolvedPath);
-
-            if (parsedPath.ext !== ".json") {
-              throw new Error("Output file must be a JSON file");
-            }
-
-            if (fs.existsSync(resolvedPath)) {
-              throw new Error("Output file already exists");
-            }
-
-            return resolvedPath;
-          } catch (error) {
-            throw new Error(`Invalid output file: ${output}`);
-          }
-        },
-        default: getDefaultFilePath(),
+      .option("maxRetries", {
+        alias: "r",
+        describe: "The maximum retries for a failed request",
+        type: "number",
+        default: 3,
       })
       .option("concurrency", {
         alias: "c",
+        describe: "The number of concurrent requests",
         type: "number",
-
-
-
-
-
-
-
-          if (concurrency > 20) {
-            throw new Error("Concurrency must be less than 20");
-          }
-
-          return concurrency;
-        },
+        default: 30,
+      })
+      .option("ignoreExternalLinks", {
+        alias: "i",
+        describe: "Ignore external links",
+        type: "boolean",
+        default: true,
       })
       .option("timeout", {
         alias: "t",
+        describe: "Request timeout in milliseconds",
         type: "number",
-
-        default: 5000,
-        coerce: (timeout) => {
-          if (timeout < 0) {
-            throw new Error("Timeout must be a positive number");
-          }
-
-          if (timeout > 25_000) {
-            throw new Error("Timeout must be less than 25,000");
-          }
-
-          return timeout;
-        },
+        default: 8000,
       })
-      .option("
-        alias: "
-
-
-        default:
-        coerce: (retries) => {
-          if (retries < 0) {
-            throw new Error("Retries must be a positive number");
-          }
-
-          if (retries > 10) {
-            throw new Error("Retries must be less than 10");
-          }
-
-          return retries;
-        },
+      .option("output", {
+        alias: "o",
+        describe: "Output file path",
+        type: "string",
+        default: getDefaultFilePath(),
       });
   },
-  handler: async (
+  handler: async (yargs) => {
+    const args = yargs as ArgumentsCamelCase<{
+      seed: string;
+      maxDepth: number;
+      maxRetries: number;
+      concurrency: number;
+      ignoreExternalLinks: boolean;
+      timeout: number;
+      output: string;
+    }>;
+    const opts: SpiderConstructorOptions = {
+      seed: args.seed,
+      maxDepth: args.maxDepth || 250,
+      maxRetries: args.maxRetries || 3,
+      concurrency: args.concurrency || 30,
+      ignoreExternalLinks:
+        args.ignoreExternalLinks === undefined
+          ? true
+          : args.ignoreExternalLinks,
+      timeout: args.timeout || 8000,
+    };
+
+    const scanner = new Spider(opts);
     try {
-      const
-
-
-        depth: argData.depth ?? 250,
-        concurrency: argData.concurrency ?? 10,
-        timeout: argData.timeout ?? 5000,
-        retries: argData.retries ?? 3,
-      });
-
-      cliLogger.info("Starting to crawl website");
-
-      const results = await scanner.crawl();
-
-      if (argData.output) {
-        fs.writeFileSync(argData.output, JSON.stringify(results, null, 2));
-        cliLogger.info(`Results written to ${argData.output}`);
-      } else {
-        const resolvedPath = getDefaultFilePath();
-        fs.writeFileSync(resolvedPath, JSON.stringify(results, null, 2));
-        cliLogger.info(`Results written to ${resolvedPath}`);
-      }
+      const results = await scanner.scan();
+      fs.writeFileSync(args.output, JSON.stringify(results, null, 2));
+      cliLogger.info(`Results saved to ${args.output}`);
     } catch (error) {
-
-      cliLogger.error(error.message);
-      }
-      cliLogger.error("Failed to run spider command");
+      cliLogger.error(error instanceof Error ? error.message : error);
       process.exit(1);
     }
   },
@@ -187,7 +122,7 @@ const getDefaultFilePath = () => {
     }

     return resolvedPath;
-  } catch (
+  } catch (_) {
     throw new Error("Invalid output file");
   }
 };
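In short, the rewritten command drops the old `url`/`depth`/`output`/`concurrency`/`timeout`/`retries` flags and their hand-rolled `coerce` validation in favour of `--seed/-s`, `--maxDepth/-d`, `--maxRetries/-r`, `--concurrency/-c`, `--ignoreExternalLinks/-i`, `--timeout/-t`, and `--output/-o`, and hands the crawl itself to the new `Spider` class. A minimal sketch of exercising the module with a synthetic argv; the command name `spider` and the argument values are assumptions for illustration, since the `command` property sits in an unchanged part of the file that this diff does not show:

```ts
import yargs from "yargs";
import { spiderCommand } from "./commands/spider.ts";

// Sketch only: feed a hand-built argv instead of process.argv to exercise
// the builder/handler wiring from the diff above. "spider" as the command
// name and the URL/path below are assumed values, not taken from the package.
await yargs(["spider", "--seed", "https://example.com", "--output", "crawl.json"])
  .command(spiderCommand)
  .demandCommand()
  .strict()
  .parse();
```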
package/src/index.ts
CHANGED
@@ -1,27 +1,24 @@
-
-  type HeadersData,
-  type HeaderScannerOptions,
-} from "./modules/headers/index.js";
-import PortsScanner, { type PortsScannerOpts } from "./modules/ports/index.js";
-import SpiderScanner, {
-  type SpiderScannerOptions,
-} from "./modules/spider/index.js";
-import SqliScanner, {
-  type SqliConstructorOpts,
-  type SQLErrors,
-  type SupportedDatabases,
-} from "./modules/sqli/index.js";
-import XSSScanner, { type XSSConstructorOpts } from "./modules/xss/index.js";
-import { Vulnerability } from "./utils/types.js";
+#!/usr/bin/env node --no-warnings

-
-
-
-
-
-
-
-
-
-
-
+import yargs from "yargs";
+import { hideBin } from "yargs/helpers";
+import { spiderCommand } from "./commands/spider.ts";
+import { getPackageData } from "./utils/index.ts";
+
+const { name, version } = getPackageData();
+
+const commandHandler = yargs(hideBin(process.argv));
+
+commandHandler.demandCommand();
+commandHandler.version(version);
+commandHandler.scriptName(name);
+commandHandler.usage("Usage: $0 <command> [options]");
+commandHandler.help().alias("help", "h");
+commandHandler.version().alias("version", "v");
+commandHandler.strict();
+commandHandler.showHelpOnFail(true);
+
+commandHandler.command(spiderCommand);
+
+commandHandler.version().alias("version", "v");
+commandHandler.parse();
package/src/spider/index.ts
ADDED
@@ -0,0 +1,345 @@
+import { parse } from "node-html-parser";
+import {
+  chunkArray,
+  createLogger,
+  safeStringify,
+  withRetries,
+} from "../utils/index.ts";
+import type { SpiderConstructorOptions, SpiderResults } from "./types/index.ts";
+import { SpiderConstructorOptionsSchema } from "./types/schema.ts";
+
+/**
+ * The Spider class is used to scan a web application by crawling through the URLs and extracting information.
+ * The Spider class uses a breadth-first search algorithm to crawl through the URLs.
+ */
+export class Spider {
+  /**
+   * The logger instance for the Spider class.
+   * We use this to log messages to the console.
+   */
+  private logger = createLogger("Spider");
+  /**
+   * The options provided to the Spider constructor.
+   * These options are used to configure the behavior of the Spider.
+   *
+   * @see SpiderConstructorOptionsSchema
+   */
+  private options: SpiderConstructorOptions;
+
+  constructor(opts: SpiderConstructorOptions) {
+    /**
+     * Validate the options provided to the Spider constructor.
+     */
+    const result = SpiderConstructorOptionsSchema.safeParse(opts);
+
+    if (result.error !== undefined || !result.data) {
+      /**
+       * If the options are invalid, we should throw an error and exit the process.
+       */
+      this.logger.error("Invalid options provided to the Spider constructor.");
+      throw new Error(
+        `Invalid options provided to the Spider constructor: ${safeStringify(
+          result.error,
+        )}`,
+      );
+    }
+
+    /**
+     * If the options are valid, we can proceed with the initialization of the Spider.
+     */
+    this.options = SpiderConstructorOptionsSchema.parse(opts);
+
+    /**
+     * Log the options provided to the Spider constructor.
+     */
+    this.logger.info(
+      `Spider created with options: ${safeStringify(this.options)}`,
+    );
+  }
+
+  private isInternalUrl(url: string): boolean {
+    /**
+     * Check if the URL starts with the seed URL.
+     * If it does, then it is an internal URL.
+     * Otherwise, it is an external URL.
+     */
+    return new URL(url).origin === new URL(this.options.seed).origin;
+  }
+
+  /**
+   * Fetches the page at the given URL.
+   * @param url - The URL of the page to fetch.
+   * @returns A promise that resolves to the fetched page content as a string.
+   */
+  private async fetchPage(url: string): Promise<string | null> {
+    const fetchUrl = (url: string) => {
+      this.logger.info(`Fetching URL: ${url}`);
+      /**
+       * We return a promise that resolves when the first of the following promises resolves.
+       * This allows us to handle cases where the request takes too long to complete.
+       */
+      return Promise.race([
+        /**
+         * We use the `fetch` API to fetch the page at the given URL.
+         */
+        fetch(url, {
+          /**
+           * We set the `redirect` option to "follow" to follow redirects.
+           *
+           * @see https://developer.mozilla.org/en-US/docs/Web/API/WindowOrWorkerGlobalScope/fetch#parameters
+           */
+          redirect: "follow",
+        })
+          /**
+           * We extract the text content of the response.
+           * This will be the HTML content of the page.
+           */
+          .then((res) => res.text()),
+        /**
+         * We create a promise that resolves to null after the specified timeout.
+         * This handles cases where the request takes too long to complete.
+         */
+        new Promise<string | null>((resolve) =>
+          setTimeout(() => resolve(null), this.options.timeout),
+        ),
+      ]);
+    };
+
+    /**
+     * Fetch the page at the given URL.
+     * We use the `withRetries` utility function to retry the fetch operation
+     * in case of a failure.
+     */
+    return await withRetries(fetchUrl, [url], this.options.maxRetries);
+  }
+
+  private normalizeUrl(baseUrl: string, href: string): string | null {
+    try {
+      if (href.startsWith("http://") || href.startsWith("https://")) {
+        return new URL(href).toString();
+      }
+
+      if (href.startsWith("/")) {
+        return new URL(href, baseUrl).toString();
+      }
+
+      const url = new URL(href, baseUrl);
+
+      return url.toString();
+    } catch (error) {
+      /**
+       * If an error occurs while normalizing the URL, log the error and return null.
+       */
+      this.logger.error(`Error normalizing URL: ${href}`);
+      this.logger.error(error);
+      return null;
+    }
+  }
+
+  /**
+   * Extracts URLs from the given HTML content using a URL regex and a base URL.
+   *
+   * @param html - The HTML content from which to extract URLs.
+   * @param baseUrl - The base URL used to normalize the extracted URLs.
+   * @returns An array of extracted URLs.
+   */
+  private extractUrls(html: string, baseUrl: string) {
+    const extracted = new Set<string>();
+
+    /**
+     * Parse the HTML content using the `parse` function from the `node-html-parser` package.
+     */
+    const root = parse(html);
+
+    /**
+     * Find all the anchor elements in the HTML content.
+     */
+    const anchors = root
+      .querySelectorAll("a")
+      .concat(root.querySelectorAll("link"))
+      .concat(root.querySelectorAll("area"))
+      .concat(root.querySelectorAll("base"));
+
+    /**
+     * Iterate over the anchor elements.
+     */
+    for (const anchor of anchors) {
+      /**
+       * Extract the `href` attribute from the anchor element.
+       */
+      const href = anchor.getAttribute("href");
+
+      /**
+       * If the `href` attribute is not present, skip to the next anchor element.
+       */
+      if (!href) {
+        continue;
+      }
+
+      /**
+       * Normalize the extracted URL using the base URL.
+       */
+      const normalized = this.normalizeUrl(baseUrl, href);
+
+      if (normalized) {
+        if (
+          this.options.ignoreExternalLinks &&
+          !this.isInternalUrl(normalized)
+        ) {
+          this.logger.info(`Ignoring external URL: ${normalized}`);
+          continue;
+        }
+
+        extracted.add(normalized);
+      }
+    }
+
+    /**
+     * Return the array of extracted URLs.
+     */
+    return Array.from(extracted);
+  }
+
+  /**
+   * Scans the web application by crawling through the URLs and extracting information.
+   * Returns the spider results containing the seed URL and the visited URLs.
+   *
+   * @returns A promise that resolves to the spider results.
+   * @see SpiderResults
+   */
+  public async scan(): Promise<SpiderResults> {
+    this.logger.info("Starting scan...");
+    /**
+     * Create a set to keep track of visited URLs.
+     * This set will be used to avoid visiting the same URL multiple times.
+     * Initially, the set is empty.
+     */
+    const visited = new Set<string>();
+    /**
+     * Create a queue of URLs to visit.
+     * Initially, the queue contains only the seed URL.
+     */
+    const queue = new Set<string>([this.options.seed]);
+
+    /**
+     * Process a URL.
+     * This function fetches the content of the URL, extracts URLs from the content, and adds the extracted URLs to the queue.
+     * It also adds the current URL to the set of visited URLs.
+     *
+     * @param url - The URL to process.
+     * @returns A promise that resolves to an array of extracted URLs.
+     */
+    const processUrl = async (url: string) => {
+      this.logger.info(`Processing URL: ${url}`);
+      /**
+       * Fetch the page at the given URL.
+       */
+      const pageContent = await this.fetchPage(url);
+
+      /**
+       * Extract URLs from the fetched page content.
+       * and log the number of URLs extracted.
+       */
+      if (!pageContent) {
+        this.logger.warn(`Failed to fetch URL: ${url}`);
+        return [];
+      }
+
+      const extractedUrls = this.extractUrls(pageContent, url);
+      this.logger.info(`Extracted ${extractedUrls.length} URLs`);
+
+      /**
+       * Add the current URL to the set of visited URLs.
+       */
+      visited.add(url);
+
+      /**
+       * Return the extracted URLs.
+       */
+      return extractedUrls;
+    };
+
+    /**
+     * Process a batch of URLs.
+     * This function fetches the content of the URLs in the batch,
+     * extracts URLs from the content, and adds the extracted URLs to the queue.
+     * It also removes the processed URLs from the queue.
+     *
+     * @param batch - The batch of URLs to process.
+     * @returns A promise that resolves when the batch is processed.
+     */
+    const processBatch = async (batch: string[]) => {
+      /**
+       * Process the URLs in the current batch.
+       */
+      const promises = batch.map(processUrl);
+      /**
+       * Wait for all the promises to resolve.
+       */
+      const results = await Promise.all(promises);
+      /**
+       * Flatten the results to get a single array of URLs.
+       * Then log the number of URLs processed.
+       */
+      const urls = results.flat();
+      this.logger.info(`Processed ${batch.length} URLs`);
+
+      /**
+       * Add the extracted URLs to the queue.
+       */
+      for (const url of urls) {
+        this.logger.info(`Adding URL to queue: ${url}`);
+        if (!visited.has(url)) {
+          this.logger.info(`URL not visited: ${url}`);
+          queue.add(url);
+          visited.add(url);
+        }
+      }
+
+      /**
+       * Remove the processed URLs from the queue.
+       */
+      for (const url of batch) {
+        queue.delete(url);
+      }
+    };
+
+    /**
+     * Initialize the current depth to 0.
+     */
+    let currentDepth = 0;
+
+    while (queue.size > 0 && currentDepth < this.options.maxDepth) {
+      this.logger.info(`Processing depth: ${currentDepth}`);
+      /**
+       * Split the queue into batches of URLs.
+       */
+      const batches = chunkArray(Array.from(queue), this.options.concurrency);
+      /**
+       * Iterate over the batches of URLs.
+       */
+      for (const batch of batches) {
+        /**
+         * Process the current batch of URLs.
+         */
+        await withRetries(processBatch, [batch], this.options.maxRetries);
+      }
+
+      /**
+       * Increment the current depth.
+       */
+      currentDepth++;
+      this.logger.silly(`Processed depth: ${currentDepth}`);
+    }
+
+    /**
+     * Return The Spider Results
+     *
+     * @see SpiderResults
+     */
+    return {
+      seed: this.options.seed,
+      urls: Array.from(visited),
+    };
+  }
+}
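Outside the CLI, the new `Spider` class can presumably be driven directly; a minimal sketch under that assumption (the import path and the option values below are illustrative, chosen to match the defaults the CLI passes):

```ts
import { Spider } from "./src/spider/index.ts";

// Illustrative values only. The inferred SpiderConstructorOptions type makes
// every field required, so all six options are spelled out here.
const spider = new Spider({
  seed: "https://example.com",
  maxDepth: 250,
  concurrency: 30,
  ignoreExternalLinks: true,
  maxRetries: 3,
  timeout: 8000,
});

const { seed, urls } = await spider.scan();
console.log(`Crawled ${urls.length} internal URLs starting from ${seed}`);
```

The crawl is breadth-first: each pass of the `while` loop drains the current queue in batches of `concurrency` URLs, `withRetries` wraps both individual fetches and whole batches, and a fetch that exceeds `timeout` resolves to `null` and is logged as a warning rather than failing the scan.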
package/src/spider/types/index.ts
ADDED
@@ -0,0 +1,21 @@
+import type { z } from "zod";
+import type {
+  SpiderConstructorOptionsSchema,
+  SpiderResultSchema,
+} from "./schema.ts";
+
+/**
+ * Represents the options for constructing a Spider object.
+ *
+ * @see SpiderConstructorOptionsSchema
+ */
+export type SpiderConstructorOptions = z.infer<
+  typeof SpiderConstructorOptionsSchema
+>;
+
+/**
+ * Represents the result of a Spider object.
+ *
+ * @see SpiderResultSchema
+ */
+export type SpiderResults = z.infer<typeof SpiderResultSchema>;
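One detail worth spelling out: `z.infer` resolves to the schema's output type, so fields with a `.default()` come out non-optional. Expanded by hand (my reading of the schema in the next file, not code that ships in the package), the two types are roughly:

```ts
// Hypothetical hand-written equivalents of the inferred types; the package
// keeps them derived from the zod schemas rather than writing them out.
type SpiderConstructorOptionsExpanded = {
  seed: string;
  maxDepth: number;
  concurrency: number;
  ignoreExternalLinks: boolean;
  maxRetries: number;
  timeout: number;
};

type SpiderResultsExpanded = {
  seed: string;
  urls: string[];
};
```

That is consistent with the CLI handler above, which fills in every field before constructing the `Spider`.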
package/src/spider/types/schema.ts
ADDED
@@ -0,0 +1,54 @@
+import { z } from "zod";
+
+/**
+ * Options for constructing a Spider instance.
+ */
+export const SpiderConstructorOptionsSchema = z
+  .object({
+    /**
+     * The seed URL for the spider to start crawling from.
+     */
+    seed: z.string().url(),
+
+    /**
+     * The maximum depth of crawling. Defaults to 250.
+     */
+    maxDepth: z.number().int().positive().max(250).default(250),
+
+    /**
+     * The concurrency level for crawling. Defaults to 10.
+     */
+    concurrency: z.number().int().positive().max(30).default(30),
+
+    /**
+     * Whether to ignore external links. Defaults to true.
+     */
+    ignoreExternalLinks: z.boolean().default(true),
+    /**
+     * The maximum number of retries for failed requests. Defaults to 3.
+     */
+    maxRetries: z.number().int().positive().max(10).default(3),
+
+    /**
+     * The timeout for requests in milliseconds. Defaults to 5000.
+     */
+    timeout: z.number().int().positive().max(60_000).default(5000),
+  })
+  /**
+   * Ensure that default values are applied when the options are not provided.
+   */
+  .strict();
+
+/**
+ * Represents the result of a spider operation.
+ */
+export const SpiderResultSchema = z.object({
+  /**
+   * The seed URL used for the spider operation.
+   */
+  seed: z.string(),
+  /**
+   * An array of URLs found during the spider operation.
+   */
+  urls: z.array(z.string()),
+});
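Because every option other than `seed` carries a `.default()` and the object is `.strict()`, the schema's runtime behaviour is easy to check in isolation. A small sketch of that behaviour (standard zod semantics; the import path and the `followRobotsTxt` key are made up for illustration):

```ts
import { SpiderConstructorOptionsSchema } from "./src/spider/types/schema.ts";

// With only `seed` supplied, parse() fills in the declared defaults:
// maxDepth 250, concurrency 30, ignoreExternalLinks true, maxRetries 3, timeout 5000.
const parsed = SpiderConstructorOptionsSchema.parse({
  seed: "https://example.com",
});

// Because of .strict(), unknown keys are rejected rather than stripped.
const rejected = SpiderConstructorOptionsSchema.safeParse({
  seed: "https://example.com",
  followRobotsTxt: true, // hypothetical key, not a real option
});

console.log(parsed.timeout, rejected.success); // 5000 false
```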