sentinel-scanner 1.0.1 → 1.1.0-alpha.2

@@ -0,0 +1,193 @@
+ import fs from "node:fs";
+ import path from "node:path";
+ import type { ArgumentsCamelCase, CommandModule } from "yargs";
+ import SpiderScanner from "../modules/spider";
+ import { createLogger } from "../utils";
+
+ export type SpiderScannerCLIOptions = {
+   url: string;
+   depth?: number;
+   output?: string;
+   concurrency?: number;
+   timeout?: number;
+   retries?: number;
+ };
+
+ const cliLogger = createLogger("CLI");
+
+ export const spiderCommand: CommandModule = {
+   command: "spider",
+   describe:
+     "Crawl a website and collect the URLs that are internal to it",
+   builder: (yargs) => {
+     return yargs
+       .option("url", {
+         alias: "u",
+         type: "string",
+         description: "The URL of the website to scan",
+         demandOption: true,
+         coerce: (url) => {
+           try {
+             new URL(url);
+             return url;
+           } catch {
+             throw new Error(`Invalid URL: ${url}`);
+           }
+         },
+       })
+       .option("depth", {
+         alias: "d",
+         type: "number",
+         description: "The maximum depth to crawl",
+         default: 250,
+         coerce: (depth) => {
+           if (depth < 0) {
+             throw new Error("Depth must not be negative");
+           }
+           if (depth > 250) {
+             throw new Error("Depth must not exceed 250");
+           }
+           return depth;
+         },
+       })
+       .option("output", {
+         alias: "o",
+         type: "string",
+         description:
+           "The output file to write the results to. Must be a JSON file",
+         coerce: (output) => {
+           try {
+             // Must resolve to a .json path that does not already exist
+             const resolvedPath = path.resolve(output);
+             const parsedPath = path.parse(resolvedPath);
+             if (parsedPath.ext !== ".json") {
+               throw new Error("Output file must be a JSON file");
+             }
+             if (fs.existsSync(resolvedPath)) {
+               throw new Error("Output file already exists");
+             }
+             return resolvedPath;
+           } catch (error) {
+             throw new Error(
+               `Invalid output file: ${output} (${(error as Error).message})`,
+             );
+           }
+         },
+         default: getDefaultFilePath(),
+       })
+       .option("concurrency", {
+         alias: "c",
+         type: "number",
+         description: "The number of concurrent requests to make",
+         default: 10,
+         coerce: (concurrency) => {
+           if (concurrency < 1) {
+             throw new Error("Concurrency must be at least 1");
+           }
+           if (concurrency > 20) {
+             throw new Error("Concurrency must not exceed 20");
+           }
+           return concurrency;
+         },
+       })
+       .option("timeout", {
+         alias: "t",
+         type: "number",
+         description: "The timeout for each request in milliseconds",
+         default: 5000,
+         coerce: (timeout) => {
+           if (timeout < 0) {
+             throw new Error("Timeout must not be negative");
+           }
+           if (timeout > 25_000) {
+             throw new Error("Timeout must not exceed 25,000 ms");
+           }
+           return timeout;
+         },
+       })
+       .option("retries", {
+         alias: "r",
+         type: "number",
+         description: "The number of retries for each request",
+         default: 3,
+         coerce: (retries) => {
+           if (retries < 0) {
+             throw new Error("Retries must not be negative");
+           }
+           if (retries > 10) {
+             throw new Error("Retries must not exceed 10");
+           }
+           return retries;
+         },
+       });
+   },
+   handler: async (args) => {
+     try {
+       const argData = args as ArgumentsCamelCase<SpiderScannerCLIOptions>;
+
+       const scanner = new SpiderScanner(argData.url, {
+         depth: argData.depth ?? 250,
+         concurrency: argData.concurrency ?? 10,
+         timeout: argData.timeout ?? 5000,
+         retries: argData.retries ?? 3,
+       });
+
+       cliLogger.info("Starting to crawl website");
+       const results = await scanner.crawl();
+
+       // The output option has a default, so the else branch is a fallback
+       if (argData.output) {
+         fs.writeFileSync(argData.output, JSON.stringify(results, null, 2));
+         cliLogger.info(`Results written to ${argData.output}`);
+       } else {
+         const resolvedPath = getDefaultFilePath();
+         fs.writeFileSync(resolvedPath, JSON.stringify(results, null, 2));
+         cliLogger.info(`Results written to ${resolvedPath}`);
+       }
+     } catch (error) {
+       if (error instanceof Error) {
+         cliLogger.error(error.message);
+       }
+       cliLogger.error("Failed to run spider command");
+       process.exit(1);
+     }
+   },
+ };
+
+ const getDefaultFilePath = () => {
+   try {
+     // Create the output directory if it does not already exist
+     const resolvedDir = path.resolve("sentinel_output");
+     if (!fs.existsSync(resolvedDir)) {
+       fs.mkdirSync(resolvedDir);
+     }
+
+     const resolvedPath = path.resolve(
+       `sentinel_output/spider_${Date.now()}.json`,
+     );
+     if (fs.existsSync(resolvedPath)) {
+       throw new Error("Output file already exists");
+     }
+     const parsedPath = path.parse(resolvedPath);
+     if (parsedPath.ext !== ".json") {
+       throw new Error("Output file must be a JSON file");
+     }
+
+     return resolvedPath;
+   } catch {
+     throw new Error("Invalid output file");
+   }
+ };
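This diff does not include the entry point that registers spiderCommand. The following is a minimal sketch of how a yargs-based entry file might wire it up; the `./commands/spider` import path and the entry file itself are assumptions, not shown in this release:

```ts
// Hypothetical entry point — not part of this diff. Assumes the command
// module above lives at src/commands/spider.ts.
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { spiderCommand } from "./commands/spider";

yargs(hideBin(process.argv))
  .scriptName("sentinel-scanner")
  .command(spiderCommand)
  .demandCommand(1, "Specify a command to run")
  .strict()
  .help()
  .parse();
```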
package/src/index.ts CHANGED
@@ -1,27 +1,3 @@
- #!/usr/bin/env node
+ import SpiderScanner, { type SpiderScannerOptions } from "./modules/spider";
 
- import { Command } from "commander";
- // @ts-ignore: For TypeScript compatibility when importing JSON files
- import packageData from "../package.json";
-
- // Create a new Command object
- const program = new Command();
-
- // Set version, name, and description from the package.json
- program
-   .version(packageData.version)
-   .name(packageData.name)
-   .description(packageData.description);
-
- // Add a help command explicitly if needed
- program.helpOption("-h, --help", "Display help for command");
-
- // Parse command-line arguments
- program.parse(process.argv);
-
- const options = program.opts();
-
- // If no arguments are provided, display help
- if (Object.keys(options).length === 0) {
-   program.help();
- }
+ export { SpiderScanner, type SpiderScannerOptions };
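With commander removed, the root module is now a library surface rather than a CLI bootstrap, so the scanner can be consumed programmatically. A minimal sketch, assuming the published build exposes src/index.ts as the package entry point:

```ts
// Sketch only; the package-name import assumes the default entry point.
import SpiderScanner, { type SpiderScannerOptions } from "sentinel-scanner";

const options: SpiderScannerOptions = { depth: 50, concurrency: 5 };
const scanner = new SpiderScanner("https://example.com", options);

// crawl() resolves with every internal URL the spider visited
scanner.crawl().then((urls) => {
  console.log(`Found ${urls.length} internal URLs`);
});
```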
package/src/modules/spider/index.ts ADDED
@@ -0,0 +1,212 @@
+ import fetch from "isomorphic-fetch";
+ import jsdom from "jsdom";
+ import UserAgent from "user-agents";
+ import { createLogger } from "../../utils";
+
+ export interface SpiderScannerOptions {
+   depth?: number;
+   concurrency?: number;
+   retries?: number;
+   timeout?: number;
+ }
+
+ export default class SpiderScanner {
+   private header: Record<string, string> = {
+     "User-Agent": new UserAgent().toString(),
+   };
+   private url: URL;
+   private logger = createLogger("SpiderScanner");
+
+   private depth: number;
+   private concurrency: number;
+   private retries: number;
+   private timeout: number;
+
+   constructor(url: string, options: SpiderScannerOptions = {}) {
+     const {
+       depth = 250,
+       concurrency = 5,
+       retries = 3,
+       timeout = 5000,
+     } = options;
+     this.depth = depth;
+     this.concurrency = concurrency;
+     this.retries = retries;
+     this.timeout = timeout;
+
+     try {
+       this.url = new URL(url);
+       this.logger.info(
+         `Initialized with URL: ${url}, User-Agent: ${this.header["User-Agent"]}`,
+       );
+     } catch (error) {
+       if (error instanceof TypeError) {
+         this.logger.error("Invalid URL");
+         throw new Error("Invalid URL");
+       }
+       this.logger.error(`Unexpected error in constructor: ${error}`);
+       throw error;
+     }
+   }
+
+   private normalizeDomain(domain: string): string {
+     return domain.startsWith("www.") ? domain.slice(4) : domain;
+   }
+
+   // Note: relative hrefs are resolved against the site root,
+   // not the page they were found on
+   private convertRelativeUrlToAbsolute(url: string): string {
+     return new URL(url, this.url.toString()).toString();
+   }
+
+   private isInternalLink(url: string): boolean {
+     try {
+       const parsedUrl = new URL(url, this.url.href);
+       if (!["http:", "https:"].includes(parsedUrl.protocol)) {
+         return false;
+       }
+       const baseDomain = this.normalizeDomain(this.url.hostname);
+       const parsedDomain = this.normalizeDomain(parsedUrl.hostname);
+       return parsedDomain === baseDomain;
+     } catch (error) {
+       this.logger.warn(`Error parsing URL: ${url} - ${error}`);
+       return false;
+     }
+   }
+
+   private async fetchWithRetries(
+     url: string,
+     retries: number,
+   ): Promise<string | null> {
+     for (let attempt = 1; attempt <= retries; attempt++) {
+       const controller = new AbortController();
+       const timeoutId = setTimeout(() => controller.abort(), this.timeout);
+
+       try {
+         this.logger.debug(`Fetching URL (Attempt ${attempt}): ${url}`);
+         const randomUserAgent = new UserAgent().toString();
+         this.logger.info(`Changing User-Agent to: ${randomUserAgent}`);
+         this.header["User-Agent"] = randomUserAgent;
+         const response = await fetch(url, {
+           headers: this.header,
+           signal: controller.signal,
+           redirect: "follow",
+         });
+
+         if (response.ok) {
+           this.logger.info(`Successfully fetched URL: ${url}`);
+           return await response.text();
+         }
+
+         this.logger.warn(`Failed to fetch URL (${response.status}): ${url}`);
+       } catch (error) {
+         if ((error as Error).name === "AbortError") {
+           this.logger.warn(`Fetch timed out: ${url}`);
+         } else {
+           this.logger.error(`Error fetching URL: ${url} - ${error}`);
+         }
+       } finally {
+         // Clear the timer on every path, including thrown errors,
+         // so failed attempts do not leak pending timeouts
+         clearTimeout(timeoutId);
+       }
+     }
+     return null;
+   }
+
+   private extractLinks(html: string): string[] {
+     const { JSDOM } = jsdom;
+     const dom = new JSDOM(html);
+     const links = Array.from(dom.window.document.querySelectorAll("a"));
+     const hrefs = links.map((link) => link.href);
+     const internalLinks = hrefs.filter((href) => this.isInternalLink(href));
+     this.logger.debug(
+       `Extracted ${internalLinks.length} internal links from HTML content`,
+     );
+     return internalLinks.map((link) => this.convertRelativeUrlToAbsolute(link));
+   }
+
+   public async crawl(): Promise<Array<string>> {
+     const visited = new Set<string>();
+     const queue = new Set<string>([this.url.href]);
+     const resultLinks = new Set<string>();
+
+     // Asset extensions to ignore when queueing links
+     const assetExtensions = [
+       ".css",
+       ".js",
+       ".png",
+       ".jpg",
+       ".jpeg",
+       ".gif",
+       ".svg",
+       ".ico",
+       ".webp",
+       ".mp4",
+       ".mp3",
+       ".wav",
+       ".avi",
+       ".mov",
+       ".webm",
+       ".pdf",
+       ".doc",
+       ".docx",
+       ".xls",
+       ".xlsx",
+       ".ppt",
+       ".pptx",
+       ".zip",
+       ".rar",
+       ".tar",
+       ".gz",
+     ];
+
+     const fetchAndExtract = async (currentUrl: string) => {
+       if (visited.has(currentUrl)) {
+         this.logger.debug(`Skipping already visited URL: ${currentUrl}`);
+         return;
+       }
+       visited.add(currentUrl);
+       this.logger.info(`Visiting URL: ${currentUrl}`);
+
+       const html = await this.fetchWithRetries(currentUrl, this.retries);
+       if (!html) return;
+
+       const links = this.extractLinks(html);
+
+       // Skip asset links; queue the rest, bounded by the depth limit
+       for (const link of links) {
+         if (assetExtensions.some((ext) => link.endsWith(ext))) {
+           this.logger.debug(`Ignoring asset link: ${link}`);
+           continue;
+         }
+         if (!visited.has(link) && queue.size < this.depth) {
+           queue.add(link);
+           this.logger.debug(`Added to queue: ${link}`);
+         }
+       }
+       resultLinks.add(currentUrl);
+     };
+
+     const processBatch = async () => {
+       const batch = Array.from(queue).slice(0, this.concurrency);
+       for (const url of batch) {
+         queue.delete(url);
+       }
+       await Promise.allSettled(batch.map((url) => fetchAndExtract(url)));
+     };
+
+     this.logger.info(
+       `Starting crawl with depth: ${this.depth}, concurrency: ${this.concurrency}`,
+     );
+     while (queue.size > 0 && visited.size < this.depth) {
+       await processBatch();
+     }
+
+     this.logger.info(
+       `Crawling completed. Total pages visited: ${resultLinks.size}`,
+     );
+
+     return Array.from(resultLinks);
+   }
+ }
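fetchWithRetries combines an AbortController, a per-attempt setTimeout, and a bounded retry loop. The same pattern in isolation, as a sketch with illustrative names (fetchWithTimeout is not part of the package) using Node 18+'s built-in fetch:

```ts
// Standalone sketch of the per-attempt timeout-and-retry pattern used above.
async function fetchWithTimeout(
  url: string,
  timeoutMs: number,
  maxRetries: number,
): Promise<string | null> {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
      const res = await fetch(url, { signal: controller.signal });
      if (res.ok) return await res.text();
      // Non-2xx response: fall through and retry
    } catch {
      // AbortError (timed out) or network failure: retry
    } finally {
      clearTimeout(timer); // clear on every path so timers do not leak
    }
  }
  return null; // all attempts exhausted
}
```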
@@ -0,0 +1,29 @@
+ import winston from "winston";
+
+ export const createLogger = (label: string) =>
+   winston.createLogger({
+     levels: {
+       error: 0,
+       warn: 1,
+       info: 2,
+       http: 3,
+       verbose: 4,
+       debug: 5,
+       silly: 6,
+     },
+     format: winston.format.combine(
+       winston.format.label({ label }),
+       winston.format.colorize(),
+       winston.format.timestamp({
+         format: () => new Date().toLocaleString("en-US"),
+       }),
+       winston.format.align(),
+       winston.format.printf(
+         (info) =>
+           `\x1b[34m(${info.label})\x1b[0m \x1b[33m${info.timestamp}\x1b[0m [${info.level}]: ${info.message}`,
+       ),
+     ),
+     transports: [new winston.transports.Console()],
+   });
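A usage sketch for the helper. One caveat worth noting: createLogger defines a custom levels map but never sets level, and winston defaults that threshold to "info", so the debug calls throughout SpiderScanner are filtered out unless a level is configured.

```ts
// Sketch only; the "./utils" path is inferred from the imports above.
import { createLogger } from "./utils";

const logger = createLogger("Example");
logger.info("printed: at winston's default 'info' threshold");
logger.debug("suppressed: below the default threshold, despite the custom levels map");
```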
package/tsconfig.json CHANGED
@@ -2,13 +2,13 @@
2
2
  "include": ["./src/**/*.ts"],
3
3
  "compilerOptions": {
4
4
  "lib": ["es2023"],
5
- "module": "nodenext",
5
+ "module": "CommonJS",
6
6
  "target": "es2022",
7
- "moduleResolution": "nodenext",
7
+ "moduleResolution": "node",
8
+ "allowSyntheticDefaultImports": true,
8
9
 
9
10
  "rootDir": "./src",
10
11
  "outDir": "build",
11
- "resolvePackageJsonImports": true,
12
12
 
13
13
  "strict": true,
14
14
  "noUncheckedIndexedAccess": true,
@@ -18,7 +18,7 @@
18
18
  "forceConsistentCasingInFileNames": true,
19
19
  "declaration": true,
20
20
  "resolveJsonModule": true,
21
- "emitDeclarationOnly": true,
22
- "allowImportingTsExtensions": true
23
- }
21
+ "allowImportingTsExtensions": false
22
+ },
23
+ "exclude": ["src/__tests__/**/*"]
24
24
  }