sentinel-scanner 1.0.1 → 1.1.0-alpha.2
- package/.cspell.json +20 -3
- package/.github/workflows/pr.yaml +86 -0
- package/.github/workflows/welcome.yaml +66 -0
- package/CHANGELOG.md +3 -3
- package/DISCLAIMER.md +64 -0
- package/LICENSE +2 -2
- package/README.md +20 -1
- package/build/bin.js +376 -0
- package/build/bin.js.map +7 -0
- package/build/index.d.ts +24 -0
- package/build/index.js +191 -3099
- package/build/index.js.map +4 -4
- package/package.json +19 -9
- package/scripts/build.ts +5 -2
- package/src/bin.ts +20 -0
- package/src/commands/spider.ts +193 -0
- package/src/index.ts +2 -26
- package/src/modules/spider/index.ts +212 -0
- package/src/utils/index.ts +29 -0
- package/tsconfig.json +6 -6
- package/src/__tests__/index.test.ts +0 -0
package/src/commands/spider.ts
ADDED
@@ -0,0 +1,193 @@
import fs from "node:fs";
import path from "node:path";
import type { ArgumentsCamelCase, CommandModule } from "yargs";
import SpiderScanner from "../modules/spider";
import { createLogger } from "../utils";

export type SpiderScannerCLIOptions = {
	url: string;
	depth?: number;
	output?: string;
	concurrency?: number;
	timeout?: number;
	retries?: number;
};

const cliLogger = createLogger("CLI");

export const spiderCommand: CommandModule = {
	command: "spider",
	describe:
		"Crawl a website and get an array of URLs which are internal to the website",
	builder: (yargs) => {
		return yargs
			.option("url", {
				alias: "u",
				type: "string",
				description: "The URL of the website to scan",
				demandOption: true,
				coerce: (url) => {
					try {
						new URL(url);

						return url;
					} catch (error) {
						throw new Error(`Invalid URL: ${url}`);
					}
				},
			})
			.option("depth", {
				alias: "d",
				type: "number",
				description: "The maximum depth to crawl",
				default: 250,
				coerce: (depth) => {
					if (depth < 0) {
						throw new Error("Depth must be a positive number");
					}

					if (depth > 250) {
						throw new Error("Depth must be less than 250");
					}

					return depth;
				},
			})
			.option("output", {
				alias: "o",
				type: "string",
				description:
					"The output file to write the results to. Must be a JSON file",
				coerce: (output) => {
					try {
						// Should throw an error if the path is invalid
						// Should Be A JSON File
						const resolvedPath = path.resolve(output);
						const parsedPath = path.parse(resolvedPath);

						if (parsedPath.ext !== ".json") {
							throw new Error("Output file must be a JSON file");
						}

						if (fs.existsSync(resolvedPath)) {
							throw new Error("Output file already exists");
						}

						return resolvedPath;
					} catch (error) {
						throw new Error(`Invalid output file: ${output}`);
					}
				},
				default: getDefaultFilePath(),
			})
			.option("concurrency", {
				alias: "c",
				type: "number",
				description: "The number of concurrent requests to make",
				default: 10,
				coerce: (concurrency) => {
					if (concurrency < 1) {
						throw new Error("Concurrency must be a positive number");
					}

					if (concurrency > 20) {
						throw new Error("Concurrency must be less than 20");
					}

					return concurrency;
				},
			})
			.option("timeout", {
				alias: "t",
				type: "number",
				description: "The timeout for each request in milliseconds",
				default: 5000,
				coerce: (timeout) => {
					if (timeout < 0) {
						throw new Error("Timeout must be a positive number");
					}

					if (timeout > 25_000) {
						throw new Error("Timeout must be less than 25,000");
					}

					return timeout;
				},
			})
			.option("retries", {
				alias: "r",
				type: "number",
				description: "The number of retries for each request",
				default: 3,
				coerce: (retries) => {
					if (retries < 0) {
						throw new Error("Retries must be a positive number");
					}

					if (retries > 10) {
						throw new Error("Retries must be less than 10");
					}

					return retries;
				},
			});
	},
	handler: async (args) => {
		try {
			const argData = args as ArgumentsCamelCase<SpiderScannerCLIOptions>;

			const scanner = new SpiderScanner(argData.url, {
				depth: argData.depth ?? 250,
				concurrency: argData.concurrency ?? 10,
				timeout: argData.timeout ?? 5000,
				retries: argData.retries ?? 3,
			});

			cliLogger.info("Starting to crawl website");

			const results = await scanner.crawl();

			if (argData.output) {
				fs.writeFileSync(argData.output, JSON.stringify(results, null, 2));
				cliLogger.info(`Results written to ${argData.output}`);
			} else {
				const resolvedPath = getDefaultFilePath();
				fs.writeFileSync(resolvedPath, JSON.stringify(results, null, 2));
				cliLogger.info(`Results written to ${resolvedPath}`);
			}
		} catch (error) {
			if (error instanceof Error) {
				cliLogger.error(error.message);
			}
			cliLogger.error("Failed to run spider command");
			process.exit(1);
		}
	},
};

const getDefaultFilePath = () => {
	try {
		const resolvedDir = path.resolve("sentinel_output");
		// Check If Directory Exists
		if (!fs.existsSync(resolvedDir)) {
			fs.mkdirSync(resolvedDir);
		}

		const resolvedPath = path.resolve(
			`sentinel_output/spider_${Date.now()}.json`,
		);
		// Check If File Exists
		if (fs.existsSync(resolvedPath)) {
			throw new Error("Output file already exists");
		}
		const parsedPath = path.parse(resolvedPath);

		if (parsedPath.ext !== ".json") {
			throw new Error("Output file must be a JSON file");
		}

		return resolvedPath;
	} catch (error) {
		throw new Error("Invalid output file");
	}
};
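Note: the new src/bin.ts (+20 lines) and build/bin.js are not expanded in this view, but the CommandModule above implies a small yargs entry point that registers it. A minimal sketch, assuming yargs/helpers and the "sentinel-scanner" script name (not the actual file contents):

#!/usr/bin/env node
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { spiderCommand } from "./commands/spider";

// Register the spider command so it can be invoked as, e.g.:
//   sentinel-scanner spider --url https://example.com --depth 50 --output ./scan.json
yargs(hideBin(process.argv))
	.scriptName("sentinel-scanner")
	.command(spiderCommand)
	.demandCommand(1)
	.strict()
	.help()
	.parse();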
package/src/index.ts
CHANGED
@@ -1,27 +1,3 @@
-import { Command } from "commander";
+import SpiderScanner, { type SpiderScannerOptions } from "./modules/spider";
 
-
-// @ts-ignore: For TypeScript compatibility when importing JSON files
-import packageData from "../package.json";
-
-// Create a new Command object
-const program = new Command();
-
-// Set version, name, and description from the package.json
-program
-	.version(packageData.version)
-	.name(packageData.name)
-	.description(packageData.description);
-
-// Add a help command explicitly if needed
-program.helpOption("-h, --help", "Display help for command");
-
-// Parse command-line arguments
-program.parse(process.argv);
-
-const options = program.opts();
-
-// If no arguments are provided, display help
-if (Object.keys(options).length === 0) {
-	program.help();
-}
+export { SpiderScanner, type SpiderScannerOptions };
package/src/modules/spider/index.ts
ADDED
@@ -0,0 +1,212 @@
import fetch from "isomorphic-fetch";
import jsdom from "jsdom";
import UserAgent from "user-agents";
import { createLogger } from "../../utils";

export interface SpiderScannerOptions {
	depth?: number;
	concurrency?: number;
	retries?: number;
	timeout?: number;
}

export default class SpiderScanner {
	private header: Record<string, string> = {
		"User-Agent": new UserAgent().toString(),
	};
	private url: URL;
	private logger = createLogger("SpiderScanner");

	private depth: number;
	private concurrency: number;
	private retries: number;
	private timeout: number;

	constructor(url: string, options: SpiderScannerOptions = {}) {
		const {
			depth = 250,
			concurrency = 5,
			retries = 3,
			timeout = 5000,
		} = options;
		this.depth = depth;
		this.concurrency = concurrency;
		this.retries = retries;
		this.timeout = timeout;

		try {
			this.url = new URL(url);
			this.logger.info(
				`Initialized with URL: ${url}, User-Agent: ${this.header["User-Agent"]}`,
			);
		} catch (error) {
			if (error instanceof TypeError) {
				this.logger.error("Invalid URL");
				throw new Error("Invalid URL");
			}
			this.logger.error(`Unexpected error in constructor: ${error}`);
			throw error;
		}
	}

	private normalizeDomain(domain: string): string {
		return domain.startsWith("www.") ? domain.slice(4) : domain;
	}

	private convertRelativeUrlToAbsolute(url: string): string {
		return new URL(url, this.url.toString()).toString();
	}

	private isInternalLink(url: string): boolean {
		try {
			const parsedUrl = new URL(url, this.url.href);
			if (!["http:", "https:"].includes(parsedUrl.protocol)) {
				return false;
			}
			const baseDomain = this.normalizeDomain(this.url.hostname);
			const parsedDomain = this.normalizeDomain(parsedUrl.hostname);
			return parsedDomain === baseDomain;
		} catch (error) {
			this.logger.warn(`Error parsing URL: ${url} - ${error}`);
			return false;
		}
	}

	private async fetchWithRetries(
		url: string,
		retries: number,
	): Promise<string | null> {
		for (let attempt = 1; attempt <= retries; attempt++) {
			const controller = new AbortController();
			const timeoutId = setTimeout(() => controller.abort(), this.timeout);

			try {
				this.logger.debug(`Fetching URL (Attempt ${attempt}): ${url}`);
				const randomUserAgent = new UserAgent().toString();
				this.logger.info(`Changing User-Agent to: ${randomUserAgent}`);
				this.header["User-Agent"] = randomUserAgent;
				const response = await fetch(url, {
					headers: this.header,
					signal: controller.signal,
					redirect: "follow",
				});

				clearTimeout(timeoutId);

				if (response.ok) {
					this.logger.info(`Successfully fetched URL: ${url}`);
					return await response.text();
				}

				this.logger.warn(`Failed to fetch URL (${response.status}): ${url}`);
			} catch (error) {
				if ((error as Error).name === "AbortError") {
					this.logger.warn(`Fetch timed out: ${url}`);
				} else {
					this.logger.error(`Error fetching URL: ${url} - ${error}`);
				}
			}
		}
		return null;
	}

	private extractLinks(html: string): string[] {
		const { JSDOM } = jsdom;
		const dom = new JSDOM(html);
		const links = Array.from(dom.window.document.querySelectorAll("a"));
		const hrefs = links.map((link) => link.href);
		const internalLinks = hrefs.filter((href) => this.isInternalLink(href));
		this.logger.debug(
			`Extracted ${internalLinks.length} internal links from HTML content`,
		);
		return internalLinks.map((link) => this.convertRelativeUrlToAbsolute(link));
	}

	public async crawl(): Promise<Array<string>> {
		const visited = new Set<string>();
		const queue = new Set<string>([this.url.href]);
		const resultLinks = new Set<string>();

		// Assets to ignore
		const assetExtensions = [
			".css",
			".js",
			".png",
			".jpg",
			".jpeg",
			".gif",
			".svg",
			".ico",
			".webp",
			".mp4",
			".mp3",
			".wav",
			".avi",
			".mov",
			".webm",
			".pdf",
			".doc",
			".docx",
			".xls",
			".xlsx",
			".ppt",
			".pptx",
			".zip",
			".rar",
			".tar",
			".gz",
		];

		const fetchAndExtract = async (currentUrl: string) => {
			if (visited.has(currentUrl)) {
				this.logger.debug(`Skipping already visited URL: ${currentUrl}`);
				return;
			}
			visited.add(currentUrl);
			this.logger.info(`Visiting URL: ${currentUrl}`);

			const html = await this.fetchWithRetries(currentUrl, this.retries);
			if (!html) return;

			const links = this.extractLinks(html);

			// Filter out asset links
			for (const link of links) {
				if (assetExtensions.some((ext) => link.endsWith(ext))) {
					this.logger.debug(`Ignoring asset link: ${link}`);
					continue;
				}
				this.logger.debug(`Found link: ${link}`);
			}

			for (const link of links) {
				if (!visited.has(link) && queue.size < this.depth) {
					queue.add(link);
					this.logger.debug(`Added to queue: ${link}`);
				}
			}
			resultLinks.add(currentUrl);
		};

		const processBatch = async () => {
			const batch = Array.from(queue).slice(0, this.concurrency);
			for (const url of batch) {
				queue.delete(url);
			}
			await Promise.allSettled(batch.map((url) => fetchAndExtract(url)));
		};

		this.logger.info(
			`Starting crawl with depth: ${this.depth}, concurrency: ${this.concurrency}`,
		);
		while (queue.size > 0 && visited.size < this.depth) {
			await processBatch();
		}

		this.logger.info(
			`Crawling completed. Total pages visited: ${resultLinks.size}`,
		);

		return Array.from(resultLinks);
	}
}
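With index.ts now re-exporting the class (see the CHANGED hunk above), the crawler can also be used programmatically. A minimal sketch, assuming the published package name "sentinel-scanner" and illustrative option values:

import { SpiderScanner } from "sentinel-scanner";

// Crawl a site and collect its internal URLs; options mirror SpiderScannerOptions above.
const scanner = new SpiderScanner("https://example.com", {
	depth: 50, // caps how many pages are queued/visited (acts as a page limit, not link depth)
	concurrency: 5, // pages fetched per batch
	retries: 3, // attempts per page before giving up
	timeout: 5000, // per-request timeout in ms
});

scanner.crawl().then((urls) => {
	console.log(`Found ${urls.length} internal URLs`);
});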
package/src/utils/index.ts
ADDED
@@ -0,0 +1,29 @@
import winston from "winston";

export const createLogger = (label: string) =>
	winston.createLogger({
		levels: {
			error: 0,
			warn: 1,
			info: 2,
			http: 3,
			verbose: 4,
			debug: 5,
			silly: 6,
		},
		format: winston.format.combine(
			winston.format.label({ label }),
			winston.format.colorize(),
			winston.format.timestamp({
				format: () => {
					return new Date().toLocaleString("en-US");
				},
			}),
			winston.format.align(),
			winston.format.printf(
				(info) =>
					`\x1b[34m(${info.label})\x1b[0m \x1b[33m${info.timestamp}\x1b[0m [${info.level}]: ${info.message}`,
			),
		),
		transports: [new winston.transports.Console()],
	});
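Both new modules obtain their loggers from this factory (createLogger("CLI") in the spider command, createLogger("SpiderScanner") in the crawler). A minimal usage sketch with a hypothetical label:

import { createLogger } from "./utils";

const logger = createLogger("Example");

// Prints something like: (Example) 11/1/2024, 10:00:00 AM [info]: Starting scan
logger.info("Starting scan");
logger.warn("Response was not OK");
// debug messages only appear if a transport is configured with a level below the default "info"
logger.debug("Queue drained");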
package/tsconfig.json
CHANGED
@@ -2,13 +2,13 @@
 	"include": ["./src/**/*.ts"],
 	"compilerOptions": {
 		"lib": ["es2023"],
-		"module": "
+		"module": "CommonJS",
 		"target": "es2022",
-		"moduleResolution": "
+		"moduleResolution": "node",
+		"allowSyntheticDefaultImports": true,
 
 		"rootDir": "./src",
 		"outDir": "build",
-		"resolvePackageJsonImports": true,
 
 		"strict": true,
 		"noUncheckedIndexedAccess": true,
@@ -18,7 +18,7 @@
 		"forceConsistentCasingInFileNames": true,
 		"declaration": true,
 		"resolveJsonModule": true,
-		"
-
-
+		"allowImportingTsExtensions": false
+	},
+	"exclude": ["src/__tests__/**/*"]
 }
package/src/__tests__/index.test.ts
File without changes