sentinel-scanner 1.0.1 → 1.1.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cspell.json +20 -3
- package/.github/workflows/pr.yaml +86 -0
- package/.github/workflows/welcome.yaml +66 -0
- package/CHANGELOG.md +3 -3
- package/DISCLAIMER.md +64 -0
- package/LICENSE +2 -2
- package/README.md +20 -1
- package/build/bin.js +376 -0
- package/build/bin.js.map +7 -0
- package/build/index.d.ts +24 -0
- package/build/index.js +191 -3099
- package/build/index.js.map +4 -4
- package/package.json +19 -9
- package/scripts/build.ts +5 -2
- package/src/bin.ts +20 -0
- package/src/commands/spider.ts +193 -0
- package/src/index.ts +2 -26
- package/src/modules/spider/index.ts +212 -0
- package/src/utils/index.ts +29 -0
- package/tsconfig.json +6 -6
- package/src/__tests__/index.test.ts +0 -0
package/src/commands/spider.ts
ADDED
@@ -0,0 +1,193 @@
+import fs from "node:fs";
+import path from "node:path";
+import type { ArgumentsCamelCase, CommandModule } from "yargs";
+import SpiderScanner from "../modules/spider";
+import { createLogger } from "../utils";
+
+export type SpiderScannerCLIOptions = {
+  url: string;
+  depth?: number;
+  output?: string;
+  concurrency?: number;
+  timeout?: number;
+  retries?: number;
+};
+
+const cliLogger = createLogger("CLI");
+
+export const spiderCommand: CommandModule = {
+  command: "spider",
+  describe:
+    "Crawl a website and get an array of URLs which are internal to the website",
+  builder: (yargs) => {
+    return yargs
+      .option("url", {
+        alias: "u",
+        type: "string",
+        description: "The URL of the website to scan",
+        demandOption: true,
+        coerce: (url) => {
+          try {
+            new URL(url);
+
+            return url;
+          } catch (error) {
+            throw new Error(`Invalid URL: ${url}`);
+          }
+        },
+      })
+      .option("depth", {
+        alias: "d",
+        type: "number",
+        description: "The maximum depth to crawl",
+        default: 250,
+        coerce: (depth) => {
+          if (depth < 0) {
+            throw new Error("Depth must be a positive number");
+          }
+
+          if (depth > 250) {
+            throw new Error("Depth must be less than 250");
+          }
+
+          return depth;
+        },
+      })
+      .option("output", {
+        alias: "o",
+        type: "string",
+        description:
+          "The output file to write the results to. Must be a JSON file",
+        coerce: (output) => {
+          try {
+            // Should throw an error if the path is invalid
+            // Should be a JSON file
+            const resolvedPath = path.resolve(output);
+            const parsedPath = path.parse(resolvedPath);
+
+            if (parsedPath.ext !== ".json") {
+              throw new Error("Output file must be a JSON file");
+            }
+
+            if (fs.existsSync(resolvedPath)) {
+              throw new Error("Output file already exists");
+            }
+
+            return resolvedPath;
+          } catch (error) {
+            throw new Error(`Invalid output file: ${output}`);
+          }
+        },
+        default: getDefaultFilePath(),
+      })
+      .option("concurrency", {
+        alias: "c",
+        type: "number",
+        description: "The number of concurrent requests to make",
+        default: 10,
+        coerce: (concurrency) => {
+          if (concurrency < 1) {
+            throw new Error("Concurrency must be a positive number");
+          }
+
+          if (concurrency > 20) {
+            throw new Error("Concurrency must be less than 20");
+          }
+
+          return concurrency;
+        },
+      })
+      .option("timeout", {
+        alias: "t",
+        type: "number",
+        description: "The timeout for each request in milliseconds",
+        default: 5000,
+        coerce: (timeout) => {
+          if (timeout < 0) {
+            throw new Error("Timeout must be a positive number");
+          }
+
+          if (timeout > 25_000) {
+            throw new Error("Timeout must be less than 25,000");
+          }
+
+          return timeout;
+        },
+      })
+      .option("retries", {
+        alias: "r",
+        type: "number",
+        description: "The number of retries for each request",
+        default: 3,
+        coerce: (retries) => {
+          if (retries < 0) {
+            throw new Error("Retries must be a positive number");
+          }
+
+          if (retries > 10) {
+            throw new Error("Retries must be less than 10");
+          }
+
+          return retries;
+        },
+      });
+  },
+  handler: async (args) => {
+    try {
+      const argData = args as ArgumentsCamelCase<SpiderScannerCLIOptions>;
+
+      const scanner = new SpiderScanner(argData.url, {
+        depth: argData.depth ?? 250,
+        concurrency: argData.concurrency ?? 10,
+        timeout: argData.timeout ?? 5000,
+        retries: argData.retries ?? 3,
+      });
+
+      cliLogger.info("Starting to crawl website");
+
+      const results = await scanner.crawl();
+
+      if (argData.output) {
+        fs.writeFileSync(argData.output, JSON.stringify(results, null, 2));
+        cliLogger.info(`Results written to ${argData.output}`);
+      } else {
+        const resolvedPath = getDefaultFilePath();
+        fs.writeFileSync(resolvedPath, JSON.stringify(results, null, 2));
+        cliLogger.info(`Results written to ${resolvedPath}`);
+      }
+    } catch (error) {
+      if (error instanceof Error) {
+        cliLogger.error(error.message);
+      }
+      cliLogger.error("Failed to run spider command");
+      process.exit(1);
+    }
+  },
+};
+
+const getDefaultFilePath = () => {
+  try {
+    const resolvedDir = path.resolve("sentinel_output");
+    // Check if the directory exists
+    if (!fs.existsSync(resolvedDir)) {
+      fs.mkdirSync(resolvedDir);
+    }
+
+    const resolvedPath = path.resolve(
+      `sentinel_output/spider_${Date.now()}.json`,
+    );
+    // Check if the file exists
+    if (fs.existsSync(resolvedPath)) {
+      throw new Error("Output file already exists");
+    }
+    const parsedPath = path.parse(resolvedPath);
+
+    if (parsedPath.ext !== ".json") {
+      throw new Error("Output file must be a JSON file");
+    }
+
+    return resolvedPath;
+  } catch (error) {
+    throw new Error("Invalid output file");
+  }
+};
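The command module above only defines the `spider` subcommand; the new `src/bin.ts` (+20 lines, not expanded in this diff) presumably registers it with yargs. A minimal sketch of what such an entry point could look like, assuming the standard yargs builder API; the actual wiring in `src/bin.ts`/`build/bin.js` may differ:

```ts
#!/usr/bin/env node
// Hypothetical entry point; the real src/bin.ts is not shown in this diff.
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { spiderCommand } from "./commands/spider";

yargs(hideBin(process.argv))
  .scriptName("sentinel-scanner")
  .command(spiderCommand) // registers `sentinel-scanner spider ...`
  .demandCommand(1, "Specify a command to run")
  .strict()
  .help()
  .parse();
```

With wiring along those lines, a crawl would be invoked as `sentinel-scanner spider -u https://example.com -d 50 -o results.json`.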
package/src/index.ts
CHANGED
@@ -1,27 +1,3 @@
-
+import SpiderScanner, { type SpiderScannerOptions } from "./modules/spider";
 
-
-// @ts-ignore: For TypeScript compatibility when importing JSON files
-import packageData from "../package.json";
-
-// Create a new Command object
-const program = new Command();
-
-// Set version, name, and description from the package.json
-program
-  .version(packageData.version)
-  .name(packageData.name)
-  .description(packageData.description);
-
-// Add a help command explicitly if needed
-program.helpOption("-h, --help", "Display help for command");
-
-// Parse command-line arguments
-program.parse(process.argv);
-
-const options = program.opts();
-
-// If no arguments are provided, display help
-if (Object.keys(options).length === 0) {
-  program.help();
-}
+export { SpiderScanner, type SpiderScannerOptions };
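The entry point thus changes from a Commander-based CLI to a plain library export. Programmatic consumers can now import the scanner directly; a sketch, assuming the published package's `main` entry re-exports these names as shown:

```ts
import { SpiderScanner, type SpiderScannerOptions } from "sentinel-scanner";

const options: SpiderScannerOptions = { depth: 50, concurrency: 5, timeout: 5000 };
const scanner = new SpiderScanner("https://example.com", options);

scanner
  .crawl()
  .then((urls) => console.log(`Found ${urls.length} internal URLs`, urls));
```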
package/src/modules/spider/index.ts
ADDED
@@ -0,0 +1,212 @@
+import fetch from "isomorphic-fetch";
+import jsdom from "jsdom";
+import UserAgent from "user-agents";
+import { createLogger } from "../../utils";
+
+export interface SpiderScannerOptions {
+  depth?: number;
+  concurrency?: number;
+  retries?: number;
+  timeout?: number;
+}
+
+export default class SpiderScanner {
+  private header: Record<string, string> = {
+    "User-Agent": new UserAgent().toString(),
+  };
+  private url: URL;
+  private logger = createLogger("SpiderScanner");
+
+  private depth: number;
+  private concurrency: number;
+  private retries: number;
+  private timeout: number;
+
+  constructor(url: string, options: SpiderScannerOptions = {}) {
+    const {
+      depth = 250,
+      concurrency = 5,
+      retries = 3,
+      timeout = 5000,
+    } = options;
+    this.depth = depth;
+    this.concurrency = concurrency;
+    this.retries = retries;
+    this.timeout = timeout;
+
+    try {
+      this.url = new URL(url);
+      this.logger.info(
+        `Initialized with URL: ${url}, User-Agent: ${this.header["User-Agent"]}`,
+      );
+    } catch (error) {
+      if (error instanceof TypeError) {
+        this.logger.error("Invalid URL");
+        throw new Error("Invalid URL");
+      }
+      this.logger.error(`Unexpected error in constructor: ${error}`);
+      throw error;
+    }
+  }
+
+  private normalizeDomain(domain: string): string {
+    return domain.startsWith("www.") ? domain.slice(4) : domain;
+  }
+
+  private convertRelativeUrlToAbsolute(url: string): string {
+    return new URL(url, this.url.toString()).toString();
+  }
+
+  private isInternalLink(url: string): boolean {
+    try {
+      const parsedUrl = new URL(url, this.url.href);
+      if (!["http:", "https:"].includes(parsedUrl.protocol)) {
+        return false;
+      }
+      const baseDomain = this.normalizeDomain(this.url.hostname);
+      const parsedDomain = this.normalizeDomain(parsedUrl.hostname);
+      return parsedDomain === baseDomain;
+    } catch (error) {
+      this.logger.warn(`Error parsing URL: ${url} - ${error}`);
+      return false;
+    }
+  }
+
+  private async fetchWithRetries(
+    url: string,
+    retries: number,
+  ): Promise<string | null> {
+    for (let attempt = 1; attempt <= retries; attempt++) {
+      const controller = new AbortController();
+      const timeoutId = setTimeout(() => controller.abort(), this.timeout);
+
+      try {
+        this.logger.debug(`Fetching URL (Attempt ${attempt}): ${url}`);
+        const randomUserAgent = new UserAgent().toString();
+        this.logger.info(`Changing User-Agent to: ${randomUserAgent}`);
+        this.header["User-Agent"] = randomUserAgent;
+        const response = await fetch(url, {
+          headers: this.header,
+          signal: controller.signal,
+          redirect: "follow",
+        });
+
+        clearTimeout(timeoutId);
+
+        if (response.ok) {
+          this.logger.info(`Successfully fetched URL: ${url}`);
+          return await response.text();
+        }
+
+        this.logger.warn(`Failed to fetch URL (${response.status}): ${url}`);
+      } catch (error) {
+        if ((error as Error).name === "AbortError") {
+          this.logger.warn(`Fetch timed out: ${url}`);
+        } else {
+          this.logger.error(`Error fetching URL: ${url} - ${error}`);
+        }
+      }
+    }
+    return null;
+  }
+
+  private extractLinks(html: string): string[] {
+    const { JSDOM } = jsdom;
+    const dom = new JSDOM(html);
+    const links = Array.from(dom.window.document.querySelectorAll("a"));
+    const hrefs = links.map((link) => link.href);
+    const internalLinks = hrefs.filter((href) => this.isInternalLink(href));
+    this.logger.debug(
+      `Extracted ${internalLinks.length} internal links from HTML content`,
+    );
+    return internalLinks.map((link) => this.convertRelativeUrlToAbsolute(link));
+  }
+
+  public async crawl(): Promise<Array<string>> {
+    const visited = new Set<string>();
+    const queue = new Set<string>([this.url.href]);
+    const resultLinks = new Set<string>();
+
+    // Assets to ignore
+    const assetExtensions = [
+      ".css",
+      ".js",
+      ".png",
+      ".jpg",
+      ".jpeg",
+      ".gif",
+      ".svg",
+      ".ico",
+      ".webp",
+      ".mp4",
+      ".mp3",
+      ".wav",
+      ".avi",
+      ".mov",
+      ".webm",
+      ".pdf",
+      ".doc",
+      ".docx",
+      ".xls",
+      ".xlsx",
+      ".ppt",
+      ".pptx",
+      ".zip",
+      ".rar",
+      ".tar",
+      ".gz",
+    ];
+
+    const fetchAndExtract = async (currentUrl: string) => {
+      if (visited.has(currentUrl)) {
+        this.logger.debug(`Skipping already visited URL: ${currentUrl}`);
+        return;
+      }
+      visited.add(currentUrl);
+      this.logger.info(`Visiting URL: ${currentUrl}`);
+
+      const html = await this.fetchWithRetries(currentUrl, this.retries);
+      if (!html) return;
+
+      const links = this.extractLinks(html);
+
+      // Filter out asset links
+      for (const link of links) {
+        if (assetExtensions.some((ext) => link.endsWith(ext))) {
+          this.logger.debug(`Ignoring asset link: ${link}`);
+          continue;
+        }
+        this.logger.debug(`Found link: ${link}`);
+      }
+
+      for (const link of links) {
+        if (!visited.has(link) && queue.size < this.depth) {
+          queue.add(link);
+          this.logger.debug(`Added to queue: ${link}`);
+        }
+      }
+      resultLinks.add(currentUrl);
+    };
+
+    const processBatch = async () => {
+      const batch = Array.from(queue).slice(0, this.concurrency);
+      for (const url of batch) {
+        queue.delete(url);
+      }
+      await Promise.allSettled(batch.map((url) => fetchAndExtract(url)));
+    };
+
+    this.logger.info(
+      `Starting crawl with depth: ${this.depth}, concurrency: ${this.concurrency}`,
+    );
+    while (queue.size > 0 && visited.size < this.depth) {
+      await processBatch();
+    }
+
+    this.logger.info(
+      `Crawling completed. Total pages visited: ${resultLinks.size}`,
+    );
+
+    return Array.from(resultLinks);
+  }
+}
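Note that `depth` here caps the total number of URLs visited (via `visited.size < this.depth` and `queue.size < this.depth`), not the link distance from the start page. The crawl itself is a batched breadth-style traversal; stripped of the crawling specifics, the pattern looks like this (an illustration only, not package code):

```ts
// Drain a work queue in fixed-size parallel batches, as crawl() does.
// Promise.allSettled ensures one failed item does not abort its batch.
async function drainInBatches<T>(
  queue: Set<T>,
  concurrency: number,
  worker: (item: T) => Promise<void>,
): Promise<void> {
  while (queue.size > 0) {
    const batch = Array.from(queue).slice(0, concurrency);
    for (const item of batch) queue.delete(item);
    await Promise.allSettled(batch.map(worker));
  }
}
```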
package/src/utils/index.ts
ADDED
@@ -0,0 +1,29 @@
+import winston from "winston";
+
+export const createLogger = (label: string) =>
+  winston.createLogger({
+    levels: {
+      error: 0,
+      warn: 1,
+      info: 2,
+      http: 3,
+      verbose: 4,
+      debug: 5,
+      silly: 6,
+    },
+    format: winston.format.combine(
+      winston.format.label({ label }),
+      winston.format.colorize(),
+      winston.format.timestamp({
+        format: () => {
+          return new Date().toLocaleString("en-US");
+        },
+      }),
+      winston.format.align(),
+      winston.format.printf(
+        (info) =>
+          `\x1b[34m(${info.label})\x1b[0m \x1b[33m${info.timestamp}\x1b[0m [${info.level}]: ${info.message}`,
+      ),
+    ),
+    transports: [new winston.transports.Console()],
+  });
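Each module gets its own labelled console logger. A usage sketch; the output shown is approximate, since `toLocaleString("en-US")` is timezone-dependent and `winston.format.align()` inserts a tab before the message:

```ts
const logger = createLogger("Example");

logger.info("Starting scan");
// (Example) 11/2/2024, 10:15:03 AM [info]: 	Starting scan
logger.error("Something failed");
// (Example) 11/2/2024, 10:15:04 AM [error]: 	Something failed
```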
package/tsconfig.json
CHANGED
@@ -2,13 +2,13 @@
   "include": ["./src/**/*.ts"],
   "compilerOptions": {
     "lib": ["es2023"],
-    "module": "
+    "module": "CommonJS",
     "target": "es2022",
-    "moduleResolution": "
+    "moduleResolution": "node",
+    "allowSyntheticDefaultImports": true,
 
     "rootDir": "./src",
     "outDir": "build",
-    "resolvePackageJsonImports": true,
 
     "strict": true,
     "noUncheckedIndexedAccess": true,
@@ -18,7 +18,7 @@
     "forceConsistentCasingInFileNames": true,
     "declaration": true,
     "resolveJsonModule": true,
-    "
-
-
+    "allowImportingTsExtensions": false
+  },
+  "exclude": ["src/__tests__/**/*"]
 }
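(The removed values for `"module"` and `"moduleResolution"` are truncated in the published diff.) The switch to `"module": "CommonJS"` with `"moduleResolution": "node"` and `"allowSyntheticDefaultImports": true` is presumably what lets the new sources use default-style imports of CommonJS-only dependencies, as a sketch:

```ts
// With allowSyntheticDefaultImports, this default import of a
// CommonJS-only package type-checks:
import fetch from "isomorphic-fetch";

// Without the flag, a namespace import would be required instead:
// import * as fetch from "isomorphic-fetch";
```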
package/src/__tests__/index.test.ts
File without changes