@akotliar/sitemap-qa 1.0.0-alpha.2 → 1.0.0-alpha.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/dist/index.cjs +0 -2398
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.cts +0 -1
package/dist/index.cjs
DELETED
|
@@ -1,2398 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
"use strict";
|
|
3
|
-
var __create = Object.create;
|
|
4
|
-
var __defProp = Object.defineProperty;
|
|
5
|
-
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
6
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
7
|
-
var __getProtoOf = Object.getPrototypeOf;
|
|
8
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
9
|
-
// Bundler interop helper: defines a live getter on `to` for every own property
// name of `from` that `to` does not already own, skipping the key named by
// `except`. Each getter mirrors the enumerability of the source property.
// The `desc` parameter is only used as scratch space for the descriptor.
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};
|
|
17
|
-
// Bundler interop helper: wraps a required module in a fresh object whose
// prototype matches the module's prototype and whose properties are live
// getters onto the module. `target` is used purely as a local.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};
|
|
25
|
-
|
|
26
|
-
// src/index.ts
|
|
27
|
-
var import_config2 = require("dotenv/config");
|
|
28
|
-
var import_commander2 = require("commander");
|
|
29
|
-
|
|
30
|
-
// src/commands/analyze.ts
|
|
31
|
-
var import_commander = require("commander");
|
|
32
|
-
var import_fs3 = require("fs");
|
|
33
|
-
var import_ora = __toESM(require("ora"), 1);
|
|
34
|
-
var import_chalk = __toESM(require("chalk"), 1);
|
|
35
|
-
var import_cli_progress = __toESM(require("cli-progress"), 1);
|
|
36
|
-
var import_os3 = __toESM(require("os"), 1);
|
|
37
|
-
|
|
38
|
-
// src/config/config-loader.ts
|
|
39
|
-
var import_promises = require("fs/promises");
|
|
40
|
-
var import_fs = require("fs");
|
|
41
|
-
var import_path = require("path");
|
|
42
|
-
var import_os = require("os");
|
|
43
|
-
|
|
44
|
-
// src/types/config.ts
|
|
45
|
-
// Baseline configuration values. Entries set to `undefined` are placeholders
// that the inline notes describe as auto-detected.
var DEFAULT_CONFIG = {
  timeout: 30,
  concurrency: 10,
  // Optimized for network-bound parallel parsing
  parsingConcurrency: 50,
  // Optimized for recursive sitemap index discovery
  discoveryConcurrency: 50,
  outputFormat: "html",
  outputDir: "./sitemap-qa/report",
  verbose: false,
  // Default for tests
  baseUrl: "https://example.com",
  acceptedPatterns: [],
  riskDetectionBatchSize: 10000,
  // Auto-detect in risk-detector.ts
  riskDetectionConcurrency: undefined,
  // Auto-detect TTY
  progressBar: undefined,
  silent: false,
  benchmark: false
};
|
|
66
|
-
|
|
67
|
-
// src/config/config-loader.ts
|
|
68
|
-
/**
 * Build the effective configuration by layering, in increasing priority:
 * DEFAULT_CONFIG, the global file (~/.sitemap-qa/config.json), the project
 * file (<cwd>/.sitemap-qa.config.json), environment values and CLI options.
 * The merged result is validated before it is returned.
 *
 * @param {object} [cliOptions] - Parsed CLI options; defaults to none.
 * @returns {Promise<object>} The merged configuration.
 * @throws Whatever validateConfig throws for an invalid merged configuration.
 */
async function loadConfig(cliOptions = {}) {
  const configFiles = [
    ["global", import_path.join(import_os.homedir(), ".sitemap-qa", "config.json")],
    ["project", import_path.join(process.cwd(), ".sitemap-qa.config.json")]
  ];
  let config = { ...DEFAULT_CONFIG };
  // Read sequentially so the project file always overrides the global one.
  for (const [label, filePath] of configFiles) {
    const fileConfig = await readOptionalJsonConfig(filePath, label);
    config = { ...config, ...fileConfig };
  }
  config = { ...config, ...loadFromEnv() };
  config = mergeCliOptions(config, cliOptions);
  if (cliOptions.baseUrl) {
    config.baseUrl = cliOptions.baseUrl;
  }
  validateConfig(config);
  return config;
}

// Reads one optional JSON config file. A missing file yields {} silently; any
// other failure (unreadable file, malformed JSON, non-object content) is
// reported with console.warn and also yields {}.
async function readOptionalJsonConfig(filePath, label) {
  try {
    const parsed = JSON.parse(await import_promises.readFile(filePath, "utf-8"));
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      // Spreading a string or array would inject index keys into the config.
      throw new Error("config file must contain a JSON object");
    }
    return parsed;
  } catch (error) {
    const isMissing = error?.code === "ENOENT" || error?.code === "ENOTDIR";
    if (!isMissing) {
      console.warn(`Warning: Failed to load ${label} config: ${error}`);
    }
    return {};
  }
}
|
|
97
|
-
/**
 * Collect configuration overrides from environment variables.
 * Only SITEMAP_VERIFY_TIMEOUT is read; when it is set to a non-empty value
 * it is parsed as a base-10 integer and returned as `timeout`.
 *
 * @returns {object} `{ timeout }` when the variable is set, otherwise `{}`.
 */
function loadFromEnv() {
  const rawTimeout = process.env.SITEMAP_VERIFY_TIMEOUT;
  return rawTimeout ? { timeout: Number.parseInt(rawTimeout, 10) } : {};
}
|
|
104
|
-
/**
 * Return a copy of `config` with the CLI options applied on top of it.
 *
 * - `timeout` is applied only when present and not the literal string "30";
 *   it is parsed as a base-10 integer.
 * - `output` maps to `outputFormat`; `outputDir` is copied as-is.
 * - `verbose` is applied only when it is exactly `true`.
 * - `acceptedPatterns` is a comma-separated string, split into trimmed,
 *   non-empty entries.
 *
 * @param {object} config - Configuration to start from (not mutated).
 * @param {object} cliOptions - Parsed CLI options.
 * @returns {object} A new merged configuration object.
 */
function mergeCliOptions(config, cliOptions) {
  const { timeout, output, outputDir, verbose, acceptedPatterns } = cliOptions;
  const merged = { ...config };
  const hasCustomTimeout = Boolean(timeout) && timeout !== "30";
  if (hasCustomTimeout) {
    merged.timeout = parseInt(timeout, 10);
  }
  if (output) {
    merged.outputFormat = output;
  }
  if (outputDir) {
    merged.outputDir = outputDir;
  }
  if (verbose === true) {
    merged.verbose = true;
  }
  if (acceptedPatterns) {
    const patterns = [];
    for (const piece of acceptedPatterns.split(",")) {
      const trimmed = piece.trim();
      if (trimmed) {
        patterns.push(trimmed);
      }
    }
    merged.acceptedPatterns = patterns;
  }
  return merged;
}
|
|
123
|
-
/**
 * Validate the merged configuration.
 *
 * @param {object} config - Configuration to check; reads `timeout` and `outputFormat`.
 * @throws {Error} When `timeout` is NaN or outside 1..300, or when
 *   `outputFormat` is neither "json" nor "html".
 */
function validateConfig(config) {
  const { timeout, outputFormat } = config;
  // NaN compares false against both bounds, so it must be rejected explicitly
  // (parseInt on a non-numeric environment or CLI value produces NaN).
  if (Number.isNaN(timeout) || timeout < 1 || timeout > 300) {
    throw new Error("Timeout must be between 1 and 300 seconds");
  }
  if (!["json", "html"].includes(outputFormat)) {
    throw new Error("Output format must be json or html");
  }
}
|
|
131
|
-
|
|
132
|
-
// src/errors/network-errors.ts
|
|
133
|
-
/**
 * Error for a request that failed before an HTTP status could be evaluated.
 * Carries the requested `url`, the underlying `originalError` (also exposed
 * as `cause`) and the stable discriminator `code === "NETWORK_ERROR"`.
 */
var NetworkError = class extends Error {
  code = "NETWORK_ERROR";
  /**
   * @param {string} url - URL that was being requested.
   * @param {*} originalError - The underlying failure; usually an Error, but
   *   any thrown value (string, null, ...) is tolerated.
   */
  constructor(url, originalError) {
    // Thrown values are not guaranteed to be Error instances; fall back to
    // their string form instead of crashing or printing "undefined".
    const detail = originalError?.message ?? String(originalError);
    super(`Network request failed for ${url}: ${detail}`, { cause: originalError });
    this.url = url;
    this.originalError = originalError;
    this.name = "NetworkError";
  }
};
|
|
142
|
-
/**
 * Error for a response whose HTTP status is treated as a failure.
 * Exposes `url`, `statusCode`, `statusText` and the discriminator
 * `code === "HTTP_ERROR"`. A 403 status appends an explanatory note to the
 * message.
 */
var HttpError = class extends Error {
  code = "HTTP_ERROR";
  constructor(url, statusCode, statusText) {
    const forbiddenNote = statusCode === 403
      ? "\n Note: 403 Forbidden often indicates bot protection (Cloudflare, etc.) or access restrictions"
      : "";
    super(`HTTP ${statusCode} error for ${url}` + forbiddenNote);
    Object.assign(this, { url, statusCode, statusText, name: "HttpError" });
  }
};
|
|
156
|
-
|
|
157
|
-
// src/utils/http-client.ts
|
|
158
|
-
var import_playwright = require("playwright");
|
|
159
|
-
var import_axios = __toESM(require("axios"), 1);
|
|
160
|
-
var import_http = require("http");
|
|
161
|
-
var import_https = require("https");
|
|
162
|
-
// Keep-alive agent used for plain HTTP requests.
var httpAgent = new import_http.Agent({
  timeout: 15000,
  keepAlive: true,
  // Allow many concurrent connections
  maxSockets: 200,
  maxFreeSockets: 50
});
// Keep-alive agent used for HTTPS requests; same limits as the HTTP agent.
var httpsAgent = new import_https.Agent({
  timeout: 15000,
  keepAlive: true,
  maxSockets: 200,
  maxFreeSockets: 50
});
// Shared axios client bound to the two agents above.
var axiosInstance = import_axios.default.create({
  httpAgent: httpAgent,
  httpsAgent: httpsAgent,
  maxRedirects: 5,
  // Don't throw on any status code
  validateStatus: () => true
});
|
|
182
|
-
/**
 * Fetch a URL through a headless Chromium instance launched via Playwright.
 *
 * @param {string} url - URL to navigate to.
 * @param {number} timeout - Timeout in seconds, applied to navigation and as
 *   the page default timeout.
 * @returns {Promise<{content: string, statusCode: number, url: string}>}
 *   The serialized page content, the response status and the final page URL,
 *   returned only for 2xx statuses.
 * @throws {HttpError} When the navigation response status is not 2xx.
 * @throws {NetworkError} For any other failure (launch, navigation, timeout).
 */
async function fetchUrlWithBrowser(url, timeout) {
  const timeoutMs = timeout * 1e3;
  let browser;
  try {
    browser = await import_playwright.chromium.launch({
      headless: true,
      args: [
        "--disable-blink-features=AutomationControlled",
        // Hide automation flags
        "--disable-dev-shm-usage",
        "--no-sandbox"
      ]
    });
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      viewport: { width: 1920, height: 1080 },
      locale: "en-US",
      timezoneId: "America/New_York",
      extraHTTPHeaders: {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
      }
    });
    const page = await context.newPage();
    // Runs inside the page before its scripts: overrides a few properties
    // (navigator.webdriver, window.chrome, permissions.query).
    await page.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", {
        get: () => false
      });
      window.chrome = {
        runtime: {}
      };
      const originalQuery = window.navigator.permissions.query;
      window.navigator.permissions.query = (parameters) => parameters.name === "notifications" ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters);
    });
    page.setDefaultTimeout(timeoutMs);
    const response = await page.goto(url, {
      waitUntil: "domcontentloaded",
      // Changed from networkidle - faster for simple XML
      timeout: timeoutMs
    });
    if (!response) {
      throw new Error("No response received from page");
    }
    const statusCode = response.status();
    // NOTE(review): page.content() returns the serialized DOM, not the raw
    // response body — confirm this is what XML consumers expect.
    const content = await page.content();
    const finalUrl = page.url();
    if (statusCode >= 200 && statusCode < 300) {
      return {
        content,
        statusCode,
        url: finalUrl
      };
    }
    throw new HttpError(finalUrl, statusCode);
  } catch (error) {
    if (error?.code === "HTTP_ERROR") {
      throw error;
    }
    throw new NetworkError(url, error);
  } finally {
    // Close exactly once on every path. Cleanup is best-effort: a failing
    // close() must neither mask the real error nor fail a successful fetch.
    if (browser) {
      try {
        await browser.close();
      } catch {
        // Intentionally ignored.
      }
    }
  }
}
|
|
250
|
-
/**
 * Fetch a URL with retries, exponential backoff and an optional headless
 * browser fallback for 403 responses.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {object} [options]
 * @param {number} [options.timeout=30] - Request timeout in seconds.
 * @param {number} [options.maxRetries=3] - Retries after the first attempt.
 * @param {number} [options.retryDelay=1000] - Base backoff delay in ms,
 *   doubled on every retry.
 * @param {boolean} [options.useBrowser=false] - Skip axios and always use the browser.
 * @param {boolean} [options.disableBrowserFallback=false] - Never fall back
 *   to the browser on 403.
 * @returns {Promise<{content: string, statusCode: number, url: string}>}
 * @throws {TypeError} When `url` is not a valid absolute URL.
 * @throws {HttpError} For a non-retryable status, or the last retryable one.
 * @throws {NetworkError} When the request itself keeps failing.
 */
async function fetchUrl(url, options = {}) {
  const {
    timeout = 30,
    maxRetries = 3,
    retryDelay = 1e3,
    useBrowser = false,
    disableBrowserFallback = false
  } = options;
  // Validation only: throws for malformed URLs before any request is made.
  new URL(url);
  const retryableStatuses = [408, 429, 500, 502, 503, 504];
  let lastError = null;
  let attemptedBrowser = false;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      if (useBrowser || attemptedBrowser) {
        return await fetchUrlWithBrowser(url, timeout);
      }
      const response = await axiosInstance.get(url, {
        timeout: timeout * 1e3,
        headers: {
          "User-Agent": "sitemap-qa/1.0.0 (compatible; +https://github.com/Akotliar/sitemap-qa)",
          "Accept": "text/xml,application/xml,text/plain,*/*",
          "Accept-Encoding": "gzip, deflate",
          "Connection": "keep-alive"
        }
      });
      const statusCode = response.status;
      // Final URL after redirects
      const finalUrl = response.request?.res?.responseUrl || url;
      if (statusCode >= 200 && statusCode < 300) {
        const body = response.data;
        return {
          content: typeof body === "string" ? body : JSON.stringify(body),
          statusCode,
          url: finalUrl
        };
      }
      if (statusCode === 403 && !disableBrowserFallback) {
        attemptedBrowser = true;
        lastError = new HttpError(finalUrl, statusCode);
        // Switching transport is not a retry: undo the loop increment so the
        // fallback still runs when maxRetries is 0.
        attempt--;
        continue;
      }
      if (!retryableStatuses.includes(statusCode)) {
        throw new HttpError(finalUrl, statusCode);
      }
      lastError = new HttpError(finalUrl, statusCode);
    } catch (error) {
      if (error?.code === "HTTP_ERROR") {
        if (!retryableStatuses.includes(error.statusCode)) {
          throw error;
        }
        lastError = error;
      } else if (error?.code === "NETWORK_ERROR") {
        // Already wrapped by the browser path; avoid nesting wrappers.
        lastError = error;
      } else {
        lastError = new NetworkError(url, error);
      }
      if (attempt === maxRetries) break;
    }
    if (attempt < maxRetries) {
      const delay = retryDelay * Math.pow(2, attempt);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
  // lastError is only null when the loop never ran (negative maxRetries).
  throw lastError ?? new NetworkError(url, new Error("No request attempts were made"));
}
|
|
313
|
-
|
|
314
|
-
// src/core/discovery.ts
|
|
315
|
-
/**
 * Probe the well-known sitemap locations on the origin of `baseUrl`.
 * All candidates are requested concurrently without retries; the first
 * candidate (in list order) that answered with status 200 wins.
 *
 * @param {string} baseUrl - Any URL on the target site; only its origin is used.
 * @param {object} config - Reads `timeout` and `verbose`.
 * @returns {Promise<{sitemaps: string[], issues: object[]}>} At most one
 *   sitemap URL, plus every 401/403 encountered while probing.
 */
async function tryStandardPaths(baseUrl, config) {
  const { origin } = new URL(baseUrl);
  const accessIssues = [];
  const candidates = [
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/sitemap-index.xml"
  ].map((path) => `${origin}${path}`);

  // Probes one candidate; never rejects — failures become { found: false }.
  const probe = async (sitemapUrl) => {
    try {
      const { statusCode } = await fetchUrl(sitemapUrl, {
        timeout: config.timeout,
        // Don't retry on standard paths - fail fast
        maxRetries: 0
      });
      if (statusCode !== 200) {
        return { found: false };
      }
      if (config.verbose) {
        console.log(`\u2713 Found sitemap at: ${sitemapUrl}`);
      }
      return { found: true, url: sitemapUrl };
    } catch (error) {
      const isHttpError = error instanceof HttpError;
      const isDenied = isHttpError && (error.statusCode === 401 || error.statusCode === 403);
      if (isDenied) {
        accessIssues.push({
          url: sitemapUrl,
          statusCode: error.statusCode,
          error: error.statusCode === 401 ? "Unauthorized" : "Access Denied"
        });
      }
      if (config.verbose) {
        if (isDenied) {
          console.log(`\u26A0 Access denied: ${sitemapUrl} (${error.statusCode})`);
        } else if (isHttpError) {
          console.log(`\u2717 Not found: ${sitemapUrl} (${error.statusCode})`);
        } else {
          console.log(`\u2717 Not found: ${sitemapUrl}`);
        }
      }
      return { found: false };
    }
  };

  const outcomes = await Promise.allSettled(candidates.map(probe));
  const firstHit = outcomes.find((outcome) => outcome.status === "fulfilled" && outcome.value.found);
  if (firstHit) {
    return { sitemaps: [firstHit.value.url], issues: accessIssues };
  }
  if (config.verbose) {
    console.log("No sitemap found at standard paths");
  }
  return { sitemaps: [], issues: accessIssues };
}
|
|
370
|
-
/**
 * Fetch /robots.txt from the origin of `baseUrl` and collect the absolute
 * URLs declared by its `Sitemap:` directives (matched case-insensitively).
 *
 * @param {string} baseUrl - Any URL on the target site; only its origin is used.
 * @param {object} config - Reads `timeout` and `verbose`.
 * @returns {Promise<string[]>} Declared sitemap URLs in file order; an empty
 *   array when robots.txt cannot be fetched or processed.
 */
async function parseRobotsTxt(baseUrl, config) {
  const robotsUrl = `${new URL(baseUrl).origin}/robots.txt`;
  try {
    const result = await fetchUrl(robotsUrl, {
      timeout: config.timeout,
      maxRetries: 1
    });
    const sitemaps = [];
    // Split on any line-ending convention and trim each line: a trailing
    // "\r" from CRLF files would otherwise stop the directive from matching.
    for (const rawLine of result.content.split(/\r\n|\r|\n/)) {
      const match = rawLine.trim().match(/^Sitemap:\s*(.+)$/i);
      if (!match) {
        continue;
      }
      const sitemapUrl = match[1].trim();
      try {
        // Validation only: relative or malformed values are skipped.
        new URL(sitemapUrl);
        sitemaps.push(sitemapUrl);
      } catch {
        if (config.verbose) {
          console.warn(`Invalid sitemap URL in robots.txt: ${sitemapUrl}`);
        }
      }
    }
    if (config.verbose && sitemaps.length > 0) {
      console.log(`Found ${sitemaps.length} sitemap(s) in robots.txt`);
    }
    return sitemaps;
  } catch (error) {
    // Any failure (network, HTTP status, unexpected content) is treated as
    // "no robots.txt" so discovery can continue with other strategies.
    if (config.verbose) {
      console.log(`No robots.txt found at ${robotsUrl}`);
    }
    return [];
  }
}
|
|
404
|
-
/**
 * Decide whether an XML document should be treated as a sitemap index.
 *
 * A document containing "<sitemapindex" always qualifies. A "<urlset"
 * document qualifies when more than half of its first (up to five) <loc>
 * values contain "sitemap" or end with ".xml" (case-insensitive).
 *
 * @param {string} xmlContent - Raw XML text.
 * @returns {boolean}
 */
function isSitemapIndex(xmlContent) {
  if (xmlContent.includes("<sitemapindex")) {
    return true;
  }
  if (!xmlContent.includes("<urlset")) {
    return false;
  }
  const MAX_SAMPLES = 5;
  const urlBlockRegex = /<url[^>]*>.*?<loc>([^<]+)<\/loc>.*?<\/url>/gs;
  let sampled = 0;
  let sitemapLikeCount = 0;
  // Iterate lazily and stop after the sample size: large sitemaps hold tens
  // of thousands of entries and only the first few are ever inspected.
  for (const match of xmlContent.matchAll(urlBlockRegex)) {
    const url = match[1].trim().toLowerCase();
    if (url.includes("sitemap") || url.endsWith(".xml")) {
      sitemapLikeCount++;
    }
    sampled++;
    if (sampled >= MAX_SAMPLES) {
      break;
    }
  }
  return sitemapLikeCount > sampled / 2;
}
|
|
423
|
-
/**
 * Extract child sitemap URLs from a sitemap index document.
 *
 * For "<sitemapindex" documents every <sitemap> entry's <loc> is returned.
 * For any other document, <url> entries are scanned and only <loc> values
 * that contain "sitemap" or end with ".xml" (case-insensitive) are kept.
 * Values that are not valid absolute URLs are skipped.
 *
 * @param {string} xmlContent - Raw XML text.
 * @returns {string[]} Child sitemap URLs in document order.
 */
function extractSitemapIndexUrls(xmlContent) {
  const XML_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  // Single pass so "&amp;lt;" decodes to "&lt;" and is not decoded twice.
  const decodeXmlEntities = (text) => text.replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => XML_ENTITIES[name]);

  const isIndexDocument = xmlContent.includes("<sitemapindex");
  const blockRegex = isIndexDocument ? /<sitemap[^>]*>(.*?)<\/sitemap>/gs : /<url[^>]*>(.*?)<\/url>/gs;
  const urls = [];
  for (const [, blockBody] of xmlContent.matchAll(blockRegex)) {
    const locMatch = /<loc>([^<]+)<\/loc>/i.exec(blockBody);
    if (!locMatch) {
      continue;
    }
    // URLs inside XML are entity-escaped ("&" is written "&amp;"); decode
    // them so the child sitemap is requested at its real address.
    const url = decodeXmlEntities(locMatch[1].trim());
    if (!isIndexDocument) {
      const lowered = url.toLowerCase();
      if (!lowered.includes("sitemap") && !lowered.endsWith(".xml")) {
        continue;
      }
    }
    try {
      new URL(url);
    } catch {
      // Not an absolute URL: skip this entry and keep the rest.
      continue;
    }
    urls.push(url);
  }
  return urls;
}
|
|
458
|
-
/**
 * Expand a list of sitemap URLs into the list of leaf sitemaps, following
 * sitemap indexes breadth-first in batches.
 *
 * Each URL is fetched at most once. Index documents contribute their child
 * URLs to the queue; everything else is reported as a leaf sitemap. The walk
 * stops once more than 1000 URLs have been processed.
 *
 * @param {string[]} initialSitemaps - Starting sitemap URLs.
 * @param {object} config - Reads `discoveryConcurrency` (batch size, default
 *   50), `timeout` and `verbose`.
 * @returns {Promise<string[]>} Leaf sitemap URLs in discovery order.
 */
async function discoverAllSitemaps(initialSitemaps, config) {
  const batchSize = config.discoveryConcurrency || 50;
  const queue = [...initialSitemaps];
  const seen = /* @__PURE__ */ new Set();
  const unreachable = /* @__PURE__ */ new Set();
  const leafSitemaps = [];

  // Classifies a single URL as skip / index / sitemap / failed; never rejects.
  const inspect = async (sitemapUrl) => {
    if (seen.has(sitemapUrl)) {
      if (config.verbose) {
        console.warn(`Skipping duplicate sitemap: ${sitemapUrl}`);
      }
      return { type: "skip" };
    }
    // Marked before the first await so duplicates inside one batch are caught.
    seen.add(sitemapUrl);
    try {
      const { content } = await fetchUrl(sitemapUrl, {
        timeout: config.timeout,
        maxRetries: 2
      });
      if (!isSitemapIndex(content)) {
        if (config.verbose) {
          console.log(`\u2713 Discovered sitemap: ${sitemapUrl}`);
        }
        return { type: "sitemap", url: sitemapUrl };
      }
      if (config.verbose) {
        console.log(`Found sitemap index: ${sitemapUrl}`);
      }
      const childUrls = extractSitemapIndexUrls(content);
      if (config.verbose) {
        console.log(` \u2514\u2500 Contains ${childUrls.length} child sitemap(s)`);
      }
      return { type: "index", childUrls };
    } catch (error) {
      unreachable.add(sitemapUrl);
      if (config.verbose) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
      }
      return { type: "failed" };
    }
  };

  while (queue.length > 0) {
    const batch = queue.splice(0, Math.min(batchSize, queue.length));
    const outcomes = await Promise.all(batch.map(inspect));
    for (const outcome of outcomes) {
      switch (outcome.type) {
        case "index":
          queue.push(...outcome.childUrls);
          break;
        case "sitemap":
          leafSitemaps.push(outcome.url);
          break;
        default:
          break;
      }
    }
    if (seen.size > 1e3) {
      console.warn(`\u26A0\uFE0F Processed over 1000 sitemap URLs. Stopping to prevent excessive requests.`);
      break;
    }
  }

  if (leafSitemaps.length === 0 && unreachable.size > 0) {
    console.warn(`\n\u26A0\uFE0F All ${unreachable.size} sitemap(s) were inaccessible`);
    console.warn(`Common causes: 403/404 errors, network issues, or bot protection`);
  }
  return leafSitemaps;
}
|
|
522
|
-
/**
 * Locate the sitemaps of a site.
 *
 * Strategy: use the `Sitemap:` directives from robots.txt when there are
 * any; otherwise probe the standard sitemap paths. Whatever is found is
 * expanded through discoverAllSitemaps.
 *
 * @param {string} baseUrl - Any URL on the target site; only its origin is used.
 * @param {object} config - Passed through to the helpers; `verbose` enables logging.
 * @returns {Promise<{sitemaps: string[], source: string, accessIssues: object[]}>}
 *   `source` is "robots-txt", "standard-path" or "none". `accessIssues` is
 *   only populated when no sitemap could be resolved via the standard paths.
 */
async function discoverSitemaps(baseUrl, config) {
  const origin = new URL(baseUrl).origin;
  const logVerbose = (message) => {
    if (config.verbose) {
      console.log(message);
    }
  };

  logVerbose("Checking robots.txt for sitemap directives...");
  const declaredInRobots = await parseRobotsTxt(origin, config);
  if (declaredInRobots.length > 0) {
    return {
      sitemaps: await discoverAllSitemaps(declaredInRobots, config),
      source: "robots-txt",
      accessIssues: []
    };
  }

  logVerbose("Trying standard sitemap paths...");
  const probe = await tryStandardPaths(origin, config);
  if (probe.sitemaps.length === 0) {
    return { sitemaps: [], source: "none", accessIssues: probe.issues };
  }

  const expanded = await discoverAllSitemaps(probe.sitemaps, config);
  const hasSitemaps = expanded.length > 0;
  return {
    sitemaps: hasSitemaps ? expanded : [],
    source: "standard-path",
    accessIssues: hasSitemaps ? [] : probe.issues
  };
}
|
|
561
|
-
|
|
562
|
-
// src/core/parser.ts
|
|
563
|
-
var import_fast_xml_parser = require("fast-xml-parser");
|
|
564
|
-
// Lower-case <changefreq> values accepted during sitemap validation.
var VALID_CHANGEFREQ = /* @__PURE__ */ new Set(
  "always hourly daily weekly monthly yearly never".split(" ")
);
|
|
573
|
-
// Shared XML parser instance used for sitemap documents.
var parser = new import_fast_xml_parser.XMLParser({
  allowBooleanAttributes: true,
  attributeNamePrefix: "@_",
  ignoreAttributes: false,
  parseAttributeValue: true,
  // Keep values as strings for validation
  parseTagValue: false,
  textNodeName: "_text",
  trimValues: true
});
|
|
583
|
-
/**
 * Turn a parsed <urlset> document into flat URL entries.
 *
 * @param {object} parsedXml - Parser output; only `urlset.url` is read.
 * @param {string} sitemapUrl - Stored on every entry as `source`.
 * @returns {object[]} One `{ loc, lastmod, changefreq, priority, source }`
 *   entry per node that has a `loc`; `priority` is converted with parseFloat
 *   when present and is `undefined` otherwise.
 */
function extractUrls(parsedXml, sitemapUrl) {
  const { urlset } = parsedXml;
  if (!urlset) {
    return [];
  }
  // A single <url> child may arrive as a bare object rather than an array.
  const nodes = Array.isArray(urlset.url) ? urlset.url : [urlset.url];
  return nodes
    .filter((node) => node && node.loc)
    .map(({ loc, lastmod, changefreq, priority }) => ({
      loc,
      lastmod,
      changefreq,
      priority: priority ? parseFloat(priority) : undefined,
      source: sitemapUrl
    }));
}
|
|
603
|
-
/**
 * Validate and parse one sitemap document.
 *
 * Entries whose `loc` is not a valid absolute URL are dropped and reported.
 * Out-of-range priorities are clamped to 0..1, non-numeric priorities and
 * unknown `changefreq` values are cleared; each correction is reported in
 * `errors` while the entry itself is kept.
 *
 * @param {string} xml - Raw XML text.
 * @param {string} sitemapUrl - URL the document came from (used in messages
 *   and echoed back in the result).
 * @returns {Promise<{urls: object[], errors: string[], totalCount: number, sitemapUrl: string}>}
 *   Never rejects: XML-level failures are returned as a single error entry.
 */
async function parseSitemap(xml, sitemapUrl) {
  const parsingFailed = (reason) => ({
    urls: [],
    errors: [
      `[${sitemapUrl}] XML parsing failed: ${reason}`
    ],
    totalCount: 0,
    sitemapUrl
  });
  try {
    const validationResult = import_fast_xml_parser.XMLValidator.validate(xml);
    if (validationResult !== true) {
      return parsingFailed(typeof validationResult === "object" ? validationResult.err.msg : "Invalid XML");
    }
    const errors = [];
    const validUrls = [];
    for (const entry of extractUrls(parser.parse(xml), sitemapUrl)) {
      try {
        // Validation only; kept separate so that only URL problems are
        // reported as "Invalid URL format".
        new URL(entry.loc);
      } catch {
        errors.push(`Invalid URL format: ${entry.loc}`);
        continue;
      }
      if (entry.priority !== void 0) {
        if (Number.isNaN(entry.priority)) {
          // NaN compares false against both bounds and would otherwise slip
          // through the range check below.
          errors.push(`Invalid priority for ${entry.loc} - ignoring non-numeric value`);
          entry.priority = void 0;
        } else if (entry.priority < 0 || entry.priority > 1) {
          errors.push(
            `Invalid priority ${entry.priority} for ${entry.loc} - clamping to 0-1`
          );
          entry.priority = Math.max(0, Math.min(1, entry.priority));
        }
      }
      if (entry.changefreq) {
        // The parser may hand back a non-string (e.g. an element carrying
        // attributes); treat that as invalid instead of throwing.
        const isKnown = typeof entry.changefreq === "string" && VALID_CHANGEFREQ.has(entry.changefreq.toLowerCase());
        if (!isKnown) {
          errors.push(
            `Invalid changefreq "${entry.changefreq}" for ${entry.loc}`
          );
          entry.changefreq = void 0;
        }
      }
      validUrls.push(entry);
    }
    return {
      urls: validUrls,
      errors,
      totalCount: validUrls.length,
      sitemapUrl
    };
  } catch (parseError) {
    return parsingFailed(parseError instanceof Error ? parseError.message : String(parseError));
  }
}
|
|
663
|
-
|
|
664
|
-
// src/utils/batch-processor.ts
|
|
665
|
-
/**
 * Split an array into consecutive chunks of at most `chunkSize` items.
 *
 * @param {Array} array - Items to split (not mutated).
 * @param {number} chunkSize - Maximum items per chunk; must be a number greater than 0.
 * @returns {Array[]} Chunks in original order; `[]` for an empty input.
 * @throws {RangeError} When `chunkSize` is not a number greater than 0.
 */
function chunkArray(array, chunkSize) {
  // A zero or negative step would never advance the loop below.
  if (typeof chunkSize !== "number" || !(chunkSize > 0)) {
    throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
  }
  const chunks = [];
  for (let start = 0; start < array.length; start += chunkSize) {
    chunks.push(array.slice(start, start + chunkSize));
  }
  return chunks;
}
|
|
672
|
-
/**
 * Run `processor` over `items` with at most `concurrency` calls in flight.
 *
 * Workers pull the next unclaimed index from a shared cursor, so results
 * keep the order of `items` regardless of completion order. A processor
 * failure stores `null` in that slot; the number of failures is reported
 * once via console.warn.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} concurrency - Maximum number of simultaneous workers.
 * @param {Function} processor - Called with one item; may be async.
 * @param {Function} [onProgress] - Called as `(completed, total)` after every item.
 * @returns {Promise<Array>} Results aligned with `items`.
 */
async function processInBatches(items, concurrency, processor, onProgress) {
  const results = new Array(items.length);
  const failures = [];
  let cursor = 0;
  let finished = 0;

  const runWorker = async () => {
    while (cursor < items.length) {
      const slot = cursor;
      cursor += 1;
      try {
        results[slot] = await processor(items[slot]);
      } catch (error) {
        failures.push({ index: slot, error });
        results[slot] = null;
      }
      finished += 1;
      if (onProgress) {
        onProgress(finished, items.length);
      }
    }
  };

  const workerCount = Math.min(concurrency, items.length);
  const workers = Array(workerCount).fill(null).map(() => runWorker());
  await Promise.all(workers);

  if (failures.length > 0) {
    console.warn(`Processed ${items.length} items with ${failures.length} errors`);
  }
  return results;
}
|
|
699
|
-
|
|
700
|
-
// src/core/extractor.ts
|
|
701
|
-
/**
 * Download and parse every sitemap, returning all extracted URL entries.
 *
 * Each sitemap is fetched with a fixed 10 second timeout, no retries and no
 * browser fallback, then parsed; every extracted entry is stamped with an
 * `extractedAt` ISO timestamp. A sitemap that cannot be fetched counts as
 * failed and contributes one error message.
 *
 * @param {string[]} sitemapUrls - Sitemaps to process.
 * @param {object} config - Reads `parsingConcurrency` (default 50),
 *   `verbose` and `silent`.
 * @param {Function} [onProgress] - Forwarded to processInBatches.
 * @returns {Promise<{allUrls: object[], sitemapsProcessed: number, sitemapsFailed: number, totalUrls: number, errors: string[]}>}
 */
async function extractAllUrls(sitemapUrls, config, onProgress) {
  const allUrls = [];
  const allErrors = [];
  let sitemapsProcessed = 0;
  let sitemapsFailed = 0;
  if (config.verbose) {
    console.log(`\nExtracting URLs from ${sitemapUrls.length} sitemap(s)...`);
  }
  const CONCURRENCY = config.parsingConcurrency || 50;
  if (!config.silent && config.verbose) {
    console.log(`Using parsing concurrency: ${CONCURRENCY}`);
  }
  const results = await processInBatches(
    sitemapUrls,
    CONCURRENCY,
    async (sitemapUrl) => {
      try {
        if (config.verbose) {
          console.log(`Extracting URLs from: ${sitemapUrl}`);
        }
        const response = await fetchUrl(sitemapUrl, {
          // Fast timeout for sitemaps
          timeout: 10,
          // No retries - fail fast
          maxRetries: 0,
          // Don't use browser for bulk parsing
          disableBrowserFallback: true
        });
        const parseResult = await parseSitemap(response.content, sitemapUrl);
        const extractedAt = (/* @__PURE__ */ new Date()).toISOString();
        for (const url of parseResult.urls) {
          url.extractedAt = extractedAt;
        }
        if (config.verbose) {
          console.log(` \u2713 Extracted ${parseResult.urls.length} URLs from ${sitemapUrl}`);
        }
        return {
          success: true,
          urls: parseResult.urls,
          errors: parseResult.errors
        };
      } catch (error) {
        const errorMsg = `Failed to process ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`;
        if (config.verbose) {
          console.error(` \u2717 ${errorMsg}`);
        }
        return {
          success: false,
          urls: [],
          errors: [errorMsg]
        };
      }
    },
    // Pass progress callback to batch processor
    onProgress
  );
  for (const result of results) {
    if (!result) {
      // Defensive: a slot without a result object counts as a failure.
      sitemapsFailed++;
      continue;
    }
    if (result.success) {
      sitemapsProcessed++;
      // Append one by one: spreading a very large sitemap into push() can
      // exceed the engine's argument limit and throw a RangeError.
      for (const url of result.urls) {
        allUrls.push(url);
      }
    } else {
      sitemapsFailed++;
    }
    for (const message of result.errors) {
      allErrors.push(message);
    }
  }
  if (config.verbose) {
    console.log(`\nExtraction complete:`);
    console.log(` - Sitemaps processed: ${sitemapsProcessed}`);
    console.log(` - Sitemaps failed: ${sitemapsFailed}`);
    console.log(` - Total URLs: ${allUrls.length}`);
    console.log(` - Errors: ${allErrors.length}`);
  }
  return {
    allUrls,
    sitemapsProcessed,
    sitemapsFailed,
    totalUrls: allUrls.length,
    errors: allErrors
  };
}
|
|
783
|
-
|
|
784
|
-
// src/core/consolidator.ts
|
|
785
|
-
/**
 * Produce a canonical string for a URL so equivalent URLs compare equal.
 *
 * Removes one trailing slash from non-root paths and sorts query parameters
 * by name; protocol, host and hash are kept as parsed.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} The normalized URL, or the input unchanged when it
 *   cannot be parsed.
 */
function normalizeUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  const { protocol, host, hash } = parsed;
  const hasTrailingSlash = parsed.pathname !== "/" && parsed.pathname.endsWith("/");
  const path = hasTrailingSlash ? parsed.pathname.slice(0, -1) : parsed.pathname;
  const sortedPairs = [...parsed.searchParams].sort(([nameA], [nameB]) => nameA.localeCompare(nameB));
  const query = new URLSearchParams(sortedPairs).toString();
  const querySuffix = query ? "?" + query : "";
  return `${protocol}//${host}${path}${querySuffix}${hash}`;
}
|
|
801
|
-
/**
 * Merge duplicate entries for the same URL into one entry.
 *
 * The result starts as a copy of the first entry, then:
 * - `source` lists every entry's source, joined by ", ";
 * - `lastmod` and `extractedAt` become the most recent parseable value, as
 *   an ISO string (unparseable values are ignored);
 * - `priority` becomes the highest defined priority;
 * - `changefreq` becomes the most frequent value (first seen wins ties).
 *
 * @param {object[]} entries - Entries that normalize to the same URL.
 * @returns {object} The merged entry; a single-entry list returns that entry as-is.
 */
function mergeUrlEntries(entries) {
  if (entries.length === 1) return entries[0];

  // Newest parseable timestamp among `values` as ISO text, or undefined.
  // Invalid dates are dropped: new Date(NaN).toISOString() throws RangeError.
  const latestIso = (values) => {
    let latest = null;
    for (const value of values) {
      if (!value) continue;
      const time = new Date(value).getTime();
      if (Number.isNaN(time)) continue;
      if (latest === null || time > latest) latest = time;
    }
    return latest === null ? void 0 : new Date(latest).toISOString();
  };

  const merged = { ...entries[0] };
  merged.source = entries.map((e) => e.source).join(", ");

  const lastmod = latestIso(entries.map((e) => e.lastmod));
  if (lastmod !== void 0) {
    merged.lastmod = lastmod;
  }

  const priorities = entries.map((e) => e.priority).filter((p) => p !== void 0);
  if (priorities.length > 0) {
    merged.priority = Math.max(...priorities);
  }

  const changefreqs = entries.map((e) => e.changefreq).filter((cf) => !!cf);
  if (changefreqs.length > 0) {
    const counts = /* @__PURE__ */ new Map();
    for (const cf of changefreqs) {
      counts.set(cf, (counts.get(cf) || 0) + 1);
    }
    // Stable sort keeps first-seen order among equally frequent values.
    const [[mostCommon]] = Array.from(counts.entries()).sort((a, b) => b[1] - a[1]);
    merged.changefreq = mostCommon;
  }

  const extractedAt = latestIso(entries.map((e) => e.extractedAt));
  if (extractedAt !== void 0) {
    merged.extractedAt = extractedAt;
  }
  return merged;
}
|
|
829
|
-
/**
 * De-duplicates sitemap entries by normalised URL.
 *
 * Entries whose `loc` normalises to the same string (via `normalizeUrl`) are
 * merged with `mergeUrlEntries`. Every URL seen more than once is also
 * reported in `duplicateGroups`, in first-seen order.
 *
 * Verbose logging sorts a copy of `duplicateGroups` for the "Top duplicates"
 * list, so the returned array is the same whether or not `verbose` is set.
 *
 * @param {Array<object>} urls - Entries with at least `loc` and `source`.
 * @param {boolean} [verbose=false] - Print a consolidation summary to stdout.
 * @returns {{uniqueUrls: object[], totalInputUrls: number, duplicatesRemoved: number, duplicateGroups: Array<{url: string, count: number, sources: Array}>}}
 */
function consolidateUrls(urls, verbose = false) {
  const totalInputUrls = urls.length;
  if (verbose) {
    console.log(`\nConsolidating ${urls.length} URL(s)...`);
  }
  const urlMap = /* @__PURE__ */ new Map();
  for (const entry of urls) {
    const normalized = normalizeUrl(entry.loc);
    const bucket = urlMap.get(normalized);
    if (bucket) {
      bucket.push(entry);
    } else {
      urlMap.set(normalized, [entry]);
    }
  }
  const uniqueUrls = [];
  const duplicateGroups = [];
  for (const [normalized, entries] of urlMap) {
    uniqueUrls.push(mergeUrlEntries(entries));
    if (entries.length > 1) {
      duplicateGroups.push({
        url: normalized,
        count: entries.length,
        sources: entries.map((e) => e.source)
      });
    }
  }
  const duplicatesRemoved = totalInputUrls - uniqueUrls.length;
  if (verbose) {
    console.log(`Consolidation complete:`);
    console.log(` - Input URLs: ${totalInputUrls}`);
    console.log(` - Unique URLs: ${uniqueUrls.length}`);
    console.log(` - Duplicates removed: ${duplicatesRemoved}`);
    if (duplicateGroups.length > 0) {
      console.log(`\nTop duplicates:`);
      // Sort a copy: logging must not reorder the array handed back to the caller.
      const top5 = [...duplicateGroups].sort((a, b) => b.count - a.count).slice(0, 5);
      for (const group of top5) {
        console.log(` - ${group.url} (${group.count} times)`);
      }
    }
  }
  return {
    uniqueUrls,
    totalInputUrls,
    duplicatesRemoved,
    duplicateGroups
  };
}
|
|
877
|
-
|
|
878
|
-
// src/core/patterns/risk-patterns.ts
|
|
879
|
-
// Baseline risk rules. Each row is [name, category, severity, regex, description]
// and is expanded into a rule object with exactly those keys.
var RISK_PATTERNS = [
  // Sensitive query parameters
  [
    "Authentication Parameter",
    "sensitive_params",
    "high",
    /[?&](token|auth|key|password|secret|apikey|session|credentials)=/i,
    "Query parameter may contain sensitive authentication data"
  ],
  [
    "Debug Parameter",
    "sensitive_params",
    "medium",
    /[?&](debug|trace|verbose|test_mode)=/i,
    "Query parameter may contain debug or diagnostic flag"
  ],
  // Protocol inconsistency
  [
    "HTTP in HTTPS Site",
    "protocol_inconsistency",
    "medium",
    /^http:\/\//,
    "HTTP URL in HTTPS sitemap (potential mixed content)"
  ],
  // Test / unfinished content: matches a "test-"-style prefix on a segment, or a
  // whole segment named test/demo/sample/temp/temporary/placeholder.
  [
    "Test Content Path",
    "test_content",
    "medium",
    /\/(?:test-|demo-|sample-|temp-|temporary-|placeholder-)|\/(test|demo|sample|temp|temporary|placeholder)(?:\/|$)/i,
    "URL path suggests test, demo, or unfinished content that may not be intended for indexing"
  ]
].map(([name, category, severity, regex, description]) => ({ name, category, severity, regex, description }));
|
|
913
|
-
|
|
914
|
-
// src/core/patterns/domain-patterns.ts
|
|
915
|
-
/**
 * Escapes regular-expression metacharacters so `str` can be embedded in a
 * pattern and matched literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} `str` with each metacharacter prefixed by a backslash.
 */
function escapeRegex(str) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metacharacters, (character) => `\\${character}`);
}
|
|
918
|
-
/**
 * Derives the registrable ("root") domain from a hostname.
 *
 * Normally this is the last two labels (`www.example.com` -> `example.com`).
 * Two cases where that is wrong are handled explicitly:
 *  - IPv4 literals are returned unchanged (previously `127.0.0.1` -> `0.1`).
 *  - Hosts under a two-letter country TLD with a generic second level
 *    (`co.uk`, `com.au`, `org.nz`, ...) keep three labels, so
 *    `shop.example.co.uk` -> `example.co.uk` rather than `co.uk`.
 *
 * NOTE(review): the second rule is a heuristic, not a Public Suffix List
 * lookup; unusual suffixes are still reduced to their last two labels.
 *
 * @param {string} hostname - Hostname without scheme or port.
 * @returns {string} The root domain, or `hostname` itself when it has a single label or is an IPv4 literal.
 */
function extractRootDomain(hostname) {
  if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(hostname)) {
    return hostname;
  }
  const labels = hostname.split(".");
  if (labels.length < 2) {
    return hostname;
  }
  const genericSecondLevels = /* @__PURE__ */ new Set(["co", "com", "org", "net", "gov", "edu", "ac"]);
  const topLevel = labels[labels.length - 1];
  const secondLevel = labels[labels.length - 2].toLowerCase();
  const hasTwoLabelSuffix = labels.length >= 3 && topLevel.length === 2 && genericSecondLevels.has(secondLevel);
  return labels.slice(hasTwoLabelSuffix ? -3 : -2).join(".");
}
|
|
925
|
-
/**
 * Builds the "Domain Mismatch" rule for a sitemap's base URL.
 *
 * The rule's regex matches any http(s) URL whose host is NOT one of:
 *  - the root domain of `baseUrl`,
 *  - `www.<root>` (or, when `options.allowedSubdomains` is non-empty,
 *    `<sub>.<root>` for each listed subdomain instead of `www`),
 *  - the exact host of `baseUrl`, so a sitemap served from
 *    `blog.example.com` does not flag its own URLs.
 *
 * Host comparison is case-insensitive. An explicit port, or a `?`/`#`
 * directly after the host, does not make an otherwise matching host count
 * as a mismatch.
 *
 * @param {string} baseUrl - Absolute URL of the site being analysed.
 * @param {{allowedSubdomains?: string[]}} [options] - Optional subdomain allow-list.
 * @returns {{name: string, category: string, severity: string, regex: RegExp, description: string}}
 * @throws {TypeError} When `baseUrl` is not a valid absolute URL.
 */
function createDomainMismatchPattern(baseUrl, options) {
  const baseDomain = new URL(baseUrl).hostname;
  const rootDomain = extractRootDomain(baseDomain);
  const allowedSubdomains = options?.allowedSubdomains ?? [];
  const hasAllowList = allowedSubdomains.length > 0;
  const subdomainChoices = hasAllowList ? allowedSubdomains.map((sub) => escapeRegex(sub)).join("|") : "www";
  const acceptedHosts = [
    `(?:(?:${subdomainChoices})\\.)?${escapeRegex(rootDomain)}`,
    escapeRegex(baseDomain)
  ].join("|");
  // Negative lookahead: flag the URL unless an accepted host, optionally followed by a
  // port, ends exactly where the authority ends.
  const pattern = `^https?://(?!(?:${acceptedHosts})(?::\\d+)?(?:[/?#]|$))`;
  return {
    name: "Domain Mismatch",
    category: "domain_mismatch",
    severity: "high",
    regex: new RegExp(pattern, "i"),
    description: hasAllowList ? `URL does not match expected domain or allowed subdomains` : `URL does not match expected domain: ${rootDomain} (including www variant)`
  };
}
|
|
950
|
-
// Rules that flag URLs leaking from non-production environments. All are
// evaluated against the full URL string and are high severity.
var ENVIRONMENT_PATTERNS = [
  {
    name: "Staging Subdomain",
    category: "environment_leakage",
    severity: "high",
    regex: /^https?:\/\/(staging|stg)\./i,
    description: "URL uses staging subdomain"
  },
  {
    name: "Development Subdomain",
    category: "environment_leakage",
    severity: "high",
    regex: /^https?:\/\/(dev|development)\./i,
    description: "URL uses development subdomain"
  },
  {
    name: "QA/Test Subdomain",
    category: "environment_leakage",
    severity: "high",
    regex: /^https?:\/\/(qa|test|uat|preprod)\./i,
    description: "URL uses test environment subdomain"
  },
  {
    name: "Localhost URL",
    category: "environment_leakage",
    severity: "high",
    // The lookahead requires the host to end right after the loopback name, so
    // public hosts that merely start with "localhost" (e.g. localhostels.com) or
    // "127.0.0.1." are not flagged. It is a lookahead so the matched text stays
    // "scheme://host". Covers the whole 127.0.0.0/8 block and IPv6 [::1].
    regex: /^https?:\/\/(localhost|127(?:\.\d{1,3}){3}|0\.0\.0\.0|\[::1\])(?=[:\/?#]|$)/i,
    description: "URL points to localhost (development environment)"
  },
  {
    name: "Environment in Path",
    category: "environment_leakage",
    severity: "high",
    // Only the first path segment is checked, and it must be followed by "/".
    regex: /^https?:\/\/[^/]+\/(staging|dev|qa|uat|preprod)\//i,
    description: "URL path contains environment identifier at root level"
  }
];
|
|
987
|
-
|
|
988
|
-
// src/core/patterns/admin-patterns.ts
|
|
989
|
-
// Rules flagging administrative path segments. Each row is
// [name, regex, description]; every rule is category "admin_paths", severity "high".
// Each regex requires the segment to be followed by "/", "?" or end of string.
var ADMIN_PATH_PATTERNS = [
  ["Admin Path", /\/(admin|administrator)(?:\/|$|\?)/i, "URL contains /admin or /administrator as a path segment"],
  ["Dashboard Path", /\/dashboard(?:\/|$|\?)/i, "URL contains /dashboard as a path segment"],
  ["Config Path", /\/(config|configuration)(?:\/|$|\?)/i, "URL contains /config or /configuration as a path segment"],
  ["Console Path", /\/console(?:\/|$|\?)/i, "URL contains /console as a path segment"],
  ["Control Panel Path", /\/(cpanel|control-panel)(?:\/|$|\?)/i, "URL contains control panel as a path segment"]
].map(([name, regex, description]) => ({
  name,
  category: "admin_paths",
  severity: "high",
  regex,
  description
}));
|
|
1026
|
-
// Rules flagging paths that look internal-only. Each row is [name, regex, description];
// every rule is category "internal_content", severity "medium".
var INTERNAL_CONTENT_PATTERNS = [
  [
    "Internal Content Path",
    // "/internal" followed by a word boundary, so "/internal/", "/internal-docs"
    // and "/internal.html" match but "/internally" does not.
    /\/internal\b/i,
    "URL contains /internal path segment - may be internal-only content not intended for public indexing"
  ]
].map(([name, regex, description]) => ({
  name,
  category: "internal_content",
  severity: "medium",
  regex,
  description
}));
|
|
1035
|
-
// Fine-grained rules for sensitive query parameters. Each row is
// [name, severity, regex, description]; every rule is category "sensitive_params".
// Each regex matches a parameter name directly after "?" or "&" and before "=".
var SENSITIVE_PARAM_PATTERNS = [
  ["Authentication Token Parameter", "high", /[?&](token|auth_token|access_token|api_token)=/i, "Query parameter may contain authentication token"],
  ["API Key Parameter", "high", /[?&](apikey|api_key|key)=/i, "Query parameter may contain API key"],
  ["Password Parameter", "high", /[?&](password|passwd|pwd)=/i, "Query parameter may contain password"],
  ["Secret Parameter", "high", /[?&](secret|client_secret)=/i, "Query parameter may contain secret value"],
  ["Session Parameter", "high", /[?&](session|sessionid|sid)=/i, "Query parameter may contain session identifier"],
  ["Credentials Parameter", "high", /[?&]credentials=/i, "Query parameter may contain credentials"],
  ["Debug Parameter", "medium", /[?&](debug|trace|verbose)=/i, "Query parameter contains debug or diagnostic flag"],
  ["Test Mode Parameter", "medium", /[?&](test_mode|test|testing)=/i, "Query parameter indicates test mode"]
].map(([name, severity, regex, description]) => ({
  name,
  category: "sensitive_params",
  severity,
  regex,
  description
}));
|
|
1093
|
-
|
|
1094
|
-
// src/utils/sanitizer.ts
|
|
1095
|
-
/**
 * Redacts the values of sensitive query parameters so a URL can be shown in
 * reports without leaking credentials.
 *
 * Parameter names are compared case-insensitively: the detection regexes that
 * trigger sanitisation use the `i` flag, so `?Token=...` must be redacted just
 * like `?token=...`. All values of a repeated sensitive parameter collapse
 * into a single redacted value.
 *
 * When `url` cannot be parsed as an absolute URL, a textual redaction is
 * applied instead of returning the secret untouched.
 *
 * @param {string} url - URL that may carry sensitive parameters.
 * @returns {string} The URL with sensitive values replaced by `[REDACTED]`
 *   (percent-encoded by the URL serialiser on the parsed path). Non-string
 *   input that cannot be parsed is returned unchanged.
 */
function sanitizeUrl(url) {
  const sensitiveParams = /* @__PURE__ */ new Set([
    "token",
    "auth",
    "auth_token",
    "access_token",
    "api_token",
    "apikey",
    "api_key",
    "key",
    "password",
    "passwd",
    "pwd",
    "secret",
    "client_secret",
    "session",
    "sessionid",
    "sid",
    "credentials"
  ]);
  try {
    const parsed = new URL(url);
    // Snapshot the distinct keys first: set() mutates the list being iterated.
    const presentKeys = new Set(parsed.searchParams.keys());
    for (const key of presentKeys) {
      if (sensitiveParams.has(key.toLowerCase())) {
        parsed.searchParams.set(key, "[REDACTED]");
      }
    }
    return parsed.toString();
  } catch {
    if (typeof url !== "string") {
      return url;
    }
    // Not an absolute URL (e.g. a relative path): redact textually instead.
    const names = [...sensitiveParams].join("|");
    return url.replace(new RegExp(`([?&](?:${names})=)[^&#]*`, "gi"), "$1[REDACTED]");
  }
}
|
|
1127
|
-
|
|
1128
|
-
// src/core/risk-grouper.ts
|
|
1129
|
-
/**
 * Produces the human-readable rationale and recommended action for a risk
 * category.
 *
 * @param {string} category - Risk category identifier.
 * @param {string} _severity - Unused; kept for signature compatibility.
 * @param {number} count - Number of URLs flagged in the category.
 * @returns {{rationale: string, recommendedAction: string}} Texts for the
 *   category, or a generic pair when the category is not recognised.
 */
function generateRecommendation(category, _severity, count) {
  // category -> [rationale, recommendedAction]
  const texts = /* @__PURE__ */ new Map([
    [
      "environment_leakage",
      [
        `Production sitemap contains ${count} URL(s) from non-production environments (staging, dev, QA, test). This indicates configuration errors or environment leakage.`,
        "Verify sitemap generation excludes non-production environments. Review deployment configuration and environment filtering rules."
      ]
    ],
    [
      "admin_paths",
      [
        `${count} administrative path(s) detected in public sitemap (admin, dashboard, config). These paths may expose privileged access points.`,
        "Confirm if admin paths should be publicly indexed. Consider excluding via robots.txt or removing from sitemap. Verify access controls."
      ]
    ],
    [
      "internal_content",
      [
        `${count} URL(s) contain "internal" in the path. These may be internal-facing content not intended for public indexing.`,
        "Review URLs to determine if they should be publicly accessible. Consider excluding internal content from sitemap or adding noindex meta tags."
      ]
    ],
    [
      "test_content",
      [
        `${count} URL(s) contain test/demo/sample identifiers. These may be placeholder or unfinished content not intended for indexing.`,
        "Review and remove test content from production sitemaps. Verify content is production-ready before including in sitemap."
      ]
    ],
    [
      "sensitive_params",
      [
        `${count} URL(s) contain sensitive query parameters (token, auth, key, password, session). This may expose authentication credentials or debugging flags.`,
        "Review why sensitive parameters are in sitemap URLs. Remove authentication tokens from URLs. Consider POST requests for sensitive data."
      ]
    ],
    [
      "protocol_inconsistency",
      [
        `${count} URL(s) use HTTP protocol in HTTPS sitemap. This creates mixed content warnings and potential security issues.`,
        "Update URLs to use HTTPS consistently. Verify SSL certificate coverage. Check for hardcoded HTTP URLs in content."
      ]
    ],
    [
      "domain_mismatch",
      [
        `${count} URL(s) do not match expected base domain. This may indicate external links, CDN URLs, or configuration errors.`,
        "Verify if external domains are intentional. Review sitemap generation logic. Confirm CDN or subdomain configuration is correct."
      ]
    ]
  ]);
  const fallback = [
    `${count} URL(s) flagged in category: ${category}`,
    "Review flagged URLs and determine appropriate action."
  ];
  const [rationale, recommendedAction] = texts.get(category) ?? fallback;
  return { rationale, recommendedAction };
}
|
|
1173
|
-
/**
 * Aggregates individual findings into one group per risk category.
 *
 * Each group carries the category's distinct URLs, the highest severity seen
 * in that category, up to `maxSampleUrls` sample URLs, and the texts from
 * `generateRecommendation`. Groups are ordered high -> medium -> low; the
 * per-severity totals sum group URL counts by group severity.
 *
 * @param {Array<{url: string, category: string, severity: string}>} findings
 * @param {number} [maxSampleUrls=5] - Maximum number of sample URLs per group.
 * @returns {{groups: object[], totalRiskUrls: number, highSeverityCount: number, mediumSeverityCount: number, lowSeverityCount: number}}
 */
function groupRiskFindings(findings, maxSampleUrls = 5) {
  const ascending = ["low", "medium", "high"];
  const byCategory = /* @__PURE__ */ new Map();
  for (const finding of findings) {
    const bucket = byCategory.get(finding.category);
    if (bucket === void 0) {
      byCategory.set(finding.category, [finding]);
    } else {
      bucket.push(finding);
    }
  }

  const groups = [];
  byCategory.forEach((bucket, category) => {
    const distinctUrls = [...new Set(bucket.map((item) => item.url))];
    // Start from "low" and only ever move up; unrecognised severities never win.
    let severity = "low";
    for (const item of bucket) {
      if (ascending.indexOf(item.severity) > ascending.indexOf(severity)) {
        severity = item.severity;
      }
    }
    const advice = generateRecommendation(category, severity, distinctUrls.length);
    groups.push({
      category,
      severity,
      count: distinctUrls.length,
      rationale: advice.rationale,
      sampleUrls: distinctUrls.slice(0, maxSampleUrls),
      recommendedAction: advice.recommendedAction,
      allUrls: distinctUrls
    });
  });

  const displayRank = { high: 0, medium: 1, low: 2 };
  groups.sort((left, right) => displayRank[left.severity] - displayRank[right.severity]);

  const totals = { high: 0, medium: 0, low: 0 };
  for (const group of groups) {
    totals[group.severity] += group.count;
  }
  return {
    groups,
    totalRiskUrls: new Set(findings.map((item) => item.url)).size,
    highSeverityCount: totals.high,
    mediumSeverityCount: totals.medium,
    lowSeverityCount: totals.low
  };
}
|
|
1216
|
-
|
|
1217
|
-
// src/core/risk-detector.ts
|
|
1218
|
-
var import_os2 = __toESM(require("os"), 1);
|
|
1219
|
-
/**
 * Compiles the user's accepted (allow-listed) URL globs into case-insensitive
 * regular expressions.
 *
 * Every regex metacharacter except `*` is escaped, and `*` becomes "any run of
 * characters other than `/`". Unless the escaped text ends in `$` or contains
 * `(?:`, a boundary is appended so the pattern must be followed by `/`, `?`,
 * `#` or end of string. Entries that fail to compile are skipped, with a
 * warning when `config.verbose` is set.
 *
 * @param {{acceptedPatterns?: string[], verbose?: boolean}} config
 * @returns {RegExp[]} Compiled patterns, in input order.
 */
function compileAcceptedPatterns(config) {
  const compiled = [];
  const globs = config.acceptedPatterns;
  if (!(globs && globs.length > 0)) {
    return compiled;
  }
  for (const glob of globs) {
    try {
      const escaped = glob.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, "[^/]*");
      const isBounded = escaped.endsWith("$") || escaped.includes("(?:");
      const source = isBounded ? escaped : `${escaped}(?:/|$|\\?|#)`;
      compiled.push(new RegExp(source, "i"));
    } catch {
      if (config.verbose) {
        console.warn(`Invalid accepted pattern: ${glob}`);
      }
    }
  }
  return compiled;
}
|
|
1238
|
-
/**
 * Runs every risk rule against one batch of sitemap entries.
 *
 * URLs matching any accepted pattern are skipped entirely. Rules in the
 * "protocol_inconsistency" category ignore their regex: they report a finding
 * when the expected protocol is "https:" and the URL parses as "http:".
 * All other rules are matched with `String.prototype.match`; for
 * "sensitive_params" findings the reported URL is passed through `sanitizeUrl`.
 *
 * @param {Array<{loc: string}>} urls - Entries to analyse.
 * @param {Array<{name: string, category: string, severity: string, regex: RegExp, description: string}>} allPatterns
 * @param {RegExp[]} acceptedPatterns - Allow-list; a match suppresses all findings for the URL.
 * @param {string} expectedProtocol - Protocol of the base URL, e.g. "https:".
 * @param {boolean} verbose - Log rule evaluation failures to stderr.
 * @returns {Promise<{findings: object[], urlsProcessed: number}>}
 */
async function detectRisksInBatch(urls, allPatterns, acceptedPatterns, expectedProtocol, verbose) {
  const findings = [];
  for (const urlEntry of urls) {
    const url = urlEntry.loc;
    if (acceptedPatterns.some((accepted) => accepted.test(url))) {
      continue;
    }
    for (const rule of allPatterns) {
      if (rule.category === "protocol_inconsistency") {
        let scheme;
        try {
          scheme = new URL(url).protocol;
        } catch {
          // Unparseable URL: nothing to say about its protocol.
          continue;
        }
        if (expectedProtocol === "https:" && scheme === "http:") {
          findings.push({
            url,
            category: rule.category,
            severity: rule.severity,
            pattern: rule.name,
            rationale: rule.description,
            matchedValue: "http://"
          });
        }
        continue;
      }
      try {
        const hit = url.match(rule.regex);
        if (hit) {
          const reportedUrl = rule.category === "sensitive_params" ? sanitizeUrl(url) : url;
          findings.push({
            url: reportedUrl,
            category: rule.category,
            severity: rule.severity,
            pattern: rule.name,
            rationale: rule.description,
            matchedValue: hit[0]
          });
        }
      } catch (error) {
        if (verbose) {
          console.error(`Pattern matching failed for ${rule.name}: ${error instanceof Error ? error.message : String(error)}`);
        }
      }
    }
  }
  return { findings, urlsProcessed: urls.length };
}
|
|
1291
|
-
/**
 * Analyses all sitemap URLs for risky patterns and summarises the findings.
 *
 * The URLs are split into batches (`config.riskDetectionBatchSize`, default
 * 10000) which are handed to `processInBatches` with a concurrency of
 * `config.riskDetectionConcurrency` (default: CPU count - 1, at least 2).
 * Findings from every batch are combined and grouped with `groupRiskFindings`.
 *
 * An invalid `baseUrl` no longer aborts the analysis: the expected protocol
 * falls back to "https:" and the domain-mismatch rule, which cannot be built
 * without a valid base host, is left out.
 *
 * @param {Array<{loc: string}>} urls - Entries to analyse.
 * @param {string} baseUrl - Base URL of the site the sitemap belongs to.
 * @param {object} config - Reads `verbose`, `acceptedPatterns`,
 *   `riskDetectionBatchSize` and `riskDetectionConcurrency`.
 * @returns {Promise<object>} Findings, groups, URL counts, per-severity counts and `processingTimeMs`.
 */
async function detectRisks(urls, baseUrl, config) {
  const startTime = Date.now();
  let parsedBase = null;
  try {
    parsedBase = new URL(baseUrl);
  } catch {
    if (config.verbose) {
      console.warn(`Invalid base URL: ${baseUrl}, defaulting to https: and skipping domain mismatch check`);
    }
  }
  const expectedProtocol = parsedBase ? parsedBase.protocol : "https:";
  const allPatterns = [
    ...RISK_PATTERNS,
    ...ENVIRONMENT_PATTERNS,
    ...ADMIN_PATH_PATTERNS,
    ...SENSITIVE_PARAM_PATTERNS,
    ...INTERNAL_CONTENT_PATTERNS
  ];
  if (parsedBase) {
    // Only buildable from a valid base URL; createDomainMismatchPattern throws otherwise.
    allPatterns.push(createDomainMismatchPattern(baseUrl));
  }
  const acceptedPatterns = compileAcceptedPatterns(config);
  const BATCH_SIZE = config.riskDetectionBatchSize || 1e4;
  const CONCURRENCY = config.riskDetectionConcurrency || Math.max(2, import_os2.default.cpus().length - 1);
  const batches = chunkArray(urls, BATCH_SIZE);
  if (config.verbose) {
    console.log(`\nRisk Detection Configuration:`);
    console.log(` - Total URLs: ${urls.length.toLocaleString()}`);
    console.log(` - Batch size: ${BATCH_SIZE.toLocaleString()}`);
    console.log(` - Concurrency: ${CONCURRENCY}`);
    console.log(` - Total batches: ${batches.length}`);
    if (parsedBase) {
      console.log(` - Base domain: ${parsedBase.hostname}`);
    } else {
      console.log(` - Base URL: ${baseUrl}`);
    }
    if (acceptedPatterns.length > 0) {
      console.log(` - Accepted patterns: ${acceptedPatterns.length}`);
    }
  }
  const totalBatches = batches.length;
  const batchStartTime = Date.now();
  const batchResults = await processInBatches(
    batches,
    CONCURRENCY,
    (batch) => detectRisksInBatch(batch, allPatterns, acceptedPatterns, expectedProtocol, config.verbose),
    (completed) => {
      const pct = (completed / totalBatches * 100).toFixed(1);
      const elapsed = (Date.now() - batchStartTime) / 1e3;
      // The final batch is usually short, so cap the estimate at the real total.
      const urlsProcessed = Math.min(completed * BATCH_SIZE, urls.length);
      // No measurable time elapsed yet: the speed is unknown rather than infinite.
      const speed = elapsed > 0 ? Math.round(urlsProcessed / elapsed) : null;
      const etaLabel = speed ? `~${Math.round((urls.length - urlsProcessed) / speed)}s` : "n/a";
      const speedLabel = speed === null ? "n/a" : `${speed.toLocaleString()} URLs/sec`;
      process.stdout.write(
        `\r\x1B[K Analyzing batch ${completed}/${totalBatches} (${pct}%) | ETA: ${etaLabel} | ${speedLabel}`
      );
    }
  );
  // Clear the progress line.
  process.stdout.write("\r\x1B[K");
  const allFindings = batchResults.flatMap((r) => r.findings);
  const groupingResult = groupRiskFindings(allFindings);
  const processingTimeMs = Date.now() - startTime;
  if (config.verbose) {
    console.log(`\nRisk Detection Summary:`);
    console.log(` - Total URLs analyzed: ${urls.length.toLocaleString()}`);
    console.log(` - Risk URLs found: ${groupingResult.totalRiskUrls.toLocaleString()}`);
    console.log(` - HIGH severity: ${groupingResult.highSeverityCount}`);
    console.log(` - MEDIUM severity: ${groupingResult.mediumSeverityCount}`);
    console.log(` - LOW severity: ${groupingResult.lowSeverityCount}`);
    console.log(` - Processing time: ${(processingTimeMs / 1e3).toFixed(1)}s`);
    if (groupingResult.groups.length > 0) {
      console.log(`\nRisk Categories Found:`);
      for (const group of groupingResult.groups) {
        console.log(` - ${group.category}: ${group.count} URLs (${group.severity.toUpperCase()})`);
      }
    }
  }
  return {
    findings: allFindings,
    groups: groupingResult.groups,
    totalUrlsAnalyzed: urls.length,
    riskUrlCount: groupingResult.totalRiskUrls,
    cleanUrlCount: urls.length - groupingResult.totalRiskUrls,
    highSeverityCount: groupingResult.highSeverityCount,
    mediumSeverityCount: groupingResult.mediumSeverityCount,
    lowSeverityCount: groupingResult.lowSeverityCount,
    processingTimeMs
  };
}
|
|
1384
|
-
|
|
1385
|
-
// src/summarizer.ts
|
|
1386
|
-
/**
 * Builds the rule-based summary of grouped risks.
 *
 * @param {{riskGroups: object[], totalUrls: number, processingTime?: number}} request
 *   `riskGroups` entries provide `category`, `count`, `severity`, `rationale`
 *   and `allUrls` (falling back to `sampleUrls`).
 * @returns {object} Overview sentence, key findings, per-category insights,
 *   per-severity URL totals, an empty `recommendations` list and metadata.
 */
function summarizeRisks(request) {
  const severityBreakdown = { high: 0, medium: 0, low: 0 };
  const categoryInsights = [];
  let totalRisks = 0;
  for (const group of request.riskGroups) {
    severityBreakdown[group.severity] += group.count;
    totalRisks = totalRisks + group.count;
    const urls = group.allUrls || group.sampleUrls;
    categoryInsights.push({
      category: group.category,
      count: group.count,
      severity: group.severity,
      summary: group.rationale,
      examples: urls.slice(0, 3),
      // Full list, kept for download functionality.
      allUrls: urls
    });
  }

  let overview;
  if (totalRisks > 0) {
    overview = `Found ${totalRisks} potentially risky URLs across ${request.riskGroups.length} categories in ${request.totalUrls} total URLs.`;
  } else {
    overview = `Analyzed ${request.totalUrls} URLs. No suspicious patterns detected.`;
  }

  const findingTemplates = [
    ["high", "high-severity issues require immediate attention"],
    ["medium", "medium-severity issues should be reviewed"],
    ["low", "low-severity items flagged for awareness"]
  ];
  const keyFindings = [];
  for (const [level, wording] of findingTemplates) {
    if (severityBreakdown[level] > 0) {
      keyFindings.push(`${severityBreakdown[level]} ${wording}`);
    }
  }

  return {
    overview,
    keyFindings,
    categoryInsights,
    severityBreakdown,
    recommendations: [],
    generatedBy: "rule-based analysis",
    metadata: {
      tokensUsed: 0,
      processingTime: request.processingTime || 0,
      model: "pattern-matching"
    }
  };
}
|
|
1431
|
-
|
|
1432
|
-
// src/reporters/json-reporter.ts
|
|
1433
|
-
// Tool version string reported in generated reports.
var TOOL_VERSION = "1.0.0-alpha.2";
|
|
1434
|
-
/**
 * Serialises an analysis run as a JSON report.
 *
 * The inputs are assembled with `buildAnalysisResult`, converted to the output
 * shape with `transformToJsonOutput`, then stringified.
 *
 * @param {object} summary - Risk summary.
 * @param {object} discoveryResult - Sitemap discovery result.
 * @param {object} parseResult - Sitemap parse result.
 * @param {object[]} riskGroups - Grouped risk findings.
 * @param {object} config - Run configuration.
 * @param {number} startTime - Run start, in epoch milliseconds.
 * @param {{pretty?: boolean, indent?: number, performanceMetrics?: object}} [options]
 *   `pretty` defaults to true and `indent` to 2.
 * @returns {string} The report as JSON text.
 */
function generateJsonReport(summary, discoveryResult, parseResult, riskGroups, config, startTime, options = {}) {
  // Defaults apply only when the option is undefined, as with destructuring defaults.
  const pretty = options.pretty === void 0 ? true : options.pretty;
  const indent = options.indent === void 0 ? 2 : options.indent;
  const analysis = buildAnalysisResult(summary, discoveryResult, parseResult, riskGroups, config, startTime);
  const payload = transformToJsonOutput(analysis, options.performanceMetrics);
  if (!pretty) {
    return JSON.stringify(payload);
  }
  return JSON.stringify(payload, null, indent);
}
|
|
1455
|
-
/**
 * Assembles the internal (camelCase) analysis result from the pipeline outputs.
 *
 * @param {object} summary - Provides `severityBreakdown`, `overview`,
 *   `keyFindings`, `recommendations` and `generatedBy`.
 * @param {{sitemaps: Array}} discoveryResult - Discovered sitemaps.
 * @param {{totalCount: number, errors: Array}} parseResult - Parse totals and errors.
 * @param {object[]} riskGroups - Grouped findings; at most 5 sample URLs are kept per group.
 * @param {{baseUrl?: string}} config - Run configuration; a missing `baseUrl` is reported as "unknown".
 * @param {number} startTime - Run start, in epoch milliseconds.
 * @returns {object} Metadata, sitemap list, URL counts, suspicious groups, risk summary, summary stats and transformed errors.
 */
function buildAnalysisResult(summary, discoveryResult, parseResult, riskGroups, config, startTime) {
  const analysisMetadata = buildAnalysisMetadata(config.baseUrl || "unknown", startTime, summary);

  const suspiciousGroups = [];
  for (const group of riskGroups) {
    suspiciousGroups.push({
      category: group.category,
      severity: group.severity,
      count: group.count,
      // The category doubles as the pattern identifier.
      pattern: group.category,
      rationale: group.rationale,
      sampleUrls: group.sampleUrls.slice(0, 5),
      recommendedAction: group.recommendedAction
    });
  }

  const { severityBreakdown } = summary;
  let totalRiskyUrls = 0;
  for (const group of riskGroups) {
    totalRiskyUrls = totalRiskyUrls + group.count;
  }
  const summaryStats = {
    highSeverityCount: severityBreakdown.high,
    mediumSeverityCount: severityBreakdown.medium,
    lowSeverityCount: severityBreakdown.low,
    totalRiskyUrls,
    overallStatus: determineOverallStatus(severityBreakdown, parseResult.errors)
  };

  const riskSummary = {
    overview: summary.overview,
    keyFindings: summary.keyFindings,
    recommendations: summary.recommendations
  };

  return {
    analysisMetadata,
    sitemapsDiscovered: discoveryResult.sitemaps,
    totalUrlCount: parseResult.totalCount,
    urlsAnalyzed: parseResult.totalCount,
    suspiciousGroups,
    riskSummary,
    summary: summaryStats,
    errors: parseResult.errors.map((error) => transformError(error))
  };
}
|
|
1499
|
-
/**
 * Builds the metadata header of an analysis result.
 *
 * @param {string} baseUrl - Analysed site's base URL.
 * @param {number} startTime - Run start, in epoch milliseconds.
 * @param {{generatedBy: string}} summary - Summary whose `generatedBy` is reported as the analysis type.
 * @returns {{baseUrl: string, analysisTimestamp: string, toolVersion: string, executionTimeMs: number, analysisType: string}}
 */
function buildAnalysisMetadata(baseUrl, startTime, summary) {
  const analysisTimestamp = (/* @__PURE__ */ new Date()).toISOString();
  const executionTimeMs = Date.now() - startTime;
  const { generatedBy: analysisType } = summary;
  return {
    baseUrl,
    analysisTimestamp,
    toolVersion: TOOL_VERSION,
    executionTimeMs,
    analysisType
  };
}
|
|
1508
|
-
/**
 * Derives the report's overall status.
 *
 * @param {{high: number, medium: number, low: number}} severityBreakdown - URL totals per severity.
 * @param {Array} errors - Errors collected during the run.
 * @returns {"errors"|"issues_found"|"clean"} "errors" whenever any error was
 *   recorded, otherwise "issues_found" if any severity total is positive, else "clean".
 */
function determineOverallStatus(severityBreakdown, errors) {
  const hasErrors = errors.length > 0;
  if (hasErrors) {
    return "errors";
  }
  const { high, medium, low } = severityBreakdown;
  if (high + medium + low > 0) {
    return "issues_found";
  }
  return "clean";
}
|
|
1515
|
-
/**
 * Converts the internal analysis result into the snake_case JSON report shape.
 *
 * @param {object} result - Output of `buildAnalysisResult`.
 * @param {object} [performanceMetrics] - When truthy, added as `performance_metrics`.
 * @returns {object} The report object, ready for `JSON.stringify`.
 */
function transformToJsonOutput(result, performanceMetrics) {
  const report = {
    analysis_metadata: transformMetadata(result.analysisMetadata),
    sitemaps_discovered: result.sitemapsDiscovered,
    total_url_count: result.totalUrlCount,
    urls_analyzed: result.urlsAnalyzed,
    suspicious_groups: result.suspiciousGroups.map(transformGroup),
    risk_summary: transformRiskSummary(result.riskSummary),
    summary: transformSummary(result.summary),
    errors: result.errors
  };
  if (!performanceMetrics) {
    return report;
  }
  const { totalExecutionTimeMs, phaseTimings, throughput, resourceUsage } = performanceMetrics;
  report.performance_metrics = {
    total_execution_time_ms: totalExecutionTimeMs,
    phase_timings: phaseTimings,
    throughput,
    resource_usage: resourceUsage
  };
  return report;
}
|
|
1536
|
-
/**
 * Map analysis metadata from camelCase to the snake_case JSON shape.
 *
 * @param {object} meta - Metadata with baseUrl, analysisTimestamp, toolVersion,
 *   executionTimeMs and analysisType.
 * @returns {object} The same values under snake_case keys.
 */
function transformMetadata(meta) {
  const { baseUrl, analysisTimestamp, toolVersion, executionTimeMs, analysisType } = meta;
  return {
    base_url: baseUrl,
    analysis_timestamp: analysisTimestamp,
    tool_version: toolVersion,
    execution_time_ms: executionTimeMs,
    analysis_type: analysisType
  };
}
|
|
1545
|
-
/**
 * Map one suspicious-URL group to its snake_case JSON representation.
 *
 * @param {object} group - Group with category, severity, count, pattern,
 *   rationale, sampleUrls and recommendedAction.
 * @returns {object} The group under snake_case keys.
 */
function transformGroup(group) {
  const { category, severity, count, pattern, rationale, sampleUrls, recommendedAction } = group;
  return {
    category,
    severity,
    count,
    pattern,
    rationale,
    sample_urls: sampleUrls,
    recommended_action: recommendedAction
  };
}
|
|
1556
|
-
/**
 * Map a risk summary to its snake_case JSON representation.
 *
 * @param {{overview: *, keyFindings: *, recommendations: *}} summary
 * @returns {{overview: *, key_findings: *, recommendations: *}}
 */
function transformRiskSummary(summary) {
  const { overview, keyFindings, recommendations } = summary;
  return { overview, key_findings: keyFindings, recommendations };
}
|
|
1563
|
-
/**
 * Map summary statistics to their snake_case JSON representation.
 *
 * @param {object} summary - Stats with highSeverityCount, mediumSeverityCount,
 *   lowSeverityCount, totalRiskyUrls and overallStatus.
 * @returns {object} The same values under snake_case keys.
 */
function transformSummary(summary) {
  const { highSeverityCount, mediumSeverityCount, lowSeverityCount, totalRiskyUrls, overallStatus } = summary;
  return {
    high_severity_count: highSeverityCount,
    medium_severity_count: mediumSeverityCount,
    low_severity_count: lowSeverityCount,
    total_risky_urls: totalRiskyUrls,
    overall_status: overallStatus
  };
}
|
|
1572
|
-
/**
 * Convert an error object into a serialisable `{ code, message, context? }` detail.
 *
 * Errors without a `code` property are reported as `UNKNOWN_ERROR`. For coded
 * errors, the first matching set of extra properties (attemptedPaths, then
 * sitemapUrl + lineNumber, then url) is copied into `context`.
 *
 * @param {object} error - Error-like object exposing `message`.
 * @returns {{code: string, message: *, context?: object}}
 */
function transformError(error) {
  if (!("code" in error)) {
    return {
      code: "UNKNOWN_ERROR",
      message: error.message
    };
  }

  const detail = {
    code: error.code || "UNKNOWN_ERROR",
    message: error.message
  };

  let context;
  if ("attemptedPaths" in error) {
    context = { attempted_paths: error.attemptedPaths };
  } else if ("sitemapUrl" in error && "lineNumber" in error) {
    context = { sitemap_url: error.sitemapUrl, line_number: error.lineNumber };
  } else if ("url" in error) {
    context = { url: error.url };
  }
  if (context !== undefined) {
    detail.context = context;
  }
  return detail;
}
|
|
1600
|
-
|
|
1601
|
-
// src/reporters/html-reporter.ts
|
|
1602
|
-
var import_fs2 = require("fs");
|
|
1603
|
-
var TOOL_VERSION2 = "1.0.0-alpha.2";
|
|
1604
|
-
/**
 * Render the complete, self-contained HTML report for one analysis run.
 *
 * All values that originate outside the tool (the base URL, error messages and
 * discovered sitemap URLs) are passed through escapeHtml before being placed in
 * the markup, so remote sitemap content cannot inject HTML into the report.
 *
 * In the `.errors-section` rule the `border` shorthand is declared before
 * `border-left`; in the opposite order the shorthand resets the accent border.
 *
 * @param {object} summary - Risk summary; reads `categoryInsights` (each with
 *   `severity` and `count`) and `metadata.processingTime` (milliseconds).
 * @param {{sitemaps: string[]}} discoveryResult - Discovered sitemap URLs.
 * @param {number} totalUrls - Number of URLs analysed.
 * @param {{baseUrl: string}} config - Run configuration.
 * @param {Error[]} errors - Errors and warnings to list in the report.
 * @param {{maxUrlsPerGroup?: number}} [options] - Sample URLs per group (default 10).
 * @returns {string} The HTML document.
 */
function generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options = {}) {
  const maxUrls = options.maxUrlsPerGroup ?? 10;
  const generatedAt = new Date();
  const insights = summary.categoryInsights;
  const riskyUrlCount = insights.reduce((sum, g) => sum + g.count, 0);
  const bySeverity = (level) => insights.filter((g) => g.severity === level);
  const safeBaseUrl = escapeHtml(String(config.baseUrl));
  const sitemaps = discoveryResult.sitemaps;

  const errorsSection = errors.length > 0 ? `
    <div class="errors-section">
      <h3>Parsing Errors &amp; Warnings (${errors.length})</h3>
      <ul>
        ${errors.map((err) => `<li>${escapeHtml(String(err.message))}</li>`).join("\n        ")}
      </ul>
    </div>
` : "";

  const sitemapsSection = sitemaps.length > 0 ? `
    <div class="sitemaps">
      <h3 class="collapsed" onclick="toggleSection(this)">Sitemaps Discovered (${sitemaps.length})</h3>
      <div class="sitemaps-content collapsed">
        <ul>
          ${sitemaps.map((s) => `<li>\u2022 ${escapeHtml(String(s))}</li>`).join("\n          ")}
        </ul>
      </div>
    </div>
` : "";

  const cleanSection = riskyUrlCount === 0 ? `
    <div class="status-clean">
      <h2>No Issues Found</h2>
      <p>All URLs in the sitemap passed validation checks.</p>
    </div>
` : "";

  const html = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sitemap QA Report - ${safeBaseUrl}</title>
<style>
  * { margin: 0; padding: 0; box-sizing: border-box; }
  body {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
    line-height: 1.6;
    color: #1f2937;
    background: #ffffff;
    padding: 24px;
  }
  .container {
    max-width: 1400px;
    margin: 0 auto;
    background: white;
    box-shadow: 0 1px 3px rgba(0,0,0,0.05);
    border-radius: 12px;
    overflow: hidden;
    border: 1px solid #e5e7eb;
  }
  .header {
    background: #0f172a;
    color: white;
    padding: 48px 40px;
    border-bottom: 3px solid #3b82f6;
  }
  .header h1 {
    font-size: 1.875rem;
    font-weight: 700;
    margin-bottom: 12px;
    letter-spacing: -0.025em;
  }
  .header .meta {
    opacity: 0.75;
    font-size: 0.875rem;
    font-weight: 400;
  }
  .summary {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
    gap: 1px;
    background: #e5e7eb;
    border-bottom: 1px solid #e5e7eb;
  }
  .summary-card {
    background: white;
    padding: 28px 32px;
    text-align: center;
  }
  .summary-card .label {
    font-size: 0.75rem;
    color: #6b7280;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    font-weight: 600;
    margin-bottom: 8px;
  }
  .summary-card .value {
    font-size: 2.25rem;
    font-weight: 700;
    color: #0f172a;
    font-variant-numeric: tabular-nums;
  }
  .content { padding: 40px; }
  .status-clean {
    text-align: center;
    padding: 80px 32px;
    background: #f0fdf4;
    border-radius: 8px;
    border: 1px solid #86efac;
  }
  .status-clean h2 {
    font-size: 1.875rem;
    margin-bottom: 12px;
    color: #166534;
    font-weight: 700;
  }
  .status-clean p {
    font-size: 1rem;
    color: #65a30d;
  }
  .severity-section { margin-bottom: 32px; }
  .severity-section h2 {
    font-size: 1.125rem;
    font-weight: 600;
    padding: 16px 20px;
    margin-bottom: 16px;
    border-radius: 8px;
    display: flex;
    align-items: center;
    gap: 12px;
    cursor: pointer;
    user-select: none;
    transition: all 0.2s;
  }
  .severity-section h2:hover {
    opacity: 0.85;
    transform: translateY(-1px);
  }
  .severity-section h2::after {
    content: '\u25BC';
    margin-left: auto;
    font-size: 0.8em;
    transition: transform 0.3s ease;
    opacity: 0.7;
  }
  .severity-section h2.collapsed::after {
    transform: rotate(-90deg);
  }
  .severity-section h2.collapsed {
    margin-bottom: 0;
  }
  .severity-content {
    max-height: none;
    overflow: visible;
    transition: max-height 0.4s ease-out, opacity 0.3s ease-out;
    opacity: 1;
  }
  .severity-content.collapsed {
    max-height: 0;
    overflow: hidden;
    opacity: 0;
  }
  .severity-high { background: #fef2f2; color: #dc2626; border: 1px solid #fecaca; }
  .severity-medium { background: #fffbeb; color: #d97706; border: 1px solid #fde68a; }
  .severity-low { background: #eff6ff; color: #2563eb; border: 1px solid #dbeafe; }
  .risk-group {
    background: white;
    border: 1px solid #e5e7eb;
    border-radius: 8px;
    padding: 24px;
    margin-bottom: 16px;
  }
  .risk-group h3 {
    font-size: 1rem;
    margin-bottom: 12px;
    color: #0f172a;
    font-weight: 600;
  }
  .risk-group .count {
    display: inline-block;
    background: #3b82f6;
    color: white;
    padding: 2px 10px;
    border-radius: 9999px;
    font-size: 0.75rem;
    font-weight: 600;
    margin-left: 8px;
  }
  .risk-group .impact {
    color: #64748b;
    margin-bottom: 16px;
    font-size: 0.875rem;
    line-height: 1.6;
  }
  .risk-group .urls {
    background: #f8fafc;
    border: 1px solid #e2e8f0;
    border-radius: 6px;
    padding: 16px;
  }
  .risk-group .urls h4 {
    font-size: 0.75rem;
    color: #64748b;
    margin-bottom: 12px;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    font-weight: 600;
  }
  .risk-group .urls ul { list-style: none; }
  .risk-group .urls li {
    padding: 10px 12px;
    border-bottom: 1px solid #e2e8f0;
    font-family: 'SF Mono', 'Monaco', 'Cascadia Code', 'Consolas', monospace;
    font-size: 0.8125rem;
    color: #334155;
    background: white;
    margin-bottom: 4px;
    border-radius: 4px;
    word-break: break-all;
    line-height: 1.6;
  }
  .risk-group .urls li:last-child { border-bottom: none; margin-bottom: 0; }
  .risk-group .more {
    color: #3b82f6;
    font-style: italic;
    margin-top: 8px;
    font-size: 0.8125rem;
  }
  .download-btn {
    display: inline-block;
    background: #3b82f6;
    color: white;
    padding: 8px 16px;
    border-radius: 6px;
    text-decoration: none;
    font-size: 0.8125rem;
    font-weight: 500;
    margin-top: 12px;
    cursor: pointer;
    border: none;
    transition: all 0.15s;
  }
  .download-btn:hover {
    background: #2563eb;
    transform: translateY(-1px);
    box-shadow: 0 4px 6px -1px rgba(0,0,0,0.1);
  }
  .footer {
    background: #f8fafc;
    padding: 24px 40px;
    border-top: 1px solid #e5e7eb;
    text-align: center;
    color: #64748b;
    font-size: 0.8125rem;
  }
  .sitemaps {
    background: white;
    border: 1px solid #e5e7eb;
    border-radius: 8px;
    margin-bottom: 24px;
    overflow: hidden;
  }
  .sitemaps h3 {
    font-size: 1.125rem;
    font-weight: 600;
    padding: 16px 20px;
    margin: 0;
    color: #0f172a;
    background: #f8fafc;
    cursor: pointer;
    user-select: none;
    transition: all 0.15s;
    display: flex;
    align-items: center;
    gap: 10px;
  }
  .sitemaps h3:hover {
    background: #f1f5f9;
  }
  .sitemaps h3::after {
    content: '\u25BC';
    margin-left: auto;
    font-size: 0.8em;
    transition: transform 0.3s ease;
    opacity: 0.7;
  }
  .sitemaps h3.collapsed::after {
    transform: rotate(-90deg);
  }
  .sitemaps-content {
    max-height: none;
    overflow: visible;
    transition: max-height 0.4s ease-out, opacity 0.3s ease-out;
    opacity: 1;
    padding: 20px;
  }
  .sitemaps-content.collapsed {
    max-height: 0;
    overflow: hidden;
    opacity: 0;
    padding: 0 20px;
  }
  .sitemaps ul { list-style: none; }
  .sitemaps li {
    padding: 10px 12px;
    font-family: 'SF Mono', 'Monaco', 'Cascadia Code', 'Consolas', monospace;
    font-size: 0.8125rem;
    color: #475569;
    word-break: break-all;
    line-height: 1.6;
    background: #f8fafc;
    margin-bottom: 4px;
    border-radius: 4px;
  }
  .sitemaps li:last-child { margin-bottom: 0; }
  .errors-section {
    background: #fffbeb;
    border: 1px solid #fde68a;
    border-left: 4px solid #f59e0b;
    padding: 20px;
    margin-bottom: 24px;
    border-radius: 8px;
  }
  .errors-section h3 {
    color: #92400e;
    margin-bottom: 16px;
    font-size: 1.125rem;
    font-weight: 600;
    display: flex;
    align-items: center;
    gap: 8px;
  }
  .errors-section ul {
    list-style: none;
    padding: 0;
  }
  .errors-section li {
    padding: 12px;
    background: white;
    margin-bottom: 8px;
    border-radius: 6px;
    font-family: 'SF Mono', 'Monaco', 'Cascadia Code', 'Consolas', monospace;
    font-size: 0.8125rem;
    color: #78350f;
    word-break: break-all;
    line-height: 1.6;
    border: 1px solid #fde68a;
  }
  .errors-section li:last-child {
    margin-bottom: 0;
  }
</style>
</head>
<body>
<div class="container">
  <div class="header">
    <h1>Sitemap Analysis</h1>
    <div class="meta">
      <div>${safeBaseUrl}</div>
      <div>${generatedAt.toLocaleString()}</div>
    </div>
  </div>

  <div class="summary">
    <div class="summary-card">
      <div class="label">Sitemaps</div>
      <div class="value">${sitemaps.length}</div>
    </div>
    <div class="summary-card">
      <div class="label">URLs Analyzed</div>
      <div class="value">${totalUrls.toLocaleString()}</div>
    </div>
    <div class="summary-card">
      <div class="label">Issues Found</div>
      <div class="value" style="color: ${riskyUrlCount > 0 ? "#dc2626" : "#059669"}">${riskyUrlCount}</div>
    </div>
    <div class="summary-card">
      <div class="label">Scan Time</div>
      <div class="value">${(summary.metadata.processingTime / 1e3).toFixed(1)}s</div>
    </div>
  </div>

  <div class="content">
${errorsSection}
${sitemapsSection}
${cleanSection}
${renderSeveritySection("High", "severity-high", bySeverity("high"), maxUrls)}
${renderSeveritySection("Medium", "severity-medium", bySeverity("medium"), maxUrls)}
${renderSeveritySection("Low", "severity-low", bySeverity("low"), maxUrls)}
  </div>

  <div class="footer">
    Generated by <strong>sitemap-qa</strong> v${TOOL_VERSION2}
  </div>
</div>

<script>
function toggleSection(header) {
  header.classList.toggle('collapsed');
  const content = header.nextElementSibling;
  content.classList.toggle('collapsed');
}

function downloadUrls(categorySlug, encodedUrls) {
  // Decode HTML entities and parse JSON
  const textarea = document.createElement('textarea');
  textarea.innerHTML = encodedUrls;
  const urls = JSON.parse(textarea.value);

  // Create text content (one URL per line)
  const textContent = urls.join('\\n');

  // Create blob and download
  const blob = new Blob([textContent], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = categorySlug + '_urls.txt';
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);
}
</script>
</body>
</html>`;
  return html;
}

/**
 * Render one collapsible severity section, or an empty string when there are
 * no groups at that severity.
 *
 * @param {string} label - Human-readable severity ("High", "Medium", "Low").
 * @param {string} cssClass - Class applied to the section heading.
 * @param {Array<{count: number}>} groups - Risk groups at this severity.
 * @param {number} maxUrls - Sample URLs to show per group.
 * @returns {string} HTML fragment.
 */
function renderSeveritySection(label, cssClass, groups, maxUrls) {
  if (groups.length === 0) {
    return "";
  }
  const urlCount = groups.reduce((sum, g) => sum + g.count, 0);
  return `
    <div class="severity-section">
      <h2 class="${cssClass}" onclick="toggleSection(this)">${label} Severity (${urlCount} URLs)</h2>
      <div class="severity-content">
        ${groups.map((group) => renderRiskGroup(group, maxUrls)).join("\n        ")}
      </div>
    </div>
`;
}
|
|
2051
|
-
/**
 * Render the HTML card for a single risk group: title, summary, sample URLs
 * and a button that downloads every URL in the group.
 *
 * The download handler is an inline `onclick` attribute, so its arguments pass
 * through two parsers: the HTML attribute parser, then the JavaScript parser.
 * The arguments are therefore encoded as JS string literals with
 * JSON.stringify, and the whole expression is HTML-escaped for the attribute.
 * The URL payload itself stays entity-encoded JSON because the page's
 * downloadUrls() decodes entities before calling JSON.parse. Previously a URL
 * containing a quote or backslash could terminate or corrupt the JS string.
 *
 * @param {object} group - Reads `category`, `count`, `summary`, `examples`
 *   and `allUrls`.
 * @param {number} maxUrls - Maximum number of sample URLs to list.
 * @returns {string} HTML fragment.
 */
function renderRiskGroup(group, maxUrls) {
  const categoryTitle = group.category
    .split("_")
    .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
    .join(" ");
  const urlsToShow = group.examples.slice(0, maxUrls);
  const remaining = group.count - urlsToShow.length;
  const categorySlug = group.category.toLowerCase();

  // Entity-encoded JSON is what downloadUrls() expects as its second argument.
  const encodedUrls = escapeHtml(JSON.stringify(group.allUrls));
  // JSON.stringify yields valid JS string literals; escapeHtml makes the
  // resulting expression safe inside the double-quoted attribute.
  const onclickJs = `downloadUrls(${JSON.stringify(categorySlug)}, ${JSON.stringify(encodedUrls)})`;
  const onclickAttr = escapeHtml(onclickJs);

  return `<div class="risk-group">
  <h3>${escapeHtml(categoryTitle)} <span class="count">${group.count} URLs</span></h3>
  <div class="impact">${escapeHtml(String(group.summary))}</div>
  <div class="urls">
    <h4>Sample URLs</h4>
    <ul>
      ${urlsToShow.map((url) => `<li>${escapeHtml(url)}</li>`).join("\n      ")}
    </ul>
    ${remaining > 0 ? `<div class="more">... and ${remaining} more</div>` : ""}
    <button class="download-btn" onclick="${onclickAttr}">\u{1F4E5} Download All ${group.count} URLs</button>
  </div>
</div>`;
}
|
|
2071
|
-
/**
 * Escape the five HTML-significant characters so `text` can be placed safely
 * in element content or a quoted attribute value.
 *
 * `&` is replaced first so the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with `& < > " '` replaced by character references.
 */
function escapeHtml(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
2074
|
-
/**
 * Generate the HTML report and write it to `outputPath` as UTF-8.
 *
 * @param {object} summary - Risk summary forwarded to generateHtmlReport.
 * @param {object} discoveryResult - Discovery result forwarded to generateHtmlReport.
 * @param {number} totalUrls - Number of URLs analysed.
 * @param {object} config - Run configuration.
 * @param {string} outputPath - Destination file path.
 * @param {Error[]} errors - Errors to include in the report.
 * @param {object} [options] - Rendering options forwarded to generateHtmlReport.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function writeHtmlReport(summary, discoveryResult, totalUrls, config, outputPath, errors, options = {}) {
  const reportMarkup = generateHtmlReport(
    summary,
    discoveryResult,
    totalUrls,
    config,
    errors,
    options
  );
  await import_fs2.promises.writeFile(outputPath, reportMarkup, "utf-8");
}
|
|
2078
|
-
|
|
2079
|
-
// src/commands/analyze.ts
|
|
2080
|
-
/**
 * `analyze <url>` CLI command: validates the options, loads the configuration,
 * runs the analysis pipeline and writes a JSON or HTML report.
 *
 * The process exits with the code from determineExitCode on success and with
 * 2 when any step throws.
 *
 * Numeric flags are parsed base-10 explicitly; parseInt without a radix would
 * read a value such as "0x10" as hexadecimal.
 */
var analyzeCommand = new import_commander.Command("analyze")
  .description("Analyze sitemap for QA issues")
  .argument("<url>", "Base URL to analyze")
  .option("--timeout <seconds>", "HTTP timeout in seconds", "30")
  .option("--no-progress", "Disable progress bar")
  .option("--output <format>", "Output format: html or json", "html")
  .option("--output-dir <path>", "Output directory for reports")
  .option("--output-file <path>", "Custom output filename")
  .option("--accepted-patterns <patterns>", "Comma-separated regex patterns to exclude from risk detection")
  .option("--concurrency <number>", "Number of concurrent workers for risk detection")
  .option("--batch-size <number>", "URLs per batch for risk detection", "10000")
  .option("--parsing-concurrency <number>", "Number of concurrent sitemap parsers", "50")
  .option("--discovery-concurrency <number>", "Number of concurrent sitemap index fetches", "50")
  .option("--silent", "Disable all progress output")
  .option("--benchmark", "Save performance profile")
  .option("--no-color", "Disable ANSI color codes in CLI output")
  .option("--verbose", "Enable verbose logging", false)
  .action(async (url, options) => {
    // Absent flags stay undefined so the values passed to loadConfig are unset rather than NaN.
    const toInt = (value) => (value ? Number.parseInt(value, 10) : void 0);
    let config;
    try {
      validateAnalyzeOptions(options);
      config = await loadConfig({
        ...options,
        baseUrl: url,
        outputFormat: options.output,
        riskDetectionConcurrency: toInt(options.concurrency),
        riskDetectionBatchSize: toInt(options.batchSize),
        parsingConcurrency: toInt(options.parsingConcurrency),
        discoveryConcurrency: toInt(options.discoveryConcurrency),
        silent: options.silent,
        benchmark: options.benchmark,
        progressBar: options.progress
      });
      console.log(`\n\u{1F50D} Analyzing ${url}...\n`);
      const result = await runAnalysisPipeline(url, config);
      await import_fs3.promises.mkdir(config.outputDir, { recursive: true });
      if (options.output === "json") {
        const jsonReport = generateJsonReport(
          result.summary,
          result.discoveryResult,
          { totalCount: result.totalUrls, uniqueUrls: [], errors: [] },
          result.riskGroups,
          config,
          result.executionTime,
          { pretty: true, indent: 2 }
        );
        console.log("\n" + jsonReport);
        // JSON is always printed; it is only persisted when a filename was given.
        if (options.outputFile) {
          const jsonFilePath = `${config.outputDir}/${options.outputFile}`;
          await import_fs3.promises.writeFile(jsonFilePath, jsonReport, "utf-8");
          console.log(`\n\u{1F4C4} JSON report saved to: ${import_chalk.default.cyan(jsonFilePath)}`);
        }
      } else {
        showCliSummary(result);
        const htmlFileName = options.outputFile || `sitemap-qa-report-${Date.now()}.html`;
        const htmlFilePath = `${config.outputDir}/${htmlFileName}`;
        await writeHtmlReport(
          result.summary,
          result.discoveryResult,
          result.totalUrls,
          config,
          htmlFilePath,
          result.errors,
          { maxUrlsPerGroup: 10 }
        );
        console.log(`\n\u{1F4C4} Full report saved to: ${import_chalk.default.cyan(htmlFilePath)}`);
      }
      process.exit(determineExitCode(result));
    } catch (error) {
      handleAnalysisError(error, config);
      process.exit(2);
    }
  });
|
|
2142
|
-
/**
 * Validate the CLI options of the `analyze` command.
 *
 * The timeout is parsed base-10 explicitly: parseInt without a radix would
 * read "0x1e" as hexadecimal 30 and accept it.
 *
 * @param {{output: string, timeout: string}} options - Parsed CLI options.
 * @throws {Error} If the output format is not "json" or "html", or the timeout
 *   is not a positive number.
 */
function validateAnalyzeOptions(options) {
  const validFormats = ["json", "html"];
  if (!validFormats.includes(options.output)) {
    throw new Error(
      `Invalid output format: ${options.output}. Must be one of: ${validFormats.join(", ")}`
    );
  }
  const timeout = Number.parseInt(options.timeout, 10);
  if (Number.isNaN(timeout) || timeout <= 0) {
    throw new Error(`Invalid timeout: ${options.timeout}. Must be a positive number.`);
  }
}
|
|
2154
|
-
/**
 * Print a short summary of the analysis to stdout: a divider, then either a
 * "clean" message or the number of risky URLs with a per-severity breakdown,
 * followed by a blank line.
 *
 * @param {object} result - Pipeline result; reads `summary.categoryInsights`
 *   and, when there are findings, `summary.severityBreakdown`.
 */
function showCliSummary(result) {
  const chalk = import_chalk.default;
  let riskyUrlCount = 0;
  for (const insight of result.summary.categoryInsights) {
    riskyUrlCount += insight.count;
  }

  console.log(chalk.dim("\u2500".repeat(50)));
  if (riskyUrlCount !== 0) {
    const { high, medium, low } = result.summary.severityBreakdown;
    // Only severities with at least one finding are listed, in fixed order.
    const severityParts = [
      [high, "High", "red"],
      [medium, "Medium", "yellow"],
      [low, "Low", "blue"]
    ]
      .filter(([count]) => count > 0)
      .map(([count, label, color]) => chalk[color](`${label}: ${count}`));
    const severitySummary = severityParts.length > 0 ? ` (${severityParts.join(", ")})` : "";
    console.log(chalk.yellow(`\u26A0\uFE0F ${riskyUrlCount} risky URLs found${severitySummary}`));
  } else {
    console.log(chalk.green("No issues found - sitemap looks clean!"));
  }
  console.log("");
}
|
|
2170
|
-
/**
 * Run the full analysis for one site: discover sitemaps, extract URLs,
 * de-duplicate them, detect and group risks, and summarise the findings.
 *
 * Non-fatal problems (access-blocked sitemaps, extraction errors) are collected
 * in the returned `errors` array rather than thrown.
 *
 * The spinner and progress bar are stopped in `finally` blocks so a failing
 * phase does not leave them running, each phase's `endTime` and `duration`
 * come from one clock read, and the sitemaps/sec figure falls back to "0"
 * when the parsing phase took 0 ms (it previously printed "Infinity").
 *
 * @param {string} url - Base URL to analyse.
 * @param {object} config - Loaded configuration; reads `silent`, `progressBar`,
 *   `verbose` and `benchmark`, and is forwarded to each phase.
 * @returns {Promise<{discoveryResult: object, totalUrls: number, riskGroups: Array,
 *   summary: object, errors: Error[], executionTime: number, phaseTimings: Array}>}
 * @throws {Error} If no sitemaps are found or no URLs can be extracted.
 */
async function runAnalysisPipeline(url, config) {
  const overallStartTime = Date.now();
  const phaseTimings = [];
  const errors = [];
  const showProgress = !config.silent && config.progressBar !== false && process.stdout.isTTY;

  // Record a finished phase with a single clock read so endTime - startTime === duration.
  const recordPhase = (name, startTime) => {
    const endTime = Date.now();
    phaseTimings.push({ name, startTime, endTime, duration: endTime - startTime });
  };

  // Phase 1: discovery.
  const discoveryStart = Date.now();
  const discoverySpinner = showProgress
    ? (0, import_ora.default)({ text: "Discovering sitemaps...", color: "cyan" }).start()
    : null;
  let discoveryResult;
  try {
    discoveryResult = await discoverSitemaps(url, config);
  } finally {
    discoverySpinner?.stop();
  }
  recordPhase("Discovery", discoveryStart);

  if (discoveryResult.accessIssues.length > 0) {
    if (!config.silent) {
      console.warn(import_chalk.default.yellow(`\u26A0\uFE0F Warning: ${discoveryResult.accessIssues.length} sitemap(s) are access-blocked`));
    }
    for (const issue of discoveryResult.accessIssues) {
      errors.push(new Error(`Access blocked: ${issue.url} (${issue.statusCode})`));
    }
  }
  if (discoveryResult.sitemaps.length === 0) {
    throw new Error(`No sitemaps found at ${url}. Tried: /sitemap.xml, /sitemap_index.xml, /robots.txt`);
  }

  // Phase 2: parsing. A progress bar is only shown for larger sitemap sets.
  const parsingStart = Date.now();
  let extractionResult;
  if (showProgress && discoveryResult.sitemaps.length > 10) {
    const parseBar = new import_cli_progress.default.SingleBar({
      format: "{bar} {percentage}% | {value}/{total} | ETA: {eta}s | {speed} sitemaps/sec",
      barCompleteChar: "\u2588",
      barIncompleteChar: "\u2591",
      hideCursor: true
    });
    parseBar.start(discoveryResult.sitemaps.length, 0, { speed: "0" });
    try {
      extractionResult = await extractAllUrls(
        discoveryResult.sitemaps,
        config,
        (completed, total) => {
          const elapsed = (Date.now() - parsingStart) / 1e3;
          const speed = elapsed > 0 ? (completed / elapsed).toFixed(1) : "0";
          parseBar.update(completed, { speed });
        }
      );
    } finally {
      // Also restores the cursor hidden by hideCursor when extraction throws.
      parseBar.stop();
    }
  } else {
    extractionResult = await extractAllUrls(discoveryResult.sitemaps, config);
  }
  recordPhase("Parsing", parsingStart);

  // Extraction errors may be plain strings; normalise them to Error objects.
  for (const err of extractionResult.errors) {
    errors.push(typeof err === "string" ? new Error(err) : err);
  }
  if (extractionResult.allUrls.length === 0) {
    throw new Error("No URLs extracted from sitemaps");
  }

  // Phase 3: de-duplication.
  const dedupStart = Date.now();
  const consolidatedResult = consolidateUrls(extractionResult.allUrls);
  recordPhase("Deduplication", dedupStart);

  const extractedCount = extractionResult.allUrls.length;
  const uniqueCount = consolidatedResult.uniqueUrls.length;
  const duplicatesRemoved = extractedCount - uniqueCount;
  const duplicatePercentage = duplicatesRemoved / extractedCount * 100;
  if (!config.silent) {
    // The unique count is only mentioned when duplicates are significant.
    if (duplicatesRemoved > 100 || duplicatePercentage > 1) {
      console.log(import_chalk.default.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractedCount.toLocaleString()} URLs (${uniqueCount.toLocaleString()} unique)`));
    } else {
      console.log(import_chalk.default.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractedCount.toLocaleString()} URLs`));
    }
  }

  // Phase 4: risk detection and grouping.
  const riskStart = Date.now();
  const riskResult = await detectRisks(consolidatedResult.uniqueUrls, url, config);
  const riskGroups = groupRiskFindings(riskResult.findings);
  recordPhase("Risk Detection", riskStart);

  // Phase 5: summarisation. executionTime is measured before summarising
  // because it is passed to summarizeRisks as the processing time.
  const summaryStart = Date.now();
  const executionTime = Date.now() - overallStartTime;
  const summary = summarizeRisks({
    riskGroups: riskGroups.groups,
    totalUrls: uniqueCount,
    sitemapUrl: url,
    processingTime: executionTime
  });
  recordPhase("Summarization", summaryStart);

  if (!config.silent && config.verbose) {
    displayPhaseSummary(phaseTimings, executionTime);
  } else if (!config.silent) {
    const parsingPhase = phaseTimings.find((p) => p.name === "Parsing");
    const sitemapsPerSec = parsingPhase && parsingPhase.duration > 0
      ? (discoveryResult.sitemaps.length / (parsingPhase.duration / 1e3)).toFixed(1)
      : "0";
    console.log(import_chalk.default.green(`Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)\n`));
  }

  if (config.benchmark) {
    await saveBenchmark(phaseTimings, url, executionTime, discoveryResult.sitemaps.length, uniqueCount, config);
  }

  return {
    discoveryResult,
    totalUrls: uniqueCount,
    riskGroups: riskGroups.groups,
    summary,
    errors,
    executionTime,
    phaseTimings
  };
}
|
|
2300
|
-
/**
 * Translates an analysis result into a process exit code.
 *
 * @param {object} result - Analysis result; only
 *   `result.summary.severityBreakdown.high` is read.
 * @returns {number} 1 when the high-severity count is greater than zero,
 *   otherwise 0.
 */
function determineExitCode(result) {
  const { high } = result.summary.severityBreakdown;
  return high > 0 ? 1 : 0;
}
|
|
2307
|
-
/**
 * Prints a failed analysis to stderr.
 *
 * Always prints a failure banner. For `Error` instances it prints the
 * message, the stack trace when `config.verbose` is truthy and a stack is
 * present, and a list of suggestions chosen by substring match on the
 * message. Any other thrown value is printed via `String()`.
 *
 * @param {unknown} error - The value that was thrown.
 * @param {object} [config] - Optional configuration; only `verbose` is read.
 * @returns {void}
 */
function handleAnalysisError(error, config) {
  console.error("\n\u274C Analysis failed\n");

  // Non-Error throwables carry no message/stack, so just stringify them.
  if (!(error instanceof Error)) {
    console.error("Unknown error occurred");
    console.error(String(error));
    return;
  }

  const message = error.message;
  console.error(`Error: ${message}`);

  if (config?.verbose && error.stack) {
    console.error("\nStack trace:");
    console.error(error.stack);
  }

  // Pick the hint list for the first matching failure category (if any).
  let suggestions = [];
  if (message.includes("No sitemaps found")) {
    suggestions = [
      " \u2022 Verify the base URL is correct",
      " \u2022 Check if the site has a sitemap",
      " \u2022 Ensure the sitemap is publicly accessible"
    ];
  } else if (["Network", "timeout"].some((needle) => message.includes(needle))) {
    suggestions = [
      " \u2022 Check your internet connection",
      " \u2022 Verify the URL is accessible",
      " \u2022 Try increasing the timeout with --timeout option"
    ];
  }

  if (suggestions.length > 0) {
    console.error("\nSuggestions:");
    for (const hint of suggestions) {
      console.error(hint);
    }
  }
}
|
|
2331
|
-
/**
 * Prints the verbose end-of-run report: a total-time banner followed by one
 * line per phase with its duration in seconds and its share of the total.
 *
 * @param {Array<{name: string, duration: number}>} timings - Phase records;
 *   `duration` is in milliseconds.
 * @param {number} totalTime - Total run time in milliseconds.
 * @returns {void}
 */
function displayPhaseSummary(timings, totalTime) {
  const toSeconds = (ms) => (ms / 1e3).toFixed(1);
  const bullet = "\u2022";

  console.log(import_chalk.default.green(`\nAnalysis Complete (Total: ${toSeconds(totalTime)}s)\n`));
  console.log(import_chalk.default.cyan("Phase Breakdown:"));

  for (const { name, duration } of timings) {
    const seconds = toSeconds(duration);
    // A zero total would yield NaN/Infinity here; report such shares as 0
    // instead of printing "NaN%".
    const share = duration / totalTime * 100;
    const percentage = (Number.isFinite(share) ? share : 0).toFixed(1);
    console.log(` ${bullet} ${name.padEnd(15)}: ${seconds.padStart(5)}s (${percentage.padStart(5)}%)`);
  }

  console.log("");
}
|
|
2344
|
-
/**
 * Serializes a performance profile of one analysis run to a JSON file named
 * `performance-profile-<epoch-ms>.json` (relative path) and logs the name.
 *
 * @param {Array<{name: string, startTime: number, endTime: number, duration: number}>} timings
 *   Per-phase timing records.
 * @param {string} url - The URL that was analyzed.
 * @param {number} totalTime - Total run time in milliseconds.
 * @param {number} sitemapCount - Number of sitemaps processed.
 * @param {number} urlCount - Number of URLs analyzed.
 * @param {object} config - Run configuration; the four concurrency/batch
 *   settings are copied into the profile.
 * @returns {Promise<void>} Resolves once the file is written; rejects if the
 *   write fails.
 */
async function saveBenchmark(timings, url, totalTime, sitemapCount, urlCount, config) {
  // Items per second. A zero total time would produce NaN/Infinity, which
  // JSON.stringify turns into null (or "NaN"/"Infinity" after toFixed), so
  // non-finite rates are reported as 0.
  const perSecond = (count) => {
    const rate = count / totalTime * 1e3;
    return Number.isFinite(rate) ? rate : 0;
  };

  const phases = timings.map((t) => ({
    name: t.name.toLowerCase(),
    start_ms: t.startTime,
    end_ms: t.endTime,
    duration_ms: t.duration
  }));

  const benchmark = {
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    url,
    total_duration_ms: totalTime,
    phases,
    metrics: {
      sitemaps_processed: sitemapCount,
      urls_analyzed: urlCount,
      throughput: {
        urls_per_second: Math.round(perSecond(urlCount)),
        // Kept as a fixed-precision string for compatibility with existing profiles.
        sitemaps_per_second: perSecond(sitemapCount).toFixed(2)
      }
    },
    system_info: {
      cpu_count: import_os3.default.cpus().length,
      node_version: process.version,
      platform: process.platform,
      memory_total_mb: Math.round(import_os3.default.totalmem() / 1024 / 1024)
    },
    config: {
      discovery_concurrency: config.discoveryConcurrency,
      parsing_concurrency: config.parsingConcurrency,
      risk_detection_concurrency: config.riskDetectionConcurrency,
      risk_detection_batch_size: config.riskDetectionBatchSize
    }
  };

  const filename = `performance-profile-${Date.now()}.json`;
  await import_fs3.promises.writeFile(filename, JSON.stringify(benchmark, null, 2));
  console.log(import_chalk.default.blue(`\u{1F4CA} Benchmark saved to: ${filename}`));
}
|
|
2380
|
-
|
|
2381
|
-
// src/index.ts
|
|
2382
|
-
// CLI bootstrap: build the commander program, register the analyze command,
// install process-level handlers, then parse the command line.
var program = new import_commander2.Command();
program.name("sitemap-qa");
program.version("1.0.0");
program.description("sitemap analysis for QA teams");
program.addCommand(analyzeCommand);

// Any promise rejection nobody handled is reported and ends the process with
// a failure code.
process.on("unhandledRejection", (reason, promise) => {
  console.error("Unhandled Rejection at:", promise, "reason:", reason);
  process.exit(1);
});

// SIGINT and SIGTERM share the same handler body: announce and exit with 0.
for (const signal of ["SIGINT", "SIGTERM"]) {
  process.on(signal, () => {
    console.log("\nGracefully shutting down...");
    process.exit(0);
  });
}

program.parse();
|
|
2398
|
-
//# sourceMappingURL=index.cjs.map
|