@akotliar/sitemap-qa 1.0.0-alpha.3 → 1.0.0-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -104
- package/dist/index.js +841 -2248
- package/dist/index.js.map +1 -1
- package/package.json +9 -7
package/dist/index.js
CHANGED
|
@@ -1,2364 +1,957 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import "
|
|
5
|
-
import { Command as Command2 } from "commander";
|
|
4
|
+
import { Command as Command3 } from "commander";
|
|
6
5
|
|
|
7
6
|
// src/commands/analyze.ts
|
|
8
7
|
import { Command } from "commander";
|
|
9
|
-
import
|
|
10
|
-
import
|
|
11
|
-
import
|
|
12
|
-
import cliProgress from "cli-progress";
|
|
13
|
-
import os2 from "os";
|
|
14
|
-
|
|
15
|
-
// src/config/config-loader.ts
|
|
16
|
-
import { readFile } from "fs/promises";
|
|
17
|
-
import { existsSync } from "fs";
|
|
18
|
-
import { join } from "path";
|
|
19
|
-
import { homedir } from "os";
|
|
20
|
-
|
|
21
|
-
// src/types/config.ts
|
|
22
|
-
var DEFAULT_CONFIG = {
|
|
23
|
-
timeout: 30,
|
|
24
|
-
concurrency: 10,
|
|
25
|
-
parsingConcurrency: 50,
|
|
26
|
-
// Optimized for network-bound parallel parsing
|
|
27
|
-
discoveryConcurrency: 50,
|
|
28
|
-
// Optimized for recursive sitemap index discovery
|
|
29
|
-
outputFormat: "html",
|
|
30
|
-
outputDir: "./sitemap-qa/report",
|
|
31
|
-
verbose: false,
|
|
32
|
-
baseUrl: "https://example.com",
|
|
33
|
-
// Default for tests
|
|
34
|
-
acceptedPatterns: [],
|
|
35
|
-
riskDetectionBatchSize: 1e4,
|
|
36
|
-
riskDetectionConcurrency: void 0,
|
|
37
|
-
// Auto-detect in risk-detector.ts
|
|
38
|
-
progressBar: void 0,
|
|
39
|
-
// Auto-detect TTY
|
|
40
|
-
silent: false,
|
|
41
|
-
benchmark: false
|
|
42
|
-
};
|
|
43
|
-
|
|
44
|
-
// src/config/config-loader.ts
|
|
45
|
-
async function loadConfig(cliOptions) {
|
|
46
|
-
let config = { ...DEFAULT_CONFIG };
|
|
47
|
-
const globalConfigPath = join(homedir(), ".sitemap-qa", "config.json");
|
|
48
|
-
if (existsSync(globalConfigPath)) {
|
|
49
|
-
try {
|
|
50
|
-
const globalConfig = JSON.parse(await readFile(globalConfigPath, "utf-8"));
|
|
51
|
-
config = { ...config, ...globalConfig };
|
|
52
|
-
} catch (error) {
|
|
53
|
-
console.warn(`Warning: Failed to load global config: ${error}`);
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
const projectConfigPath = join(process.cwd(), ".sitemap-qa.config.json");
|
|
57
|
-
if (existsSync(projectConfigPath)) {
|
|
58
|
-
try {
|
|
59
|
-
const projectConfig = JSON.parse(await readFile(projectConfigPath, "utf-8"));
|
|
60
|
-
config = { ...config, ...projectConfig };
|
|
61
|
-
} catch (error) {
|
|
62
|
-
console.warn(`Warning: Failed to load project config: ${error}`);
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
const envConfig = loadFromEnv();
|
|
66
|
-
config = { ...config, ...envConfig };
|
|
67
|
-
config = mergeCliOptions(config, cliOptions);
|
|
68
|
-
if (cliOptions.baseUrl) {
|
|
69
|
-
config.baseUrl = cliOptions.baseUrl;
|
|
70
|
-
}
|
|
71
|
-
validateConfig(config);
|
|
72
|
-
return config;
|
|
73
|
-
}
|
|
74
|
-
function loadFromEnv() {
|
|
75
|
-
const env = {};
|
|
76
|
-
if (process.env.SITEMAP_VERIFY_TIMEOUT) {
|
|
77
|
-
env.timeout = parseInt(process.env.SITEMAP_VERIFY_TIMEOUT, 10);
|
|
78
|
-
}
|
|
79
|
-
return env;
|
|
80
|
-
}
|
|
81
|
-
function mergeCliOptions(config, cliOptions) {
|
|
82
|
-
const merged = { ...config };
|
|
83
|
-
if (cliOptions.timeout && cliOptions.timeout !== "30") {
|
|
84
|
-
merged.timeout = parseInt(cliOptions.timeout, 10);
|
|
85
|
-
}
|
|
86
|
-
if (cliOptions.output) {
|
|
87
|
-
merged.outputFormat = cliOptions.output;
|
|
88
|
-
}
|
|
89
|
-
if (cliOptions.outputDir) {
|
|
90
|
-
merged.outputDir = cliOptions.outputDir;
|
|
91
|
-
}
|
|
92
|
-
if (cliOptions.verbose === true) {
|
|
93
|
-
merged.verbose = true;
|
|
94
|
-
}
|
|
95
|
-
if (cliOptions.acceptedPatterns) {
|
|
96
|
-
merged.acceptedPatterns = cliOptions.acceptedPatterns.split(",").map((p) => p.trim()).filter(Boolean);
|
|
97
|
-
}
|
|
98
|
-
return merged;
|
|
99
|
-
}
|
|
100
|
-
function validateConfig(config) {
|
|
101
|
-
if (config.timeout < 1 || config.timeout > 300) {
|
|
102
|
-
throw new Error("Timeout must be between 1 and 300 seconds");
|
|
103
|
-
}
|
|
104
|
-
if (!["json", "html"].includes(config.outputFormat)) {
|
|
105
|
-
throw new Error("Output format must be json or html");
|
|
106
|
-
}
|
|
107
|
-
}
|
|
8
|
+
import chalk3 from "chalk";
|
|
9
|
+
import path2 from "path";
|
|
10
|
+
import fs4 from "fs/promises";
|
|
108
11
|
|
|
109
|
-
// src/
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
this.url = url;
|
|
114
|
-
this.originalError = originalError;
|
|
115
|
-
this.name = "NetworkError";
|
|
116
|
-
}
|
|
117
|
-
code = "NETWORK_ERROR";
|
|
118
|
-
};
|
|
119
|
-
var HttpError = class extends Error {
|
|
120
|
-
constructor(url, statusCode, statusText) {
|
|
121
|
-
let message = `HTTP ${statusCode} error for ${url}`;
|
|
122
|
-
if (statusCode === 403) {
|
|
123
|
-
message += "\n Note: 403 Forbidden often indicates bot protection (Cloudflare, etc.) or access restrictions";
|
|
124
|
-
}
|
|
125
|
-
super(message);
|
|
126
|
-
this.url = url;
|
|
127
|
-
this.statusCode = statusCode;
|
|
128
|
-
this.statusText = statusText;
|
|
129
|
-
this.name = "HttpError";
|
|
130
|
-
}
|
|
131
|
-
code = "HTTP_ERROR";
|
|
132
|
-
};
|
|
12
|
+
// src/config/loader.ts
|
|
13
|
+
import fs from "fs";
|
|
14
|
+
import path from "path";
|
|
15
|
+
import yaml from "js-yaml";
|
|
133
16
|
|
|
134
|
-
// src/
|
|
135
|
-
import {
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
maxSockets: 200,
|
|
142
|
-
// Allow many concurrent connections
|
|
143
|
-
maxFreeSockets: 50,
|
|
144
|
-
timeout: 15e3
|
|
17
|
+
// src/config/schema.ts
|
|
18
|
+
import { z } from "zod";
|
|
19
|
+
var PatternTypeSchema = z.enum(["literal", "glob", "regex"]);
|
|
20
|
+
var PatternSchema = z.object({
|
|
21
|
+
type: PatternTypeSchema,
|
|
22
|
+
value: z.string().min(1, "Pattern value cannot be empty"),
|
|
23
|
+
reason: z.string().min(1, "Reason is mandatory for each pattern")
|
|
145
24
|
});
|
|
146
|
-
var
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
maxFreeSockets: 50,
|
|
150
|
-
timeout: 15e3
|
|
25
|
+
var PolicySchema = z.object({
|
|
26
|
+
category: z.string().min(1, "Category name is mandatory"),
|
|
27
|
+
patterns: z.array(PatternSchema).min(1, "At least one pattern is required per category")
|
|
151
28
|
});
|
|
152
|
-
var
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
29
|
+
var ConfigSchema = z.object({
|
|
30
|
+
acceptable_patterns: z.array(PatternSchema).default([]),
|
|
31
|
+
policies: z.array(PolicySchema).default([]),
|
|
32
|
+
outDir: z.string().optional(),
|
|
33
|
+
outputFormat: z.enum(["json", "html", "all"]).default("all"),
|
|
34
|
+
enforceDomainConsistency: z.boolean().default(true)
|
|
158
35
|
});
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
36
|
+
|
|
37
|
+
// src/config/defaults.ts
|
|
38
|
+
var DEFAULT_POLICIES = {
|
|
39
|
+
acceptable_patterns: [],
|
|
40
|
+
outputFormat: "all",
|
|
41
|
+
enforceDomainConsistency: true,
|
|
42
|
+
policies: [
|
|
43
|
+
{
|
|
44
|
+
category: "Security & Admin",
|
|
45
|
+
patterns: [
|
|
46
|
+
{
|
|
47
|
+
type: "glob",
|
|
48
|
+
value: "**/admin/**",
|
|
49
|
+
reason: "Administrative interfaces should not be publicly indexed."
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
type: "glob",
|
|
53
|
+
value: "**/.env*",
|
|
54
|
+
reason: "Environment files contain sensitive secrets."
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
type: "literal",
|
|
58
|
+
value: "/wp-admin",
|
|
59
|
+
reason: "WordPress admin paths are common attack vectors."
|
|
60
|
+
}
|
|
61
|
+
]
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
category: "Environment Leakage",
|
|
65
|
+
patterns: [
|
|
66
|
+
{
|
|
67
|
+
type: "glob",
|
|
68
|
+
value: "**/staging.**",
|
|
69
|
+
reason: "Staging environments should be restricted."
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
type: "glob",
|
|
73
|
+
value: "**/dev.**",
|
|
74
|
+
reason: "Development subdomains detected in production sitemap."
|
|
75
|
+
}
|
|
76
|
+
]
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
category: "Sensitive Files",
|
|
80
|
+
patterns: [
|
|
81
|
+
{
|
|
82
|
+
type: "glob",
|
|
83
|
+
value: "**/*.{sql,bak,zip,tar.gz}",
|
|
84
|
+
reason: "Archive or database backup files exposed."
|
|
85
|
+
}
|
|
169
86
|
]
|
|
170
|
-
});
|
|
171
|
-
const context = await browser.newContext({
|
|
172
|
-
userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
173
|
-
viewport: { width: 1920, height: 1080 },
|
|
174
|
-
locale: "en-US",
|
|
175
|
-
timezoneId: "America/New_York",
|
|
176
|
-
extraHTTPHeaders: {
|
|
177
|
-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
178
|
-
"Accept-Language": "en-US,en;q=0.9",
|
|
179
|
-
"Accept-Encoding": "gzip, deflate, br",
|
|
180
|
-
"DNT": "1",
|
|
181
|
-
"Connection": "keep-alive",
|
|
182
|
-
"Upgrade-Insecure-Requests": "1"
|
|
183
|
-
}
|
|
184
|
-
});
|
|
185
|
-
const page = await context.newPage();
|
|
186
|
-
await page.addInitScript(() => {
|
|
187
|
-
Object.defineProperty(navigator, "webdriver", {
|
|
188
|
-
get: () => false
|
|
189
|
-
});
|
|
190
|
-
window.chrome = {
|
|
191
|
-
runtime: {}
|
|
192
|
-
};
|
|
193
|
-
const originalQuery = window.navigator.permissions.query;
|
|
194
|
-
window.navigator.permissions.query = (parameters) => parameters.name === "notifications" ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters);
|
|
195
|
-
});
|
|
196
|
-
page.setDefaultTimeout(timeout * 1e3);
|
|
197
|
-
const response = await page.goto(url, {
|
|
198
|
-
waitUntil: "domcontentloaded",
|
|
199
|
-
// Changed from networkidle - faster for simple XML
|
|
200
|
-
timeout: timeout * 1e3
|
|
201
|
-
});
|
|
202
|
-
if (!response) {
|
|
203
|
-
throw new Error("No response received from page");
|
|
204
|
-
}
|
|
205
|
-
const statusCode = response.status();
|
|
206
|
-
const content = await page.content();
|
|
207
|
-
const finalUrl = page.url();
|
|
208
|
-
await browser.close();
|
|
209
|
-
if (statusCode >= 200 && statusCode < 300) {
|
|
210
|
-
return {
|
|
211
|
-
content,
|
|
212
|
-
statusCode,
|
|
213
|
-
url: finalUrl
|
|
214
|
-
};
|
|
215
|
-
}
|
|
216
|
-
throw new HttpError(finalUrl, statusCode);
|
|
217
|
-
} catch (error) {
|
|
218
|
-
if (browser) {
|
|
219
|
-
await browser.close();
|
|
220
|
-
}
|
|
221
|
-
if (error.code === "HTTP_ERROR") {
|
|
222
|
-
throw error;
|
|
223
87
|
}
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
timeout: timeout * 1e3,
|
|
246
|
-
headers: {
|
|
247
|
-
"User-Agent": "sitemap-qa/1.0.0 (compatible; +https://github.com/Akotliar/sitemap-qa)",
|
|
248
|
-
"Accept": "text/xml,application/xml,text/plain,*/*",
|
|
249
|
-
"Accept-Encoding": "gzip, deflate",
|
|
250
|
-
"Connection": "keep-alive"
|
|
88
|
+
]
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
// src/config/loader.ts
|
|
92
|
+
import chalk from "chalk";
|
|
93
|
+
var ConfigLoader = class {
|
|
94
|
+
static DEFAULT_CONFIG_PATH = "sitemap-qa.yaml";
|
|
95
|
+
static load(configPath) {
|
|
96
|
+
const targetPath = configPath || path.join(process.cwd(), this.DEFAULT_CONFIG_PATH);
|
|
97
|
+
let userConfig = { policies: [] };
|
|
98
|
+
if (fs.existsSync(targetPath)) {
|
|
99
|
+
try {
|
|
100
|
+
const fileContent = fs.readFileSync(targetPath, "utf8");
|
|
101
|
+
const parsedYaml = yaml.load(fileContent);
|
|
102
|
+
const result = ConfigSchema.safeParse(parsedYaml);
|
|
103
|
+
if (!result.success) {
|
|
104
|
+
console.error(chalk.red("Configuration Validation Error:"));
|
|
105
|
+
result.error.issues.forEach((issue) => {
|
|
106
|
+
console.error(chalk.yellow(` - ${issue.path.join(".")}: ${issue.message}`));
|
|
107
|
+
});
|
|
108
|
+
process.exit(2);
|
|
251
109
|
}
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
return {
|
|
257
|
-
content: typeof body === "string" ? body : JSON.stringify(body),
|
|
258
|
-
statusCode,
|
|
259
|
-
url: response.request?.res?.responseUrl || url
|
|
260
|
-
// Final URL after redirects
|
|
261
|
-
};
|
|
262
|
-
}
|
|
263
|
-
if (statusCode === 403 && !attemptedBrowser && !disableBrowserFallback) {
|
|
264
|
-
attemptedBrowser = true;
|
|
265
|
-
continue;
|
|
266
|
-
}
|
|
267
|
-
if (!retryableStatuses.includes(statusCode)) {
|
|
268
|
-
throw new HttpError(response.request?.res?.responseUrl || url, statusCode);
|
|
110
|
+
userConfig = result.data;
|
|
111
|
+
} catch (error) {
|
|
112
|
+
console.error(chalk.red("Failed to load configuration:"), error);
|
|
113
|
+
process.exit(2);
|
|
269
114
|
}
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
115
|
+
} else if (configPath) {
|
|
116
|
+
console.error(chalk.red(`Error: Configuration file not found at ${targetPath}`));
|
|
117
|
+
process.exit(2);
|
|
118
|
+
}
|
|
119
|
+
return this.mergeConfigs(DEFAULT_POLICIES, userConfig);
|
|
120
|
+
}
|
|
121
|
+
static mergeConfigs(defaults, user) {
|
|
122
|
+
const mergedPolicies = [...defaults.policies];
|
|
123
|
+
user.policies.forEach((userPolicy) => {
|
|
124
|
+
const index = mergedPolicies.findIndex((p) => p.category === userPolicy.category);
|
|
125
|
+
if (index !== -1) {
|
|
126
|
+
mergedPolicies[index] = userPolicy;
|
|
278
127
|
} else {
|
|
279
|
-
|
|
128
|
+
mergedPolicies.push(userPolicy);
|
|
280
129
|
}
|
|
281
|
-
|
|
130
|
+
});
|
|
131
|
+
const merged = {
|
|
132
|
+
...defaults,
|
|
133
|
+
acceptable_patterns: [...defaults.acceptable_patterns || [], ...user.acceptable_patterns || []],
|
|
134
|
+
policies: mergedPolicies
|
|
135
|
+
};
|
|
136
|
+
if (user.outDir !== void 0) {
|
|
137
|
+
merged.outDir = user.outDir;
|
|
138
|
+
}
|
|
139
|
+
if (user.outputFormat !== void 0) {
|
|
140
|
+
merged.outputFormat = user.outputFormat;
|
|
282
141
|
}
|
|
283
|
-
if (
|
|
284
|
-
|
|
285
|
-
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
142
|
+
if (user.enforceDomainConsistency !== void 0) {
|
|
143
|
+
merged.enforceDomainConsistency = user.enforceDomainConsistency;
|
|
286
144
|
}
|
|
145
|
+
return merged;
|
|
287
146
|
}
|
|
288
|
-
|
|
289
|
-
}
|
|
147
|
+
};
|
|
290
148
|
|
|
291
149
|
// src/core/discovery.ts
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
150
|
+
import { fetch } from "undici";
|
|
151
|
+
import { XMLParser } from "fast-xml-parser";
|
|
152
|
+
var DiscoveryService = class {
|
|
153
|
+
parser;
|
|
154
|
+
visited = /* @__PURE__ */ new Set();
|
|
155
|
+
STANDARD_PATHS = [
|
|
296
156
|
"/sitemap.xml",
|
|
297
157
|
"/sitemap_index.xml",
|
|
298
|
-
"/sitemap-index.xml"
|
|
158
|
+
"/sitemap-index.xml",
|
|
159
|
+
"/sitemap.php",
|
|
160
|
+
"/sitemap.xml.gz"
|
|
299
161
|
];
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
const result = await fetchUrl(sitemapUrl, {
|
|
305
|
-
timeout: config.timeout,
|
|
306
|
-
maxRetries: 0
|
|
307
|
-
// Don't retry on standard paths - fail fast
|
|
308
|
-
});
|
|
309
|
-
if (result.statusCode === 200) {
|
|
310
|
-
if (config.verbose) {
|
|
311
|
-
console.log(`\u2713 Found sitemap at: ${sitemapUrl}`);
|
|
312
|
-
}
|
|
313
|
-
return { found: true, url: sitemapUrl };
|
|
314
|
-
}
|
|
315
|
-
return { found: false };
|
|
316
|
-
} catch (error) {
|
|
317
|
-
if (error instanceof HttpError) {
|
|
318
|
-
if (error.statusCode === 401 || error.statusCode === 403) {
|
|
319
|
-
accessIssues.push({
|
|
320
|
-
url: sitemapUrl,
|
|
321
|
-
statusCode: error.statusCode,
|
|
322
|
-
error: error.statusCode === 401 ? "Unauthorized" : "Access Denied"
|
|
323
|
-
});
|
|
324
|
-
if (config.verbose) {
|
|
325
|
-
console.log(`\u26A0 Access denied: ${sitemapUrl} (${error.statusCode})`);
|
|
326
|
-
}
|
|
327
|
-
} else if (config.verbose) {
|
|
328
|
-
console.log(`\u2717 Not found: ${sitemapUrl} (${error.statusCode})`);
|
|
329
|
-
}
|
|
330
|
-
} else if (config.verbose) {
|
|
331
|
-
console.log(`\u2717 Not found: ${sitemapUrl}`);
|
|
332
|
-
}
|
|
333
|
-
return { found: false };
|
|
334
|
-
}
|
|
335
|
-
})
|
|
336
|
-
);
|
|
337
|
-
for (const result of results) {
|
|
338
|
-
if (result.status === "fulfilled" && result.value.found) {
|
|
339
|
-
return { sitemaps: [result.value.url], issues: accessIssues };
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
if (config.verbose) {
|
|
343
|
-
console.log("No sitemap found at standard paths");
|
|
344
|
-
}
|
|
345
|
-
return { sitemaps: [], issues: accessIssues };
|
|
346
|
-
}
|
|
347
|
-
async function parseRobotsTxt(baseUrl, config) {
|
|
348
|
-
const robotsUrl = `${new URL(baseUrl).origin}/robots.txt`;
|
|
349
|
-
try {
|
|
350
|
-
const result = await fetchUrl(robotsUrl, {
|
|
351
|
-
timeout: config.timeout,
|
|
352
|
-
maxRetries: 1
|
|
162
|
+
constructor() {
|
|
163
|
+
this.parser = new XMLParser({
|
|
164
|
+
ignoreAttributes: false,
|
|
165
|
+
attributeNamePrefix: "@_"
|
|
353
166
|
});
|
|
354
|
-
const lines = result.content.split("\n");
|
|
355
|
-
const sitemaps = [];
|
|
356
|
-
for (const line of lines) {
|
|
357
|
-
const match = line.match(/^Sitemap:\s*(.+)$/i);
|
|
358
|
-
if (match) {
|
|
359
|
-
const sitemapUrl = match[1].trim();
|
|
360
|
-
try {
|
|
361
|
-
new URL(sitemapUrl);
|
|
362
|
-
sitemaps.push(sitemapUrl);
|
|
363
|
-
} catch {
|
|
364
|
-
if (config.verbose) {
|
|
365
|
-
console.warn(`Invalid sitemap URL in robots.txt: ${sitemapUrl}`);
|
|
366
|
-
}
|
|
367
|
-
}
|
|
368
|
-
}
|
|
369
|
-
}
|
|
370
|
-
if (config.verbose && sitemaps.length > 0) {
|
|
371
|
-
console.log(`Found ${sitemaps.length} sitemap(s) in robots.txt`);
|
|
372
|
-
}
|
|
373
|
-
return sitemaps;
|
|
374
|
-
} catch (error) {
|
|
375
|
-
if (config.verbose) {
|
|
376
|
-
console.log(`No robots.txt found at ${robotsUrl}`);
|
|
377
|
-
}
|
|
378
|
-
return [];
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
function isSitemapIndex(xmlContent) {
|
|
382
|
-
if (xmlContent.includes("<sitemapindex")) {
|
|
383
|
-
return true;
|
|
384
167
|
}
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
function extractSitemapIndexUrls(xmlContent) {
|
|
401
|
-
const urls = [];
|
|
402
|
-
if (xmlContent.includes("<sitemapindex")) {
|
|
403
|
-
const sitemapBlockRegex = /<sitemap[^>]*>(.*?)<\/sitemap>/gs;
|
|
404
|
-
let sitemapMatch;
|
|
405
|
-
while ((sitemapMatch = sitemapBlockRegex.exec(xmlContent)) !== null) {
|
|
406
|
-
const locMatch = /<loc>([^<]+)<\/loc>/i.exec(sitemapMatch[1]);
|
|
407
|
-
if (locMatch) {
|
|
408
|
-
const url = locMatch[1].trim();
|
|
409
|
-
try {
|
|
410
|
-
new URL(url);
|
|
411
|
-
urls.push(url);
|
|
412
|
-
} catch {
|
|
168
|
+
/**
|
|
169
|
+
* Attempts to find sitemaps for a given base website URL.
|
|
170
|
+
*/
|
|
171
|
+
async findSitemaps(baseUrl) {
|
|
172
|
+
const sitemaps = /* @__PURE__ */ new Set();
|
|
173
|
+
const url = new URL(baseUrl);
|
|
174
|
+
const origin = url.origin;
|
|
175
|
+
try {
|
|
176
|
+
const robotsUrl = `${origin}/robots.txt`;
|
|
177
|
+
const response = await fetch(robotsUrl);
|
|
178
|
+
if (response.status === 200) {
|
|
179
|
+
const text = await response.text();
|
|
180
|
+
const matches = text.matchAll(/^Sitemap:\s*(.+)$/gim);
|
|
181
|
+
for (const match of matches) {
|
|
182
|
+
if (match[1]) sitemaps.add(match[1].trim());
|
|
413
183
|
}
|
|
414
184
|
}
|
|
185
|
+
} catch (e) {
|
|
415
186
|
}
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
if (url.toLowerCase().includes("sitemap") || url.toLowerCase().endsWith(".xml")) {
|
|
424
|
-
try {
|
|
425
|
-
new URL(url);
|
|
426
|
-
urls.push(url);
|
|
427
|
-
} catch {
|
|
187
|
+
if (sitemaps.size === 0) {
|
|
188
|
+
for (const path4 of this.STANDARD_PATHS) {
|
|
189
|
+
try {
|
|
190
|
+
const sitemapUrl = `${origin}${path4}`;
|
|
191
|
+
const response = await fetch(sitemapUrl, { method: "HEAD" });
|
|
192
|
+
if (response.status === 200) {
|
|
193
|
+
sitemaps.add(sitemapUrl);
|
|
428
194
|
}
|
|
195
|
+
} catch (e) {
|
|
429
196
|
}
|
|
430
197
|
}
|
|
431
198
|
}
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
if (processed.has(sitemapUrl)) {
|
|
445
|
-
if (config.verbose) {
|
|
446
|
-
console.warn(`Skipping duplicate sitemap: ${sitemapUrl}`);
|
|
447
|
-
}
|
|
448
|
-
return { type: "skip" };
|
|
449
|
-
}
|
|
450
|
-
processed.add(sitemapUrl);
|
|
199
|
+
return Array.from(sitemaps);
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* Recursively discovers all leaf sitemaps from a root URL.
|
|
203
|
+
* Returns both the sitemap URL and its XML data to avoid duplicate fetches.
|
|
204
|
+
*/
|
|
205
|
+
async *discover(rootUrl) {
|
|
206
|
+
const queue = [rootUrl];
|
|
207
|
+
while (queue.length > 0) {
|
|
208
|
+
const currentUrl = queue.shift();
|
|
209
|
+
if (this.visited.has(currentUrl)) continue;
|
|
210
|
+
this.visited.add(currentUrl);
|
|
451
211
|
try {
|
|
452
|
-
const
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
if (
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
console.log(` \u2514\u2500 Contains ${childUrls.length} child sitemap(s)`);
|
|
463
|
-
}
|
|
464
|
-
return { type: "index", childUrls };
|
|
465
|
-
} else {
|
|
466
|
-
if (config.verbose) {
|
|
467
|
-
console.log(`\u2713 Discovered sitemap: ${sitemapUrl}`);
|
|
212
|
+
const response = await fetch(currentUrl);
|
|
213
|
+
if (response.status !== 200) continue;
|
|
214
|
+
const xmlData = await response.text();
|
|
215
|
+
const jsonObj = this.parser.parse(xmlData);
|
|
216
|
+
if (jsonObj.sitemapindex) {
|
|
217
|
+
const sitemaps = Array.isArray(jsonObj.sitemapindex.sitemap) ? jsonObj.sitemapindex.sitemap : [jsonObj.sitemapindex.sitemap];
|
|
218
|
+
for (const sitemap of sitemaps) {
|
|
219
|
+
if (sitemap?.loc) {
|
|
220
|
+
queue.push(sitemap.loc);
|
|
221
|
+
}
|
|
468
222
|
}
|
|
469
|
-
|
|
223
|
+
} else if (jsonObj.urlset) {
|
|
224
|
+
yield { url: currentUrl, xmlData };
|
|
470
225
|
}
|
|
471
226
|
} catch (error) {
|
|
472
|
-
|
|
473
|
-
if (config.verbose) {
|
|
474
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
475
|
-
console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
|
|
476
|
-
}
|
|
477
|
-
return { type: "failed" };
|
|
478
|
-
}
|
|
479
|
-
}));
|
|
480
|
-
for (const result of batchResults) {
|
|
481
|
-
if (result.type === "index") {
|
|
482
|
-
toProcess.push(...result.childUrls);
|
|
483
|
-
} else if (result.type === "sitemap") {
|
|
484
|
-
finalSitemaps.push(result.url);
|
|
227
|
+
console.error(`Failed to fetch or parse sitemap at ${currentUrl}:`, error);
|
|
485
228
|
}
|
|
486
229
|
}
|
|
487
|
-
if (processed.size > 1e3) {
|
|
488
|
-
console.warn(`\u26A0\uFE0F Processed over 1000 sitemap URLs. Stopping to prevent excessive requests.`);
|
|
489
|
-
break;
|
|
490
|
-
}
|
|
491
230
|
}
|
|
492
|
-
|
|
493
|
-
console.warn(`
|
|
494
|
-
\u26A0\uFE0F All ${inaccessible.size} sitemap(s) were inaccessible`);
|
|
495
|
-
console.warn(`Common causes: 403/404 errors, network issues, or bot protection`);
|
|
496
|
-
}
|
|
497
|
-
return finalSitemaps;
|
|
498
|
-
}
|
|
499
|
-
async function discoverSitemaps(baseUrl, config) {
|
|
500
|
-
const normalizedUrl = new URL(baseUrl).origin;
|
|
501
|
-
if (config.verbose) {
|
|
502
|
-
console.log("Checking robots.txt for sitemap directives...");
|
|
503
|
-
}
|
|
504
|
-
const robotsSitemaps = await parseRobotsTxt(normalizedUrl, config);
|
|
505
|
-
if (robotsSitemaps.length > 0) {
|
|
506
|
-
const sitemaps = await discoverAllSitemaps(robotsSitemaps, config);
|
|
507
|
-
return {
|
|
508
|
-
sitemaps,
|
|
509
|
-
source: "robots-txt",
|
|
510
|
-
accessIssues: []
|
|
511
|
-
};
|
|
512
|
-
}
|
|
513
|
-
if (config.verbose) {
|
|
514
|
-
console.log("Trying standard sitemap paths...");
|
|
515
|
-
}
|
|
516
|
-
const { sitemaps: standardSitemaps, issues } = await tryStandardPaths(normalizedUrl, config);
|
|
517
|
-
if (standardSitemaps.length > 0) {
|
|
518
|
-
const sitemaps = await discoverAllSitemaps(standardSitemaps, config);
|
|
519
|
-
if (sitemaps.length > 0) {
|
|
520
|
-
return {
|
|
521
|
-
sitemaps,
|
|
522
|
-
source: "standard-path",
|
|
523
|
-
accessIssues: []
|
|
524
|
-
};
|
|
525
|
-
}
|
|
526
|
-
return {
|
|
527
|
-
sitemaps: [],
|
|
528
|
-
source: "standard-path",
|
|
529
|
-
accessIssues: issues
|
|
530
|
-
};
|
|
531
|
-
}
|
|
532
|
-
return {
|
|
533
|
-
sitemaps: [],
|
|
534
|
-
source: "none",
|
|
535
|
-
accessIssues: issues
|
|
536
|
-
};
|
|
537
|
-
}
|
|
231
|
+
};
|
|
538
232
|
|
|
539
233
|
// src/core/parser.ts
|
|
540
|
-
import { XMLParser
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
continue;
|
|
234
|
+
import { XMLParser as XMLParser2 } from "fast-xml-parser";
|
|
235
|
+
import { fetch as fetch2 } from "undici";
|
|
236
|
+
var SitemapParser = class {
|
|
237
|
+
parser;
|
|
238
|
+
constructor() {
|
|
239
|
+
this.parser = new XMLParser2({
|
|
240
|
+
ignoreAttributes: false,
|
|
241
|
+
attributeNamePrefix: "@_"
|
|
242
|
+
});
|
|
243
|
+
}
|
|
244
|
+
/**
|
|
245
|
+
* Parses a leaf sitemap and yields SitemapUrl objects.
|
|
246
|
+
* Can accept either a URL to fetch or pre-fetched XML data with the source URL.
|
|
247
|
+
* Note: For true streaming of massive files, we'd use a SAX-like approach.
|
|
248
|
+
* fast-xml-parser's parse() is fast but loads the whole string.
|
|
249
|
+
* Given the 50k URL requirement, we'll use a more memory-efficient approach if needed,
|
|
250
|
+
* but let's start with a clean AsyncGenerator interface.
|
|
251
|
+
*/
|
|
252
|
+
async *parse(sitemapUrlOrData) {
|
|
253
|
+
let sitemapUrl = typeof sitemapUrlOrData === "string" ? sitemapUrlOrData : sitemapUrlOrData.url;
|
|
254
|
+
try {
|
|
255
|
+
let xmlData;
|
|
256
|
+
if (typeof sitemapUrlOrData === "string") {
|
|
257
|
+
const response = await fetch2(sitemapUrl);
|
|
258
|
+
xmlData = await response.text();
|
|
259
|
+
} else {
|
|
260
|
+
xmlData = sitemapUrlOrData.xmlData;
|
|
568
261
|
}
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
262
|
+
const jsonObj = this.parser.parse(xmlData);
|
|
263
|
+
if (jsonObj.urlset && jsonObj.urlset.url) {
|
|
264
|
+
const urls = Array.isArray(jsonObj.urlset.url) ? jsonObj.urlset.url : [jsonObj.urlset.url];
|
|
265
|
+
for (const url of urls) {
|
|
266
|
+
if (url.loc) {
|
|
267
|
+
yield {
|
|
268
|
+
loc: url.loc,
|
|
269
|
+
source: sitemapUrl,
|
|
270
|
+
lastmod: url.lastmod,
|
|
271
|
+
changefreq: url.changefreq,
|
|
272
|
+
priority: url.priority,
|
|
273
|
+
risks: []
|
|
274
|
+
};
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
} catch (error) {
|
|
279
|
+
console.error(`Failed to parse sitemap at ${sitemapUrl}:`, error);
|
|
576
280
|
}
|
|
577
281
|
}
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
282
|
+
};
|
|
283
|
+
|
|
284
|
+
// src/core/extractor.ts
|
|
285
|
+
var ExtractorService = class {
|
|
286
|
+
discovery;
|
|
287
|
+
parser;
|
|
288
|
+
seenUrls = /* @__PURE__ */ new Set();
|
|
289
|
+
discoveredSitemaps = /* @__PURE__ */ new Set();
|
|
290
|
+
constructor() {
|
|
291
|
+
this.discovery = new DiscoveryService();
|
|
292
|
+
this.parser = new SitemapParser();
|
|
293
|
+
}
|
|
294
|
+
/**
|
|
295
|
+
* Returns the list of sitemaps discovered during the extraction process.
|
|
296
|
+
*/
|
|
297
|
+
getDiscoveredSitemaps() {
|
|
298
|
+
return Array.from(this.discoveredSitemaps);
|
|
299
|
+
}
|
|
300
|
+
/**
|
|
301
|
+
* Normalizes a URL by removing trailing slashes and converting to lowercase.
|
|
302
|
+
*/
|
|
303
|
+
normalizeUrl(url) {
|
|
304
|
+
try {
|
|
305
|
+
const parsed = new URL(url);
|
|
306
|
+
let normalized = parsed.origin + parsed.pathname.replace(/\/$/, "");
|
|
307
|
+
if (parsed.search) normalized += parsed.search;
|
|
308
|
+
return normalized.toLowerCase();
|
|
309
|
+
} catch {
|
|
310
|
+
return url.toLowerCase().replace(/\/$/, "");
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
/**
|
|
314
|
+
* Extracts all unique URLs from a root sitemap URL or website base URL.
|
|
315
|
+
*/
|
|
316
|
+
async *extract(inputUrl) {
|
|
317
|
+
let startUrls = [inputUrl];
|
|
318
|
+
if (!inputUrl.endsWith(".xml") && !inputUrl.endsWith(".gz")) {
|
|
319
|
+
const discovered = await this.discovery.findSitemaps(inputUrl);
|
|
320
|
+
if (discovered.length > 0) {
|
|
321
|
+
console.log(`\u2705 Discovered ${discovered.length} sitemap(s): ${discovered.join(", ")}`);
|
|
322
|
+
startUrls = discovered;
|
|
323
|
+
} else {
|
|
324
|
+
console.log(`\u26A0\uFE0F No sitemaps discovered via robots.txt or standard paths. Proceeding with input URL.`);
|
|
325
|
+
}
|
|
594
326
|
}
|
|
595
|
-
const
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
errors.push(
|
|
604
|
-
`Invalid priority ${entry.priority} for ${entry.loc} - clamping to 0-1`
|
|
605
|
-
);
|
|
606
|
-
entry.priority = Math.max(0, Math.min(1, entry.priority));
|
|
607
|
-
}
|
|
608
|
-
}
|
|
609
|
-
if (entry.changefreq) {
|
|
610
|
-
if (!VALID_CHANGEFREQ.has(entry.changefreq.toLowerCase())) {
|
|
611
|
-
errors.push(
|
|
612
|
-
`Invalid changefreq "${entry.changefreq}" for ${entry.loc}`
|
|
613
|
-
);
|
|
614
|
-
entry.changefreq = void 0;
|
|
327
|
+
for (const startUrl of startUrls) {
|
|
328
|
+
for await (const discovered of this.discovery.discover(startUrl)) {
|
|
329
|
+
this.discoveredSitemaps.add(discovered.url);
|
|
330
|
+
for await (const urlObj of this.parser.parse(discovered)) {
|
|
331
|
+
const normalized = this.normalizeUrl(urlObj.loc);
|
|
332
|
+
if (!this.seenUrls.has(normalized)) {
|
|
333
|
+
this.seenUrls.add(normalized);
|
|
334
|
+
yield urlObj;
|
|
615
335
|
}
|
|
616
336
|
}
|
|
617
|
-
validUrls.push(entry);
|
|
618
|
-
} catch (urlError) {
|
|
619
|
-
errors.push(`Invalid URL format: ${entry.loc}`);
|
|
620
337
|
}
|
|
621
338
|
}
|
|
622
|
-
return {
|
|
623
|
-
urls: validUrls,
|
|
624
|
-
errors,
|
|
625
|
-
totalCount: validUrls.length,
|
|
626
|
-
sitemapUrl
|
|
627
|
-
};
|
|
628
|
-
} catch (parseError) {
|
|
629
|
-
const errorMsg = parseError instanceof Error ? parseError.message : String(parseError);
|
|
630
|
-
return {
|
|
631
|
-
urls: [],
|
|
632
|
-
errors: [
|
|
633
|
-
`[${sitemapUrl}] XML parsing failed: ${errorMsg}`
|
|
634
|
-
],
|
|
635
|
-
totalCount: 0,
|
|
636
|
-
sitemapUrl
|
|
637
|
-
};
|
|
638
339
|
}
|
|
639
|
-
}
|
|
340
|
+
};
|
|
640
341
|
|
|
641
|
-
// src/
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
async function processInBatches(items, concurrency, processor, onProgress) {
|
|
650
|
-
const results = new Array(items.length);
|
|
651
|
-
let completed = 0;
|
|
652
|
-
let currentIndex = 0;
|
|
653
|
-
const errors = [];
|
|
654
|
-
const workers = Array(Math.min(concurrency, items.length)).fill(null).map(async () => {
|
|
655
|
-
while (currentIndex < items.length) {
|
|
656
|
-
const index = currentIndex++;
|
|
657
|
-
const item = items[index];
|
|
342
|
+
// src/core/matcher.ts
|
|
343
|
+
import micromatch from "micromatch";
|
|
344
|
+
var MatcherService = class {
|
|
345
|
+
config;
|
|
346
|
+
rootDomain;
|
|
347
|
+
constructor(config, rootUrl) {
|
|
348
|
+
this.config = config;
|
|
349
|
+
if (rootUrl) {
|
|
658
350
|
try {
|
|
659
|
-
|
|
660
|
-
} catch
|
|
661
|
-
errors.push({ index, error });
|
|
662
|
-
results[index] = null;
|
|
663
|
-
}
|
|
664
|
-
completed++;
|
|
665
|
-
if (onProgress) {
|
|
666
|
-
onProgress(completed, items.length);
|
|
351
|
+
this.rootDomain = new URL(rootUrl).hostname.replace(/^www\./, "");
|
|
352
|
+
} catch {
|
|
667
353
|
}
|
|
668
354
|
}
|
|
669
|
-
});
|
|
670
|
-
await Promise.all(workers);
|
|
671
|
-
if (errors.length > 0) {
|
|
672
|
-
console.warn(`Processed ${items.length} items with ${errors.length} errors`);
|
|
673
|
-
}
|
|
674
|
-
return results;
|
|
675
|
-
}
|
|
676
|
-
|
|
677
|
-
// src/core/extractor.ts
|
|
678
|
-
async function extractAllUrls(sitemapUrls, config, onProgress) {
|
|
679
|
-
const allUrls = [];
|
|
680
|
-
const allErrors = [];
|
|
681
|
-
let sitemapsProcessed = 0;
|
|
682
|
-
let sitemapsFailed = 0;
|
|
683
|
-
if (config.verbose) {
|
|
684
|
-
console.log(`
|
|
685
|
-
Extracting URLs from ${sitemapUrls.length} sitemap(s)...`);
|
|
686
|
-
}
|
|
687
|
-
const CONCURRENCY = config.parsingConcurrency || 50;
|
|
688
|
-
if (!config.silent && config.verbose) {
|
|
689
|
-
console.log(`Using parsing concurrency: ${CONCURRENCY}`);
|
|
690
355
|
}
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
356
|
+
/**
|
|
357
|
+
* Matches a URL against all policies and returns detected risks.
|
|
358
|
+
*/
|
|
359
|
+
match(urlObj) {
|
|
360
|
+
const risks = [];
|
|
361
|
+
if (this.config.enforceDomainConsistency && this.rootDomain) {
|
|
695
362
|
try {
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
disableBrowserFallback: true
|
|
705
|
-
// Don't use browser for bulk parsing
|
|
706
|
-
});
|
|
707
|
-
const parseResult = await parseSitemap(response.content, sitemapUrl);
|
|
708
|
-
const extractedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
709
|
-
parseResult.urls.forEach((url) => {
|
|
710
|
-
url.extractedAt = extractedAt;
|
|
711
|
-
});
|
|
712
|
-
if (config.verbose) {
|
|
713
|
-
console.log(` \u2713 Extracted ${parseResult.urls.length} URLs from ${sitemapUrl}`);
|
|
714
|
-
}
|
|
715
|
-
return {
|
|
716
|
-
success: true,
|
|
717
|
-
urls: parseResult.urls,
|
|
718
|
-
errors: parseResult.errors
|
|
719
|
-
};
|
|
720
|
-
} catch (error) {
|
|
721
|
-
const errorMsg = `Failed to process ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`;
|
|
722
|
-
if (config.verbose) {
|
|
723
|
-
console.error(` \u2717 ${errorMsg}`);
|
|
363
|
+
const currentDomain = new URL(urlObj.loc).hostname.replace(/^www\./, "");
|
|
364
|
+
if (currentDomain !== this.rootDomain) {
|
|
365
|
+
risks.push({
|
|
366
|
+
category: "Domain Consistency",
|
|
367
|
+
pattern: this.rootDomain,
|
|
368
|
+
type: "literal",
|
|
369
|
+
reason: `URL domain mismatch: expected ${this.rootDomain} (or www.${this.rootDomain}), but found ${currentDomain}.`
|
|
370
|
+
});
|
|
724
371
|
}
|
|
725
|
-
|
|
726
|
-
success: false,
|
|
727
|
-
urls: [],
|
|
728
|
-
errors: [errorMsg]
|
|
729
|
-
};
|
|
372
|
+
} catch {
|
|
730
373
|
}
|
|
731
|
-
},
|
|
732
|
-
onProgress
|
|
733
|
-
// Pass progress callback to batch processor
|
|
734
|
-
);
|
|
735
|
-
for (const result of results) {
|
|
736
|
-
if (result.success) {
|
|
737
|
-
sitemapsProcessed++;
|
|
738
|
-
allUrls.push(...result.urls);
|
|
739
|
-
} else {
|
|
740
|
-
sitemapsFailed++;
|
|
741
374
|
}
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
console.log(` - Sitemaps failed: ${sitemapsFailed}`);
|
|
749
|
-
console.log(` - Total URLs: ${allUrls.length}`);
|
|
750
|
-
console.log(` - Errors: ${allErrors.length}`);
|
|
751
|
-
}
|
|
752
|
-
return {
|
|
753
|
-
allUrls,
|
|
754
|
-
sitemapsProcessed,
|
|
755
|
-
sitemapsFailed,
|
|
756
|
-
totalUrls: allUrls.length,
|
|
757
|
-
errors: allErrors
|
|
758
|
-
};
|
|
759
|
-
}
|
|
760
|
-
|
|
761
|
-
// src/core/consolidator.ts
|
|
762
|
-
function normalizeUrl(url) {
|
|
763
|
-
try {
|
|
764
|
-
const parsed = new URL(url);
|
|
765
|
-
let pathname = parsed.pathname;
|
|
766
|
-
if (pathname.endsWith("/") && pathname !== "/") {
|
|
767
|
-
pathname = pathname.slice(0, -1);
|
|
375
|
+
for (const pattern of this.config.acceptable_patterns) {
|
|
376
|
+
if (this.isMatch(urlObj.loc, pattern)) {
|
|
377
|
+
urlObj.ignored = true;
|
|
378
|
+
urlObj.ignoredBy = pattern.reason;
|
|
379
|
+
return risks;
|
|
380
|
+
}
|
|
768
381
|
}
|
|
769
|
-
const
|
|
770
|
-
(
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
}
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
const merged = { ...entries[0] };
|
|
781
|
-
const sources = entries.map((e) => e.source);
|
|
782
|
-
merged.source = sources.join(", ");
|
|
783
|
-
const lastmods = entries.map((e) => e.lastmod).filter((lm) => !!lm).map((lm) => new Date(lm).getTime()).sort((a, b) => b - a);
|
|
784
|
-
if (lastmods.length > 0) {
|
|
785
|
-
merged.lastmod = new Date(lastmods[0]).toISOString();
|
|
786
|
-
}
|
|
787
|
-
const priorities = entries.map((e) => e.priority).filter((p) => p !== void 0);
|
|
788
|
-
if (priorities.length > 0) {
|
|
789
|
-
merged.priority = Math.max(...priorities);
|
|
790
|
-
}
|
|
791
|
-
const changefreqs = entries.map((e) => e.changefreq).filter((cf) => !!cf);
|
|
792
|
-
if (changefreqs.length > 0) {
|
|
793
|
-
const counts = /* @__PURE__ */ new Map();
|
|
794
|
-
for (const cf of changefreqs) {
|
|
795
|
-
counts.set(cf, (counts.get(cf) || 0) + 1);
|
|
382
|
+
for (const policy of this.config.policies) {
|
|
383
|
+
for (const pattern of policy.patterns) {
|
|
384
|
+
if (this.isMatch(urlObj.loc, pattern)) {
|
|
385
|
+
risks.push({
|
|
386
|
+
category: policy.category,
|
|
387
|
+
pattern: pattern.value,
|
|
388
|
+
type: pattern.type,
|
|
389
|
+
reason: pattern.reason
|
|
390
|
+
});
|
|
391
|
+
}
|
|
392
|
+
}
|
|
796
393
|
}
|
|
797
|
-
|
|
798
|
-
merged.changefreq = sorted[0][0];
|
|
799
|
-
}
|
|
800
|
-
const extractedAts = entries.map((e) => e.extractedAt).filter((ea) => !!ea).map((ea) => new Date(ea).getTime()).sort((a, b) => b - a);
|
|
801
|
-
if (extractedAts.length > 0) {
|
|
802
|
-
merged.extractedAt = new Date(extractedAts[0]).toISOString();
|
|
394
|
+
return risks;
|
|
803
395
|
}
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
396
|
+
isMatch(url, pattern) {
|
|
397
|
+
switch (pattern.type) {
|
|
398
|
+
case "literal":
|
|
399
|
+
return url.includes(pattern.value);
|
|
400
|
+
case "glob":
|
|
401
|
+
return micromatch.isMatch(url, pattern.value, { contains: true });
|
|
402
|
+
case "regex":
|
|
403
|
+
try {
|
|
404
|
+
const regex = new RegExp(pattern.value, "i");
|
|
405
|
+
return regex.test(url);
|
|
406
|
+
} catch {
|
|
407
|
+
return false;
|
|
408
|
+
}
|
|
409
|
+
default:
|
|
410
|
+
return false;
|
|
817
411
|
}
|
|
818
|
-
urlMap.get(normalized).push(entry);
|
|
819
412
|
}
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
413
|
+
};
|
|
414
|
+
|
|
415
|
+
// src/reporters/console-reporter.ts
|
|
416
|
+
import chalk2 from "chalk";
|
|
417
|
+
var ConsoleReporter = class {
|
|
418
|
+
async generate(data) {
|
|
419
|
+
console.log("\n" + chalk2.bold.blue("=== sitemap-qa Analysis Summary ==="));
|
|
420
|
+
console.log(`Total URLs Scanned: ${data.totalUrls}`);
|
|
421
|
+
console.log(`Total Risks Found: ${data.totalRisks > 0 ? chalk2.red(data.totalRisks) : chalk2.green(0)}`);
|
|
422
|
+
console.log(`URLs with Risks: ${data.urlsWithRisks.length}`);
|
|
423
|
+
console.log(`URLs Ignored: ${data.ignoredUrls.length > 0 ? chalk2.yellow(data.ignoredUrls.length) : 0}`);
|
|
424
|
+
console.log(`Duration: ${((data.endTime.getTime() - data.startTime.getTime()) / 1e3).toFixed(2)}s`);
|
|
425
|
+
if (data.urlsWithRisks.length > 0) {
|
|
426
|
+
console.log("\n" + chalk2.bold.yellow("Top Findings:"));
|
|
427
|
+
data.urlsWithRisks.slice(0, 10).forEach((url) => {
|
|
428
|
+
console.log(`
|
|
429
|
+
${chalk2.cyan(url.loc)}`);
|
|
430
|
+
url.risks.forEach((risk) => {
|
|
431
|
+
console.log(` - [${chalk2.red(risk.category)}] ${risk.reason} (${chalk2.gray(risk.pattern)})`);
|
|
432
|
+
});
|
|
830
433
|
});
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
console.log(`Consolidation complete:`);
|
|
835
|
-
console.log(` - Input URLs: ${totalInputUrls}`);
|
|
836
|
-
console.log(` - Unique URLs: ${uniqueUrls.length}`);
|
|
837
|
-
console.log(` - Duplicates removed: ${totalInputUrls - uniqueUrls.length}`);
|
|
838
|
-
if (duplicateGroups.length > 0) {
|
|
839
|
-
console.log(`
|
|
840
|
-
Top duplicates:`);
|
|
841
|
-
const top5 = duplicateGroups.sort((a, b) => b.count - a.count).slice(0, 5);
|
|
842
|
-
for (const group of top5) {
|
|
843
|
-
console.log(` - ${group.url} (${group.count} times)`);
|
|
434
|
+
if (data.urlsWithRisks.length > 10) {
|
|
435
|
+
console.log(`
|
|
436
|
+
... and ${data.urlsWithRisks.length - 10} more. See JSON/HTML report for full details.`);
|
|
844
437
|
}
|
|
845
438
|
}
|
|
439
|
+
console.log("\n" + chalk2.bold.blue("==================================="));
|
|
846
440
|
}
|
|
847
|
-
|
|
848
|
-
uniqueUrls,
|
|
849
|
-
totalInputUrls,
|
|
850
|
-
duplicatesRemoved: totalInputUrls - uniqueUrls.length,
|
|
851
|
-
duplicateGroups
|
|
852
|
-
};
|
|
853
|
-
}
|
|
854
|
-
|
|
855
|
-
// src/core/patterns/risk-patterns.ts
|
|
856
|
-
var RISK_PATTERNS = [
|
|
857
|
-
// Sensitive Parameter Patterns (HIGH)
|
|
858
|
-
{
|
|
859
|
-
name: "Authentication Parameter",
|
|
860
|
-
category: "sensitive_params",
|
|
861
|
-
severity: "high",
|
|
862
|
-
regex: /[?&](token|auth|key|password|secret|apikey|session|credentials)=/i,
|
|
863
|
-
description: "Query parameter may contain sensitive authentication data"
|
|
864
|
-
},
|
|
865
|
-
{
|
|
866
|
-
name: "Debug Parameter",
|
|
867
|
-
category: "sensitive_params",
|
|
868
|
-
severity: "medium",
|
|
869
|
-
regex: /[?&](debug|trace|verbose|test_mode)=/i,
|
|
870
|
-
description: "Query parameter may contain debug or diagnostic flag"
|
|
871
|
-
},
|
|
872
|
-
// Protocol Inconsistency Patterns (MEDIUM)
|
|
873
|
-
{
|
|
874
|
-
name: "HTTP in HTTPS Site",
|
|
875
|
-
category: "protocol_inconsistency",
|
|
876
|
-
severity: "medium",
|
|
877
|
-
regex: /^http:\/\//,
|
|
878
|
-
description: "HTTP URL in HTTPS sitemap (potential mixed content)"
|
|
879
|
-
},
|
|
880
|
-
// Test/Unfinished Content Patterns (MEDIUM)
|
|
881
|
-
// Focuses on obvious test/placeholder patterns, avoiding false positives with legitimate content
|
|
882
|
-
{
|
|
883
|
-
name: "Test Content Path",
|
|
884
|
-
category: "test_content",
|
|
885
|
-
severity: "medium",
|
|
886
|
-
regex: /\/(?:test-|demo-|sample-|temp-|temporary-|placeholder-)|\/(test|demo|sample|temp|temporary|placeholder)(?:\/|$)/i,
|
|
887
|
-
description: "URL path suggests test, demo, or unfinished content that may not be intended for indexing"
|
|
888
|
-
}
|
|
889
|
-
];
|
|
441
|
+
};
|
|
890
442
|
|
|
891
|
-
// src/
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
severity: "high",
|
|
913
|
-
regex: new RegExp(pattern2),
|
|
914
|
-
description: `URL does not match expected domain or allowed subdomains`
|
|
443
|
+
// src/reporters/json-reporter.ts
|
|
444
|
+
import fs2 from "fs/promises";
|
|
445
|
+
var JsonReporter = class {
|
|
446
|
+
outputPath;
|
|
447
|
+
constructor(outputPath = "sitemap-qa-report.json") {
|
|
448
|
+
this.outputPath = outputPath;
|
|
449
|
+
}
|
|
450
|
+
async generate(data) {
|
|
451
|
+
const report = {
|
|
452
|
+
metadata: {
|
|
453
|
+
generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
454
|
+
durationMs: data.endTime.getTime() - data.startTime.getTime()
|
|
455
|
+
},
|
|
456
|
+
summary: {
|
|
457
|
+
totalUrls: data.totalUrls,
|
|
458
|
+
totalRisks: data.totalRisks,
|
|
459
|
+
urlsWithRisksCount: data.urlsWithRisks.length,
|
|
460
|
+
ignoredUrlsCount: data.ignoredUrls.length
|
|
461
|
+
},
|
|
462
|
+
findings: data.urlsWithRisks,
|
|
463
|
+
ignored: data.ignoredUrls
|
|
915
464
|
};
|
|
465
|
+
await fs2.writeFile(this.outputPath, JSON.stringify(report, null, 2), "utf8");
|
|
466
|
+
console.log(`JSON report generated at ${this.outputPath}`);
|
|
916
467
|
}
|
|
917
|
-
|
|
918
|
-
const pattern = `^https?://(?!(?:www\\.)?${escapedRoot}(?:/|$))`;
|
|
919
|
-
return {
|
|
920
|
-
name: "Domain Mismatch",
|
|
921
|
-
category: "domain_mismatch",
|
|
922
|
-
severity: "high",
|
|
923
|
-
regex: new RegExp(pattern),
|
|
924
|
-
description: `URL does not match expected domain: ${rootDomain} (including www variant)`
|
|
925
|
-
};
|
|
926
|
-
}
|
|
927
|
-
var ENVIRONMENT_PATTERNS = [
|
|
928
|
-
{
|
|
929
|
-
name: "Staging Subdomain",
|
|
930
|
-
category: "environment_leakage",
|
|
931
|
-
severity: "high",
|
|
932
|
-
regex: /^https?:\/\/(staging|stg)\./i,
|
|
933
|
-
description: "URL uses staging subdomain"
|
|
934
|
-
},
|
|
935
|
-
{
|
|
936
|
-
name: "Development Subdomain",
|
|
937
|
-
category: "environment_leakage",
|
|
938
|
-
severity: "high",
|
|
939
|
-
regex: /^https?:\/\/(dev|development)\./i,
|
|
940
|
-
description: "URL uses development subdomain"
|
|
941
|
-
},
|
|
942
|
-
{
|
|
943
|
-
name: "QA/Test Subdomain",
|
|
944
|
-
category: "environment_leakage",
|
|
945
|
-
severity: "high",
|
|
946
|
-
regex: /^https?:\/\/(qa|test|uat|preprod)\./i,
|
|
947
|
-
description: "URL uses test environment subdomain"
|
|
948
|
-
},
|
|
949
|
-
{
|
|
950
|
-
name: "Localhost URL",
|
|
951
|
-
category: "environment_leakage",
|
|
952
|
-
severity: "high",
|
|
953
|
-
regex: /^https?:\/\/(localhost|127\.0\.0\.1|0\.0\.0\.0)/,
|
|
954
|
-
description: "URL points to localhost (development environment)"
|
|
955
|
-
},
|
|
956
|
-
{
|
|
957
|
-
name: "Environment in Path",
|
|
958
|
-
category: "environment_leakage",
|
|
959
|
-
severity: "high",
|
|
960
|
-
regex: /^https?:\/\/[^/]+\/(staging|dev|qa|uat|preprod)\//i,
|
|
961
|
-
description: "URL path contains environment identifier at root level"
|
|
962
|
-
}
|
|
963
|
-
];
|
|
964
|
-
|
|
965
|
-
// src/core/patterns/admin-patterns.ts
|
|
966
|
-
var ADMIN_PATH_PATTERNS = [
|
|
967
|
-
{
|
|
968
|
-
name: "Admin Path",
|
|
969
|
-
category: "admin_paths",
|
|
970
|
-
severity: "high",
|
|
971
|
-
regex: /\/(admin|administrator)(?:\/|$|\?)/i,
|
|
972
|
-
description: "URL contains /admin or /administrator as a path segment"
|
|
973
|
-
},
|
|
974
|
-
{
|
|
975
|
-
name: "Dashboard Path",
|
|
976
|
-
category: "admin_paths",
|
|
977
|
-
severity: "high",
|
|
978
|
-
regex: /\/dashboard(?:\/|$|\?)/i,
|
|
979
|
-
description: "URL contains /dashboard as a path segment"
|
|
980
|
-
},
|
|
981
|
-
{
|
|
982
|
-
name: "Config Path",
|
|
983
|
-
category: "admin_paths",
|
|
984
|
-
severity: "high",
|
|
985
|
-
regex: /\/(config|configuration)(?:\/|$|\?)/i,
|
|
986
|
-
description: "URL contains /config or /configuration as a path segment"
|
|
987
|
-
},
|
|
988
|
-
{
|
|
989
|
-
name: "Console Path",
|
|
990
|
-
category: "admin_paths",
|
|
991
|
-
severity: "high",
|
|
992
|
-
regex: /\/console(?:\/|$|\?)/i,
|
|
993
|
-
description: "URL contains /console as a path segment"
|
|
994
|
-
},
|
|
995
|
-
{
|
|
996
|
-
name: "Control Panel Path",
|
|
997
|
-
category: "admin_paths",
|
|
998
|
-
severity: "high",
|
|
999
|
-
regex: /\/(cpanel|control-panel)(?:\/|$|\?)/i,
|
|
1000
|
-
description: "URL contains control panel as a path segment"
|
|
1001
|
-
}
|
|
1002
|
-
];
|
|
1003
|
-
var INTERNAL_CONTENT_PATTERNS = [
|
|
1004
|
-
{
|
|
1005
|
-
name: "Internal Content Path",
|
|
1006
|
-
category: "internal_content",
|
|
1007
|
-
severity: "medium",
|
|
1008
|
-
regex: /\/internal\b/i,
|
|
1009
|
-
description: "URL contains /internal path segment - may be internal-only content not intended for public indexing"
|
|
1010
|
-
}
|
|
1011
|
-
];
|
|
1012
|
-
var SENSITIVE_PARAM_PATTERNS = [
|
|
1013
|
-
{
|
|
1014
|
-
name: "Authentication Token Parameter",
|
|
1015
|
-
category: "sensitive_params",
|
|
1016
|
-
severity: "high",
|
|
1017
|
-
regex: /[?&](token|auth_token|access_token|api_token)=/i,
|
|
1018
|
-
description: "Query parameter may contain authentication token"
|
|
1019
|
-
},
|
|
1020
|
-
{
|
|
1021
|
-
name: "API Key Parameter",
|
|
1022
|
-
category: "sensitive_params",
|
|
1023
|
-
severity: "high",
|
|
1024
|
-
regex: /[?&](apikey|api_key|key)=/i,
|
|
1025
|
-
description: "Query parameter may contain API key"
|
|
1026
|
-
},
|
|
1027
|
-
{
|
|
1028
|
-
name: "Password Parameter",
|
|
1029
|
-
category: "sensitive_params",
|
|
1030
|
-
severity: "high",
|
|
1031
|
-
regex: /[?&](password|passwd|pwd)=/i,
|
|
1032
|
-
description: "Query parameter may contain password"
|
|
1033
|
-
},
|
|
1034
|
-
{
|
|
1035
|
-
name: "Secret Parameter",
|
|
1036
|
-
category: "sensitive_params",
|
|
1037
|
-
severity: "high",
|
|
1038
|
-
regex: /[?&](secret|client_secret)=/i,
|
|
1039
|
-
description: "Query parameter may contain secret value"
|
|
1040
|
-
},
|
|
1041
|
-
{
|
|
1042
|
-
name: "Session Parameter",
|
|
1043
|
-
category: "sensitive_params",
|
|
1044
|
-
severity: "high",
|
|
1045
|
-
regex: /[?&](session|sessionid|sid)=/i,
|
|
1046
|
-
description: "Query parameter may contain session identifier"
|
|
1047
|
-
},
|
|
1048
|
-
{
|
|
1049
|
-
name: "Credentials Parameter",
|
|
1050
|
-
category: "sensitive_params",
|
|
1051
|
-
severity: "high",
|
|
1052
|
-
regex: /[?&]credentials=/i,
|
|
1053
|
-
description: "Query parameter may contain credentials"
|
|
1054
|
-
},
|
|
1055
|
-
{
|
|
1056
|
-
name: "Debug Parameter",
|
|
1057
|
-
category: "sensitive_params",
|
|
1058
|
-
severity: "medium",
|
|
1059
|
-
regex: /[?&](debug|trace|verbose)=/i,
|
|
1060
|
-
description: "Query parameter contains debug or diagnostic flag"
|
|
1061
|
-
},
|
|
1062
|
-
{
|
|
1063
|
-
name: "Test Mode Parameter",
|
|
1064
|
-
category: "sensitive_params",
|
|
1065
|
-
severity: "medium",
|
|
1066
|
-
regex: /[?&](test_mode|test|testing)=/i,
|
|
1067
|
-
description: "Query parameter indicates test mode"
|
|
1068
|
-
}
|
|
1069
|
-
];
|
|
468
|
+
};
|
|
1070
469
|
|
|
1071
|
-
// src/
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
470
|
+
// src/reporters/html-reporter.ts
|
|
471
|
+
import fs3 from "fs/promises";
|
|
472
|
+
var HtmlReporter = class {
|
|
473
|
+
outputPath;
|
|
474
|
+
constructor(outputPath = "sitemap-qa-report.html") {
|
|
475
|
+
this.outputPath = outputPath;
|
|
476
|
+
}
|
|
477
|
+
async generate(data) {
|
|
478
|
+
const categories = this.groupRisks(data);
|
|
479
|
+
const html = this.generateHtml(data, categories);
|
|
480
|
+
await fs3.writeFile(this.outputPath, html, "utf8");
|
|
481
|
+
console.log(`HTML report generated at ${this.outputPath}`);
|
|
482
|
+
}
|
|
483
|
+
groupRisks(data) {
|
|
484
|
+
const categories = {};
|
|
485
|
+
for (const urlObj of data.urlsWithRisks) {
|
|
486
|
+
for (const risk of urlObj.risks) {
|
|
487
|
+
if (!categories[risk.category]) {
|
|
488
|
+
categories[risk.category] = {};
|
|
489
|
+
}
|
|
490
|
+
if (!categories[risk.category][risk.pattern]) {
|
|
491
|
+
categories[risk.category][risk.pattern] = {
|
|
492
|
+
reason: risk.reason,
|
|
493
|
+
urls: []
|
|
494
|
+
};
|
|
495
|
+
}
|
|
496
|
+
categories[risk.category][risk.pattern].urls.push(urlObj.loc);
|
|
1097
497
|
}
|
|
1098
498
|
}
|
|
1099
|
-
return
|
|
1100
|
-
} catch {
|
|
1101
|
-
return url;
|
|
1102
|
-
}
|
|
1103
|
-
}
|
|
1104
|
-
|
|
1105
|
-
// src/core/risk-grouper.ts
|
|
1106
|
-
function generateRecommendation(category, _severity, count) {
|
|
1107
|
-
switch (category) {
|
|
1108
|
-
case "environment_leakage":
|
|
1109
|
-
return {
|
|
1110
|
-
rationale: `Production sitemap contains ${count} URL(s) from non-production environments (staging, dev, QA, test). This indicates configuration errors or environment leakage.`,
|
|
1111
|
-
recommendedAction: "Verify sitemap generation excludes non-production environments. Review deployment configuration and environment filtering rules."
|
|
1112
|
-
};
|
|
1113
|
-
case "admin_paths":
|
|
1114
|
-
return {
|
|
1115
|
-
rationale: `${count} administrative path(s) detected in public sitemap (admin, dashboard, config). These paths may expose privileged access points.`,
|
|
1116
|
-
recommendedAction: "Confirm if admin paths should be publicly indexed. Consider excluding via robots.txt or removing from sitemap. Verify access controls."
|
|
1117
|
-
};
|
|
1118
|
-
case "internal_content":
|
|
1119
|
-
return {
|
|
1120
|
-
rationale: `${count} URL(s) contain "internal" in the path. These may be internal-facing content not intended for public indexing.`,
|
|
1121
|
-
recommendedAction: "Review URLs to determine if they should be publicly accessible. Consider excluding internal content from sitemap or adding noindex meta tags."
|
|
1122
|
-
};
|
|
1123
|
-
case "test_content":
|
|
1124
|
-
return {
|
|
1125
|
-
rationale: `${count} URL(s) contain test/demo/sample identifiers. These may be placeholder or unfinished content not intended for indexing.`,
|
|
1126
|
-
recommendedAction: "Review and remove test content from production sitemaps. Verify content is production-ready before including in sitemap."
|
|
1127
|
-
};
|
|
1128
|
-
case "sensitive_params":
|
|
1129
|
-
return {
|
|
1130
|
-
rationale: `${count} URL(s) contain sensitive query parameters (token, auth, key, password, session). This may expose authentication credentials or debugging flags.`,
|
|
1131
|
-
recommendedAction: "Review why sensitive parameters are in sitemap URLs. Remove authentication tokens from URLs. Consider POST requests for sensitive data."
|
|
1132
|
-
};
|
|
1133
|
-
case "protocol_inconsistency":
|
|
1134
|
-
return {
|
|
1135
|
-
rationale: `${count} URL(s) use HTTP protocol in HTTPS sitemap. This creates mixed content warnings and potential security issues.`,
|
|
1136
|
-
recommendedAction: "Update URLs to use HTTPS consistently. Verify SSL certificate coverage. Check for hardcoded HTTP URLs in content."
|
|
1137
|
-
};
|
|
1138
|
-
case "domain_mismatch":
|
|
1139
|
-
return {
|
|
1140
|
-
rationale: `${count} URL(s) do not match expected base domain. This may indicate external links, CDN URLs, or configuration errors.`,
|
|
1141
|
-
recommendedAction: "Verify if external domains are intentional. Review sitemap generation logic. Confirm CDN or subdomain configuration is correct."
|
|
1142
|
-
};
|
|
1143
|
-
default:
|
|
1144
|
-
return {
|
|
1145
|
-
rationale: `${count} URL(s) flagged in category: ${category}`,
|
|
1146
|
-
recommendedAction: "Review flagged URLs and determine appropriate action."
|
|
1147
|
-
};
|
|
1148
|
-
}
|
|
1149
|
-
}
|
|
1150
|
-
function groupRiskFindings(findings, maxSampleUrls = 5) {
|
|
1151
|
-
const categoryMap = /* @__PURE__ */ new Map();
|
|
1152
|
-
for (const finding of findings) {
|
|
1153
|
-
if (!categoryMap.has(finding.category)) {
|
|
1154
|
-
categoryMap.set(finding.category, []);
|
|
1155
|
-
}
|
|
1156
|
-
categoryMap.get(finding.category).push(finding);
|
|
1157
|
-
}
|
|
1158
|
-
const groups = [];
|
|
1159
|
-
for (const [category, categoryFindings] of categoryMap.entries()) {
|
|
1160
|
-
const uniqueUrls = Array.from(new Set(categoryFindings.map((f) => f.url)));
|
|
1161
|
-
const severity = categoryFindings.reduce((highest, finding) => {
|
|
1162
|
-
const severityOrder = ["low", "medium", "high"];
|
|
1163
|
-
return severityOrder.indexOf(finding.severity) > severityOrder.indexOf(highest) ? finding.severity : highest;
|
|
1164
|
-
}, "low");
|
|
1165
|
-
const sampleUrls = uniqueUrls.slice(0, maxSampleUrls);
|
|
1166
|
-
const { rationale, recommendedAction } = generateRecommendation(category, severity, uniqueUrls.length);
|
|
1167
|
-
groups.push({
|
|
1168
|
-
category,
|
|
1169
|
-
severity,
|
|
1170
|
-
count: uniqueUrls.length,
|
|
1171
|
-
rationale,
|
|
1172
|
-
sampleUrls,
|
|
1173
|
-
recommendedAction,
|
|
1174
|
-
allUrls: uniqueUrls
|
|
1175
|
-
});
|
|
499
|
+
return categories;
|
|
1176
500
|
}
|
|
1177
|
-
|
|
1178
|
-
const
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
501
|
+
generateHtml(data, categories) {
|
|
502
|
+
const duration = ((data.endTime.getTime() - data.startTime.getTime()) / 1e3).toFixed(1);
|
|
503
|
+
const timestamp = data.endTime.toLocaleString();
|
|
504
|
+
const esc = this.escapeHtml.bind(this);
|
|
505
|
+
return `
|
|
506
|
+
<!DOCTYPE html>
|
|
507
|
+
<html lang="en">
|
|
508
|
+
<head>
|
|
509
|
+
<meta charset="UTF-8">
|
|
510
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
511
|
+
<title>Sitemap Analysis - ${esc(data.rootUrl)}</title>
|
|
512
|
+
<style>
|
|
513
|
+
:root {
|
|
514
|
+
--bg-dark: #0f172a;
|
|
515
|
+
--bg-light: #f8fafc;
|
|
516
|
+
--text-main: #1e293b;
|
|
517
|
+
--text-muted: #64748b;
|
|
518
|
+
--primary: #3b82f6;
|
|
519
|
+
--danger: #ef4444;
|
|
520
|
+
--warning: #f59e0b;
|
|
521
|
+
--border: #e2e8f0;
|
|
522
|
+
}
|
|
523
|
+
body {
|
|
524
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
|
525
|
+
line-height: 1.5;
|
|
526
|
+
color: var(--text-main);
|
|
527
|
+
background-color: #fff;
|
|
528
|
+
margin: 0;
|
|
529
|
+
padding: 0;
|
|
530
|
+
}
|
|
531
|
+
header {
|
|
532
|
+
background-color: var(--bg-dark);
|
|
533
|
+
color: white;
|
|
534
|
+
padding: 40px 20px;
|
|
535
|
+
text-align: left;
|
|
536
|
+
}
|
|
537
|
+
.container {
|
|
538
|
+
max-width: 1200px;
|
|
539
|
+
margin: 0 auto;
|
|
540
|
+
padding: 0 20px;
|
|
541
|
+
}
|
|
542
|
+
header h1 { margin: 0; font-size: 24px; }
|
|
543
|
+
header .meta { margin-top: 10px; color: #94a3b8; font-size: 14px; }
|
|
544
|
+
|
|
545
|
+
.summary-grid {
|
|
546
|
+
display: grid;
|
|
547
|
+
grid-template-columns: repeat(5, 1fr);
|
|
548
|
+
border-bottom: 1px solid var(--border);
|
|
549
|
+
margin-bottom: 40px;
|
|
550
|
+
}
|
|
551
|
+
.summary-card {
|
|
552
|
+
padding: 30px 20px;
|
|
553
|
+
text-align: center;
|
|
554
|
+
border-right: 1px solid var(--border);
|
|
555
|
+
}
|
|
556
|
+
.summary-card:last-child { border-right: none; }
|
|
557
|
+
.summary-card h3 {
|
|
558
|
+
margin: 0;
|
|
559
|
+
font-size: 12px;
|
|
560
|
+
text-transform: uppercase;
|
|
561
|
+
color: var(--text-muted);
|
|
562
|
+
letter-spacing: 0.05em;
|
|
563
|
+
}
|
|
564
|
+
.summary-card p {
|
|
565
|
+
margin: 10px 0 0;
|
|
566
|
+
font-size: 32px;
|
|
567
|
+
font-weight: 700;
|
|
568
|
+
color: var(--text-main);
|
|
569
|
+
}
|
|
570
|
+
.summary-card.highlight p { color: var(--danger); }
|
|
1193
571
|
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
for (const pattern of config.acceptedPatterns) {
|
|
1200
|
-
try {
|
|
1201
|
-
let regexPattern = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, "[^/]*");
|
|
1202
|
-
if (!regexPattern.endsWith("$") && !regexPattern.includes("(?:")) {
|
|
1203
|
-
regexPattern = regexPattern + "(?:/|$|\\?|#)";
|
|
572
|
+
details {
|
|
573
|
+
margin-bottom: 20px;
|
|
574
|
+
border: 1px solid var(--border);
|
|
575
|
+
border-radius: 8px;
|
|
576
|
+
overflow: hidden;
|
|
1204
577
|
}
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
578
|
+
summary {
|
|
579
|
+
padding: 15px 20px;
|
|
580
|
+
background-color: #fff;
|
|
581
|
+
cursor: pointer;
|
|
582
|
+
font-weight: 600;
|
|
583
|
+
display: flex;
|
|
584
|
+
justify-content: space-between;
|
|
585
|
+
align-items: center;
|
|
586
|
+
list-style: none;
|
|
1209
587
|
}
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
const findings = [];
|
|
1217
|
-
for (const urlEntry of urls) {
|
|
1218
|
-
const url = urlEntry.loc;
|
|
1219
|
-
let isAccepted = false;
|
|
1220
|
-
for (const acceptedPattern of acceptedPatterns) {
|
|
1221
|
-
if (acceptedPattern.test(url)) {
|
|
1222
|
-
isAccepted = true;
|
|
1223
|
-
break;
|
|
1224
|
-
}
|
|
1225
|
-
}
|
|
1226
|
-
if (isAccepted) continue;
|
|
1227
|
-
for (const pattern of allPatterns) {
|
|
1228
|
-
if (pattern.category === "protocol_inconsistency") {
|
|
1229
|
-
try {
|
|
1230
|
-
const urlProtocol = new URL(url).protocol;
|
|
1231
|
-
if (expectedProtocol === "https:" && urlProtocol === "http:") {
|
|
1232
|
-
findings.push({
|
|
1233
|
-
url,
|
|
1234
|
-
category: pattern.category,
|
|
1235
|
-
severity: pattern.severity,
|
|
1236
|
-
pattern: pattern.name,
|
|
1237
|
-
rationale: pattern.description,
|
|
1238
|
-
matchedValue: "http://"
|
|
1239
|
-
});
|
|
1240
|
-
}
|
|
1241
|
-
} catch (error) {
|
|
1242
|
-
continue;
|
|
588
|
+
summary::-webkit-details-marker { display: none; }
|
|
589
|
+
summary::after {
|
|
590
|
+
content: '\u25B6';
|
|
591
|
+
font-size: 12px;
|
|
592
|
+
color: var(--text-muted);
|
|
593
|
+
transition: transform 0.2s;
|
|
1243
594
|
}
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
595
|
+
details[open] summary::after { transform: rotate(90deg); }
|
|
596
|
+
|
|
597
|
+
.category-section {
|
|
598
|
+
border: 1px solid var(--warning);
|
|
599
|
+
border-radius: 8px;
|
|
600
|
+
margin-bottom: 20px;
|
|
601
|
+
}
|
|
602
|
+
.category-header {
|
|
603
|
+
padding: 15px 20px;
|
|
604
|
+
background-color: #fffbeb;
|
|
605
|
+
color: var(--warning);
|
|
606
|
+
font-weight: 600;
|
|
607
|
+
cursor: pointer;
|
|
608
|
+
display: flex;
|
|
609
|
+
justify-content: space-between;
|
|
610
|
+
align-items: center;
|
|
611
|
+
}
|
|
612
|
+
.category-content {
|
|
613
|
+
padding: 20px;
|
|
614
|
+
background-color: #fff;
|
|
1262
615
|
}
|
|
1263
|
-
}
|
|
1264
|
-
}
|
|
1265
|
-
}
|
|
1266
|
-
return { findings, urlsProcessed: urls.length };
|
|
1267
|
-
}
|
|
1268
|
-
async function detectRisks(urls, baseUrl, config) {
|
|
1269
|
-
const startTime = Date.now();
|
|
1270
|
-
const domainPattern = createDomainMismatchPattern(baseUrl);
|
|
1271
|
-
const allPatterns = [
|
|
1272
|
-
...RISK_PATTERNS,
|
|
1273
|
-
...ENVIRONMENT_PATTERNS,
|
|
1274
|
-
...ADMIN_PATH_PATTERNS,
|
|
1275
|
-
...SENSITIVE_PARAM_PATTERNS,
|
|
1276
|
-
...INTERNAL_CONTENT_PATTERNS,
|
|
1277
|
-
domainPattern
|
|
1278
|
-
];
|
|
1279
|
-
const acceptedPatterns = compileAcceptedPatterns(config);
|
|
1280
|
-
let expectedProtocol;
|
|
1281
|
-
try {
|
|
1282
|
-
expectedProtocol = new URL(baseUrl).protocol;
|
|
1283
|
-
} catch (error) {
|
|
1284
|
-
if (config.verbose) {
|
|
1285
|
-
console.warn(`Invalid base URL: ${baseUrl}, defaulting to https:`);
|
|
1286
|
-
}
|
|
1287
|
-
expectedProtocol = "https:";
|
|
1288
|
-
}
|
|
1289
|
-
const BATCH_SIZE = config.riskDetectionBatchSize || 1e4;
|
|
1290
|
-
const CONCURRENCY = config.riskDetectionConcurrency || Math.max(2, os.cpus().length - 1);
|
|
1291
|
-
const batches = chunkArray(urls, BATCH_SIZE);
|
|
1292
|
-
if (config.verbose) {
|
|
1293
|
-
console.log(`
|
|
1294
|
-
Risk Detection Configuration:`);
|
|
1295
|
-
console.log(` - Total URLs: ${urls.length.toLocaleString()}`);
|
|
1296
|
-
console.log(` - Batch size: ${BATCH_SIZE.toLocaleString()}`);
|
|
1297
|
-
console.log(` - Concurrency: ${CONCURRENCY}`);
|
|
1298
|
-
console.log(` - Total batches: ${batches.length}`);
|
|
1299
|
-
try {
|
|
1300
|
-
console.log(` - Base domain: ${new URL(baseUrl).hostname}`);
|
|
1301
|
-
} catch (error) {
|
|
1302
|
-
console.log(` - Base URL: ${baseUrl}`);
|
|
1303
|
-
}
|
|
1304
|
-
if (acceptedPatterns.length > 0) {
|
|
1305
|
-
console.log(` - Accepted patterns: ${acceptedPatterns.length}`);
|
|
1306
|
-
}
|
|
1307
|
-
}
|
|
1308
|
-
let completedBatches = 0;
|
|
1309
|
-
const totalBatches = batches.length;
|
|
1310
|
-
const batchStartTime = Date.now();
|
|
1311
|
-
const batchResults = await processInBatches(
|
|
1312
|
-
batches,
|
|
1313
|
-
CONCURRENCY,
|
|
1314
|
-
(batch) => detectRisksInBatch(batch, allPatterns, acceptedPatterns, expectedProtocol, config.verbose),
|
|
1315
|
-
(completed) => {
|
|
1316
|
-
completedBatches = completed;
|
|
1317
|
-
const pct = (completed / totalBatches * 100).toFixed(1);
|
|
1318
|
-
const elapsed = (Date.now() - batchStartTime) / 1e3;
|
|
1319
|
-
const urlsProcessed = completed * BATCH_SIZE;
|
|
1320
|
-
const speed = Math.round(urlsProcessed / elapsed);
|
|
1321
|
-
const remaining = totalBatches - completed;
|
|
1322
|
-
const eta = Math.round(remaining * BATCH_SIZE / speed);
|
|
1323
|
-
process.stdout.write(
|
|
1324
|
-
`\r\x1B[K Analyzing batch ${completed}/${totalBatches} (${pct}%) | ETA: ~${eta}s | ${speed.toLocaleString()} URLs/sec`
|
|
1325
|
-
);
|
|
1326
|
-
}
|
|
1327
|
-
);
|
|
1328
|
-
process.stdout.write("\r\x1B[K");
|
|
1329
|
-
const allFindings = batchResults.flatMap((r) => r.findings);
|
|
1330
|
-
const groupingResult = groupRiskFindings(allFindings);
|
|
1331
|
-
const processingTimeMs = Date.now() - startTime;
|
|
1332
|
-
if (config.verbose) {
|
|
1333
|
-
console.log(`
|
|
1334
|
-
Risk Detection Summary:`);
|
|
1335
|
-
console.log(` - Total URLs analyzed: ${urls.length.toLocaleString()}`);
|
|
1336
|
-
console.log(` - Risk URLs found: ${groupingResult.totalRiskUrls.toLocaleString()}`);
|
|
1337
|
-
console.log(` - HIGH severity: ${groupingResult.highSeverityCount}`);
|
|
1338
|
-
console.log(` - MEDIUM severity: ${groupingResult.mediumSeverityCount}`);
|
|
1339
|
-
console.log(` - LOW severity: ${groupingResult.lowSeverityCount}`);
|
|
1340
|
-
console.log(` - Processing time: ${(processingTimeMs / 1e3).toFixed(1)}s`);
|
|
1341
|
-
if (groupingResult.groups.length > 0) {
|
|
1342
|
-
console.log(`
|
|
1343
|
-
Risk Categories Found:`);
|
|
1344
|
-
for (const group of groupingResult.groups) {
|
|
1345
|
-
console.log(` - ${group.category}: ${group.count} URLs (${group.severity.toUpperCase()})`);
|
|
1346
|
-
}
|
|
1347
|
-
}
|
|
1348
|
-
}
|
|
1349
|
-
return {
|
|
1350
|
-
findings: allFindings,
|
|
1351
|
-
groups: groupingResult.groups,
|
|
1352
|
-
totalUrlsAnalyzed: urls.length,
|
|
1353
|
-
riskUrlCount: groupingResult.totalRiskUrls,
|
|
1354
|
-
cleanUrlCount: urls.length - groupingResult.totalRiskUrls,
|
|
1355
|
-
highSeverityCount: groupingResult.highSeverityCount,
|
|
1356
|
-
mediumSeverityCount: groupingResult.mediumSeverityCount,
|
|
1357
|
-
lowSeverityCount: groupingResult.lowSeverityCount,
|
|
1358
|
-
processingTimeMs
|
|
1359
|
-
};
|
|
1360
|
-
}
|
|
1361
616
|
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
617
|
+
.finding-group {
|
|
618
|
+
border: 1px solid var(--border);
|
|
619
|
+
border-radius: 8px;
|
|
620
|
+
padding: 20px;
|
|
621
|
+
margin-bottom: 20px;
|
|
622
|
+
}
|
|
623
|
+
.finding-header {
|
|
624
|
+
display: flex;
|
|
625
|
+
align-items: center;
|
|
626
|
+
gap: 10px;
|
|
627
|
+
margin-bottom: 10px;
|
|
628
|
+
}
|
|
629
|
+
.finding-header h4 { margin: 0; font-size: 16px; }
|
|
630
|
+
.badge {
|
|
631
|
+
background-color: var(--primary);
|
|
632
|
+
color: white;
|
|
633
|
+
padding: 2px 8px;
|
|
634
|
+
border-radius: 12px;
|
|
635
|
+
font-size: 12px;
|
|
636
|
+
}
|
|
637
|
+
.finding-description {
|
|
638
|
+
color: var(--text-muted);
|
|
639
|
+
font-size: 14px;
|
|
640
|
+
margin-bottom: 20px;
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
.url-list {
|
|
644
|
+
background-color: var(--bg-light);
|
|
645
|
+
border-radius: 4px;
|
|
646
|
+
padding: 15px;
|
|
647
|
+
margin-bottom: 15px;
|
|
648
|
+
}
|
|
649
|
+
.url-item {
|
|
650
|
+
font-family: monospace;
|
|
651
|
+
font-size: 13px;
|
|
652
|
+
padding: 8px 12px;
|
|
653
|
+
background: white;
|
|
654
|
+
border: 1px solid var(--border);
|
|
655
|
+
border-radius: 4px;
|
|
656
|
+
margin-bottom: 8px;
|
|
657
|
+
white-space: nowrap;
|
|
658
|
+
overflow: hidden;
|
|
659
|
+
text-overflow: ellipsis;
|
|
660
|
+
}
|
|
661
|
+
.url-item:last-child { margin-bottom: 0; }
|
|
662
|
+
|
|
663
|
+
.more-count {
|
|
664
|
+
font-size: 12px;
|
|
665
|
+
color: var(--text-muted);
|
|
666
|
+
font-style: italic;
|
|
667
|
+
margin-bottom: 15px;
|
|
668
|
+
}
|
|
1408
669
|
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
config,
|
|
1423
|
-
startTime
|
|
1424
|
-
);
|
|
1425
|
-
const jsonOutput = transformToJsonOutput(result, performanceMetrics);
|
|
1426
|
-
if (pretty) {
|
|
1427
|
-
return JSON.stringify(jsonOutput, null, indent);
|
|
1428
|
-
} else {
|
|
1429
|
-
return JSON.stringify(jsonOutput);
|
|
1430
|
-
}
|
|
1431
|
-
}
|
|
1432
|
-
function buildAnalysisResult(summary, discoveryResult, parseResult, riskGroups, config, startTime) {
|
|
1433
|
-
const metadata = buildAnalysisMetadata(
|
|
1434
|
-
config.baseUrl || "unknown",
|
|
1435
|
-
startTime,
|
|
1436
|
-
summary
|
|
1437
|
-
);
|
|
1438
|
-
const suspiciousGroups = riskGroups.map((group) => ({
|
|
1439
|
-
category: group.category,
|
|
1440
|
-
severity: group.severity,
|
|
1441
|
-
count: group.count,
|
|
1442
|
-
pattern: group.category,
|
|
1443
|
-
// Use category as pattern identifier
|
|
1444
|
-
rationale: group.rationale,
|
|
1445
|
-
sampleUrls: group.sampleUrls.slice(0, 5),
|
|
1446
|
-
// Limit to 5 samples
|
|
1447
|
-
recommendedAction: group.recommendedAction
|
|
1448
|
-
}));
|
|
1449
|
-
const summaryStats = {
|
|
1450
|
-
highSeverityCount: summary.severityBreakdown.high,
|
|
1451
|
-
mediumSeverityCount: summary.severityBreakdown.medium,
|
|
1452
|
-
lowSeverityCount: summary.severityBreakdown.low,
|
|
1453
|
-
totalRiskyUrls: riskGroups.reduce((sum, g) => sum + g.count, 0),
|
|
1454
|
-
overallStatus: determineOverallStatus(
|
|
1455
|
-
summary.severityBreakdown,
|
|
1456
|
-
parseResult.errors
|
|
1457
|
-
)
|
|
1458
|
-
};
|
|
1459
|
-
const riskSummary = {
|
|
1460
|
-
overview: summary.overview,
|
|
1461
|
-
keyFindings: summary.keyFindings,
|
|
1462
|
-
recommendations: summary.recommendations
|
|
1463
|
-
};
|
|
1464
|
-
const errors = parseResult.errors.map(transformError);
|
|
1465
|
-
return {
|
|
1466
|
-
analysisMetadata: metadata,
|
|
1467
|
-
sitemapsDiscovered: discoveryResult.sitemaps,
|
|
1468
|
-
totalUrlCount: parseResult.totalCount,
|
|
1469
|
-
urlsAnalyzed: parseResult.totalCount,
|
|
1470
|
-
suspiciousGroups,
|
|
1471
|
-
riskSummary,
|
|
1472
|
-
summary: summaryStats,
|
|
1473
|
-
errors
|
|
1474
|
-
};
|
|
1475
|
-
}
|
|
1476
|
-
function buildAnalysisMetadata(baseUrl, startTime, summary) {
|
|
1477
|
-
return {
|
|
1478
|
-
baseUrl,
|
|
1479
|
-
analysisTimestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1480
|
-
toolVersion: TOOL_VERSION,
|
|
1481
|
-
executionTimeMs: Date.now() - startTime,
|
|
1482
|
-
analysisType: summary.generatedBy
|
|
1483
|
-
};
|
|
1484
|
-
}
|
|
1485
|
-
function determineOverallStatus(severityBreakdown, errors) {
|
|
1486
|
-
if (errors.length > 0) {
|
|
1487
|
-
return "errors";
|
|
1488
|
-
}
|
|
1489
|
-
const totalIssues = severityBreakdown.high + severityBreakdown.medium + severityBreakdown.low;
|
|
1490
|
-
return totalIssues > 0 ? "issues_found" : "clean";
|
|
1491
|
-
}
|
|
1492
|
-
function transformToJsonOutput(result, performanceMetrics) {
|
|
1493
|
-
const output = {
|
|
1494
|
-
analysis_metadata: transformMetadata(result.analysisMetadata),
|
|
1495
|
-
sitemaps_discovered: result.sitemapsDiscovered,
|
|
1496
|
-
total_url_count: result.totalUrlCount,
|
|
1497
|
-
urls_analyzed: result.urlsAnalyzed,
|
|
1498
|
-
suspicious_groups: result.suspiciousGroups.map(transformGroup),
|
|
1499
|
-
risk_summary: transformRiskSummary(result.riskSummary),
|
|
1500
|
-
summary: transformSummary(result.summary),
|
|
1501
|
-
errors: result.errors
|
|
1502
|
-
};
|
|
1503
|
-
if (performanceMetrics) {
|
|
1504
|
-
output.performance_metrics = {
|
|
1505
|
-
total_execution_time_ms: performanceMetrics.totalExecutionTimeMs,
|
|
1506
|
-
phase_timings: performanceMetrics.phaseTimings,
|
|
1507
|
-
throughput: performanceMetrics.throughput,
|
|
1508
|
-
resource_usage: performanceMetrics.resourceUsage
|
|
1509
|
-
};
|
|
1510
|
-
}
|
|
1511
|
-
return output;
|
|
1512
|
-
}
|
|
1513
|
-
function transformMetadata(meta) {
|
|
1514
|
-
return {
|
|
1515
|
-
base_url: meta.baseUrl,
|
|
1516
|
-
analysis_timestamp: meta.analysisTimestamp,
|
|
1517
|
-
tool_version: meta.toolVersion,
|
|
1518
|
-
execution_time_ms: meta.executionTimeMs,
|
|
1519
|
-
analysis_type: meta.analysisType
|
|
1520
|
-
};
|
|
1521
|
-
}
|
|
1522
|
-
function transformGroup(group) {
|
|
1523
|
-
return {
|
|
1524
|
-
category: group.category,
|
|
1525
|
-
severity: group.severity,
|
|
1526
|
-
count: group.count,
|
|
1527
|
-
pattern: group.pattern,
|
|
1528
|
-
rationale: group.rationale,
|
|
1529
|
-
sample_urls: group.sampleUrls,
|
|
1530
|
-
recommended_action: group.recommendedAction
|
|
1531
|
-
};
|
|
1532
|
-
}
|
|
1533
|
-
function transformRiskSummary(summary) {
|
|
1534
|
-
return {
|
|
1535
|
-
overview: summary.overview,
|
|
1536
|
-
key_findings: summary.keyFindings,
|
|
1537
|
-
recommendations: summary.recommendations
|
|
1538
|
-
};
|
|
1539
|
-
}
|
|
1540
|
-
function transformSummary(summary) {
|
|
1541
|
-
return {
|
|
1542
|
-
high_severity_count: summary.highSeverityCount,
|
|
1543
|
-
medium_severity_count: summary.mediumSeverityCount,
|
|
1544
|
-
low_severity_count: summary.lowSeverityCount,
|
|
1545
|
-
total_risky_urls: summary.totalRiskyUrls,
|
|
1546
|
-
overall_status: summary.overallStatus
|
|
1547
|
-
};
|
|
1548
|
-
}
|
|
1549
|
-
function transformError(error) {
|
|
1550
|
-
if ("code" in error) {
|
|
1551
|
-
const customError = error;
|
|
1552
|
-
const errorDetail = {
|
|
1553
|
-
code: customError.code || "UNKNOWN_ERROR",
|
|
1554
|
-
message: error.message
|
|
1555
|
-
};
|
|
1556
|
-
if ("attemptedPaths" in customError) {
|
|
1557
|
-
errorDetail.context = {
|
|
1558
|
-
attempted_paths: customError.attemptedPaths
|
|
1559
|
-
};
|
|
1560
|
-
} else if ("sitemapUrl" in customError && "lineNumber" in customError) {
|
|
1561
|
-
errorDetail.context = {
|
|
1562
|
-
sitemap_url: customError.sitemapUrl,
|
|
1563
|
-
line_number: customError.lineNumber
|
|
1564
|
-
};
|
|
1565
|
-
} else if ("url" in customError) {
|
|
1566
|
-
errorDetail.context = {
|
|
1567
|
-
url: customError.url
|
|
1568
|
-
};
|
|
1569
|
-
}
|
|
1570
|
-
return errorDetail;
|
|
1571
|
-
}
|
|
1572
|
-
return {
|
|
1573
|
-
code: "UNKNOWN_ERROR",
|
|
1574
|
-
message: error.message
|
|
1575
|
-
};
|
|
1576
|
-
}
|
|
670
|
+
.btn {
|
|
671
|
+
display: inline-flex;
|
|
672
|
+
align-items: center;
|
|
673
|
+
gap: 8px;
|
|
674
|
+
background-color: var(--primary);
|
|
675
|
+
color: white;
|
|
676
|
+
padding: 8px 16px;
|
|
677
|
+
border-radius: 6px;
|
|
678
|
+
text-decoration: none;
|
|
679
|
+
font-size: 13px;
|
|
680
|
+
font-weight: 500;
|
|
681
|
+
}
|
|
682
|
+
.btn:hover { opacity: 0.9; }
|
|
1577
683
|
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
const lowSeverity = summary.categoryInsights.filter((g) => g.severity === "low");
|
|
1588
|
-
const html = `<!DOCTYPE html>
|
|
1589
|
-
<html lang="en">
|
|
1590
|
-
<head>
|
|
1591
|
-
<meta charset="UTF-8">
|
|
1592
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
1593
|
-
<title>Sitemap QA Report - ${config.baseUrl}</title>
|
|
1594
|
-
<style>
|
|
1595
|
-
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
1596
|
-
body {
|
|
1597
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
|
1598
|
-
line-height: 1.6;
|
|
1599
|
-
color: #1f2937;
|
|
1600
|
-
background: #ffffff;
|
|
1601
|
-
padding: 24px;
|
|
1602
|
-
}
|
|
1603
|
-
.container {
|
|
1604
|
-
max-width: 1400px;
|
|
1605
|
-
margin: 0 auto;
|
|
1606
|
-
background: white;
|
|
1607
|
-
box-shadow: 0 1px 3px rgba(0,0,0,0.05);
|
|
1608
|
-
border-radius: 12px;
|
|
1609
|
-
overflow: hidden;
|
|
1610
|
-
border: 1px solid #e5e7eb;
|
|
1611
|
-
}
|
|
1612
|
-
.header {
|
|
1613
|
-
background: #0f172a;
|
|
1614
|
-
color: white;
|
|
1615
|
-
padding: 48px 40px;
|
|
1616
|
-
border-bottom: 3px solid #3b82f6;
|
|
1617
|
-
}
|
|
1618
|
-
.header h1 {
|
|
1619
|
-
font-size: 1.875rem;
|
|
1620
|
-
font-weight: 700;
|
|
1621
|
-
margin-bottom: 12px;
|
|
1622
|
-
letter-spacing: -0.025em;
|
|
1623
|
-
}
|
|
1624
|
-
.header .meta {
|
|
1625
|
-
opacity: 0.75;
|
|
1626
|
-
font-size: 0.875rem;
|
|
1627
|
-
font-weight: 400;
|
|
1628
|
-
}
|
|
1629
|
-
.summary {
|
|
1630
|
-
display: grid;
|
|
1631
|
-
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
|
1632
|
-
gap: 1px;
|
|
1633
|
-
background: #e5e7eb;
|
|
1634
|
-
border-bottom: 1px solid #e5e7eb;
|
|
1635
|
-
}
|
|
1636
|
-
.summary-card {
|
|
1637
|
-
background: white;
|
|
1638
|
-
padding: 28px 32px;
|
|
1639
|
-
text-align: center;
|
|
1640
|
-
}
|
|
1641
|
-
.summary-card .label {
|
|
1642
|
-
font-size: 0.75rem;
|
|
1643
|
-
color: #6b7280;
|
|
1644
|
-
text-transform: uppercase;
|
|
1645
|
-
letter-spacing: 0.05em;
|
|
1646
|
-
font-weight: 600;
|
|
1647
|
-
margin-bottom: 8px;
|
|
1648
|
-
}
|
|
1649
|
-
.summary-card .value {
|
|
1650
|
-
font-size: 2.25rem;
|
|
1651
|
-
font-weight: 700;
|
|
1652
|
-
color: #0f172a;
|
|
1653
|
-
font-variant-numeric: tabular-nums;
|
|
1654
|
-
}
|
|
1655
|
-
.content { padding: 40px; }
|
|
1656
|
-
.status-clean {
|
|
1657
|
-
text-align: center;
|
|
1658
|
-
padding: 80px 32px;
|
|
1659
|
-
background: #f0fdf4;
|
|
1660
|
-
border-radius: 8px;
|
|
1661
|
-
border: 1px solid #86efac;
|
|
1662
|
-
}
|
|
1663
|
-
.status-clean h2 {
|
|
1664
|
-
font-size: 1.875rem;
|
|
1665
|
-
margin-bottom: 12px;
|
|
1666
|
-
color: #166534;
|
|
1667
|
-
font-weight: 700;
|
|
1668
|
-
}
|
|
1669
|
-
.status-clean p {
|
|
1670
|
-
font-size: 1rem;
|
|
1671
|
-
color: #65a30d;
|
|
1672
|
-
}
|
|
1673
|
-
.severity-section { margin-bottom: 32px; }
|
|
1674
|
-
.severity-section h2 {
|
|
1675
|
-
font-size: 1.125rem;
|
|
1676
|
-
font-weight: 600;
|
|
1677
|
-
padding: 16px 20px;
|
|
1678
|
-
margin-bottom: 16px;
|
|
1679
|
-
border-radius: 8px;
|
|
1680
|
-
display: flex;
|
|
1681
|
-
align-items: center;
|
|
1682
|
-
gap: 12px;
|
|
1683
|
-
cursor: pointer;
|
|
1684
|
-
user-select: none;
|
|
1685
|
-
transition: all 0.2s;
|
|
1686
|
-
}
|
|
1687
|
-
.severity-section h2:hover {
|
|
1688
|
-
opacity: 0.85;
|
|
1689
|
-
transform: translateY(-1px);
|
|
1690
|
-
}
|
|
1691
|
-
.severity-section h2::after {
|
|
1692
|
-
content: '\u25BC';
|
|
1693
|
-
margin-left: auto;
|
|
1694
|
-
font-size: 0.8em;
|
|
1695
|
-
transition: transform 0.3s ease;
|
|
1696
|
-
opacity: 0.7;
|
|
1697
|
-
}
|
|
1698
|
-
.severity-section h2.collapsed::after {
|
|
1699
|
-
transform: rotate(-90deg);
|
|
1700
|
-
}
|
|
1701
|
-
.severity-section h2.collapsed {
|
|
1702
|
-
margin-bottom: 0;
|
|
1703
|
-
}
|
|
1704
|
-
.severity-content {
|
|
1705
|
-
max-height: none;
|
|
1706
|
-
overflow: visible;
|
|
1707
|
-
transition: max-height 0.4s ease-out, opacity 0.3s ease-out;
|
|
1708
|
-
opacity: 1;
|
|
1709
|
-
}
|
|
1710
|
-
.severity-content.collapsed {
|
|
1711
|
-
max-height: 0;
|
|
1712
|
-
overflow: hidden;
|
|
1713
|
-
opacity: 0;
|
|
1714
|
-
}
|
|
1715
|
-
.severity-high { background: #fef2f2; color: #dc2626; border: 1px solid #fecaca; }
|
|
1716
|
-
.severity-medium { background: #fffbeb; color: #d97706; border: 1px solid #fde68a; }
|
|
1717
|
-
.severity-low { background: #eff6ff; color: #2563eb; border: 1px solid #dbeafe; }
|
|
1718
|
-
.risk-group {
|
|
1719
|
-
background: white;
|
|
1720
|
-
border: 1px solid #e5e7eb;
|
|
1721
|
-
border-radius: 8px;
|
|
1722
|
-
padding: 24px;
|
|
1723
|
-
margin-bottom: 16px;
|
|
1724
|
-
}
|
|
1725
|
-
.risk-group h3 {
|
|
1726
|
-
font-size: 1rem;
|
|
1727
|
-
margin-bottom: 12px;
|
|
1728
|
-
color: #0f172a;
|
|
1729
|
-
font-weight: 600;
|
|
1730
|
-
}
|
|
1731
|
-
.risk-group .count {
|
|
1732
|
-
display: inline-block;
|
|
1733
|
-
background: #3b82f6;
|
|
1734
|
-
color: white;
|
|
1735
|
-
padding: 2px 10px;
|
|
1736
|
-
border-radius: 9999px;
|
|
1737
|
-
font-size: 0.75rem;
|
|
1738
|
-
font-weight: 600;
|
|
1739
|
-
margin-left: 8px;
|
|
1740
|
-
}
|
|
1741
|
-
.risk-group .impact {
|
|
1742
|
-
color: #64748b;
|
|
1743
|
-
margin-bottom: 16px;
|
|
1744
|
-
font-size: 0.875rem;
|
|
1745
|
-
line-height: 1.6;
|
|
1746
|
-
}
|
|
1747
|
-
.risk-group .urls {
|
|
1748
|
-
background: #f8fafc;
|
|
1749
|
-
border: 1px solid #e2e8f0;
|
|
1750
|
-
border-radius: 6px;
|
|
1751
|
-
padding: 16px;
|
|
1752
|
-
}
|
|
1753
|
-
.risk-group .urls h4 {
|
|
1754
|
-
font-size: 0.75rem;
|
|
1755
|
-
color: #64748b;
|
|
1756
|
-
margin-bottom: 12px;
|
|
1757
|
-
text-transform: uppercase;
|
|
1758
|
-
letter-spacing: 0.05em;
|
|
1759
|
-
font-weight: 600;
|
|
1760
|
-
}
|
|
1761
|
-
.risk-group .urls ul { list-style: none; }
|
|
1762
|
-
.risk-group .urls li {
|
|
1763
|
-
padding: 10px 12px;
|
|
1764
|
-
border-bottom: 1px solid #e2e8f0;
|
|
1765
|
-
font-family: 'SF Mono', 'Monaco', 'Cascadia Code', 'Consolas', monospace;
|
|
1766
|
-
font-size: 0.8125rem;
|
|
1767
|
-
color: #334155;
|
|
1768
|
-
background: white;
|
|
1769
|
-
margin-bottom: 4px;
|
|
1770
|
-
border-radius: 4px;
|
|
1771
|
-
word-break: break-all;
|
|
1772
|
-
line-height: 1.6;
|
|
1773
|
-
}
|
|
1774
|
-
.risk-group .urls li:last-child { border-bottom: none; margin-bottom: 0; }
|
|
1775
|
-
.risk-group .more {
|
|
1776
|
-
color: #3b82f6;
|
|
1777
|
-
font-style: italic;
|
|
1778
|
-
margin-top: 8px;
|
|
1779
|
-
font-size: 0.8125rem;
|
|
1780
|
-
}
|
|
1781
|
-
.download-btn {
|
|
1782
|
-
display: inline-block;
|
|
1783
|
-
background: #3b82f6;
|
|
1784
|
-
color: white;
|
|
1785
|
-
padding: 8px 16px;
|
|
1786
|
-
border-radius: 6px;
|
|
1787
|
-
text-decoration: none;
|
|
1788
|
-
font-size: 0.8125rem;
|
|
1789
|
-
font-weight: 500;
|
|
1790
|
-
margin-top: 12px;
|
|
1791
|
-
cursor: pointer;
|
|
1792
|
-
border: none;
|
|
1793
|
-
transition: all 0.15s;
|
|
1794
|
-
}
|
|
1795
|
-
.download-btn:hover {
|
|
1796
|
-
background: #2563eb;
|
|
1797
|
-
transform: translateY(-1px);
|
|
1798
|
-
box-shadow: 0 4px 6px -1px rgba(0,0,0,0.1);
|
|
1799
|
-
}
|
|
1800
|
-
.footer {
|
|
1801
|
-
background: #f8fafc;
|
|
1802
|
-
padding: 24px 40px;
|
|
1803
|
-
border-top: 1px solid #e5e7eb;
|
|
1804
|
-
text-align: center;
|
|
1805
|
-
color: #64748b;
|
|
1806
|
-
font-size: 0.8125rem;
|
|
1807
|
-
}
|
|
1808
|
-
.sitemaps {
|
|
1809
|
-
background: white;
|
|
1810
|
-
border: 1px solid #e5e7eb;
|
|
1811
|
-
border-radius: 8px;
|
|
1812
|
-
margin-bottom: 24px;
|
|
1813
|
-
overflow: hidden;
|
|
1814
|
-
}
|
|
1815
|
-
.sitemaps h3 {
|
|
1816
|
-
font-size: 1.125rem;
|
|
1817
|
-
font-weight: 600;
|
|
1818
|
-
padding: 16px 20px;
|
|
1819
|
-
margin: 0;
|
|
1820
|
-
color: #0f172a;
|
|
1821
|
-
background: #f8fafc;
|
|
1822
|
-
cursor: pointer;
|
|
1823
|
-
user-select: none;
|
|
1824
|
-
transition: all 0.15s;
|
|
1825
|
-
display: flex;
|
|
1826
|
-
align-items: center;
|
|
1827
|
-
gap: 10px;
|
|
1828
|
-
}
|
|
1829
|
-
.sitemaps h3:hover {
|
|
1830
|
-
background: #f1f5f9;
|
|
1831
|
-
}
|
|
1832
|
-
.sitemaps h3::after {
|
|
1833
|
-
content: '\u25BC';
|
|
1834
|
-
margin-left: auto;
|
|
1835
|
-
font-size: 0.8em;
|
|
1836
|
-
transition: transform 0.3s ease;
|
|
1837
|
-
opacity: 0.7;
|
|
1838
|
-
}
|
|
1839
|
-
.sitemaps h3.collapsed::after {
|
|
1840
|
-
transform: rotate(-90deg);
|
|
1841
|
-
}
|
|
1842
|
-
.sitemaps-content {
|
|
1843
|
-
max-height: none;
|
|
1844
|
-
overflow: visible;
|
|
1845
|
-
transition: max-height 0.4s ease-out, opacity 0.3s ease-out;
|
|
1846
|
-
opacity: 1;
|
|
1847
|
-
padding: 20px;
|
|
1848
|
-
}
|
|
1849
|
-
.sitemaps-content.collapsed {
|
|
1850
|
-
max-height: 0;
|
|
1851
|
-
overflow: hidden;
|
|
1852
|
-
opacity: 0;
|
|
1853
|
-
padding: 0 20px;
|
|
1854
|
-
}
|
|
1855
|
-
.sitemaps ul { list-style: none; }
|
|
1856
|
-
.sitemaps li {
|
|
1857
|
-
padding: 10px 12px;
|
|
1858
|
-
font-family: 'SF Mono', 'Monaco', 'Cascadia Code', 'Consolas', monospace;
|
|
1859
|
-
font-size: 0.8125rem;
|
|
1860
|
-
color: #475569;
|
|
1861
|
-
word-break: break-all;
|
|
1862
|
-
line-height: 1.6;
|
|
1863
|
-
background: #f8fafc;
|
|
1864
|
-
margin-bottom: 4px;
|
|
1865
|
-
border-radius: 4px;
|
|
1866
|
-
}
|
|
1867
|
-
.sitemaps li:last-child { margin-bottom: 0; }
|
|
1868
|
-
.errors-section {
|
|
1869
|
-
background: #fffbeb;
|
|
1870
|
-
border-left: 4px solid #f59e0b;
|
|
1871
|
-
padding: 20px;
|
|
1872
|
-
margin-bottom: 24px;
|
|
1873
|
-
border-radius: 8px;
|
|
1874
|
-
border: 1px solid #fde68a;
|
|
1875
|
-
}
|
|
1876
|
-
.errors-section h3 {
|
|
1877
|
-
color: #92400e;
|
|
1878
|
-
margin-bottom: 16px;
|
|
1879
|
-
font-size: 1.125rem;
|
|
1880
|
-
font-weight: 600;
|
|
1881
|
-
display: flex;
|
|
1882
|
-
align-items: center;
|
|
1883
|
-
gap: 8px;
|
|
1884
|
-
}
|
|
1885
|
-
.errors-section ul {
|
|
1886
|
-
list-style: none;
|
|
1887
|
-
padding: 0;
|
|
1888
|
-
}
|
|
1889
|
-
.errors-section li {
|
|
1890
|
-
padding: 12px;
|
|
1891
|
-
background: white;
|
|
1892
|
-
margin-bottom: 8px;
|
|
1893
|
-
border-radius: 6px;
|
|
1894
|
-
font-family: 'SF Mono', 'Monaco', 'Cascadia Code', 'Consolas', monospace;
|
|
1895
|
-
font-size: 0.8125rem;
|
|
1896
|
-
color: #78350f;
|
|
1897
|
-
word-break: break-all;
|
|
1898
|
-
line-height: 1.6;
|
|
1899
|
-
border: 1px solid #fde68a;
|
|
1900
|
-
}
|
|
1901
|
-
.errors-section li:last-child {
|
|
1902
|
-
margin-bottom: 0;
|
|
1903
|
-
}
|
|
1904
|
-
</style>
|
|
684
|
+
footer {
|
|
685
|
+
text-align: center;
|
|
686
|
+
padding: 40px;
|
|
687
|
+
color: var(--text-muted);
|
|
688
|
+
font-size: 12px;
|
|
689
|
+
border-top: 1px solid var(--border);
|
|
690
|
+
margin-top: 40px;
|
|
691
|
+
}
|
|
692
|
+
</style>
|
|
1905
693
|
</head>
|
|
1906
694
|
<body>
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
</div>
|
|
1915
|
-
|
|
1916
|
-
<div class="summary">
|
|
1917
|
-
<div class="summary-card">
|
|
1918
|
-
<div class="label">Sitemaps</div>
|
|
1919
|
-
<div class="value">${discoveryResult.sitemaps.length}</div>
|
|
1920
|
-
</div>
|
|
1921
|
-
<div class="summary-card">
|
|
1922
|
-
<div class="label">URLs Analyzed</div>
|
|
1923
|
-
<div class="value">${totalUrls.toLocaleString()}</div>
|
|
1924
|
-
</div>
|
|
1925
|
-
<div class="summary-card">
|
|
1926
|
-
<div class="label">Issues Found</div>
|
|
1927
|
-
<div class="value" style="color: ${riskyUrlCount > 0 ? "#dc2626" : "#059669"}">${riskyUrlCount}</div>
|
|
1928
|
-
</div>
|
|
1929
|
-
<div class="summary-card">
|
|
1930
|
-
<div class="label">Scan Time</div>
|
|
1931
|
-
<div class="value">${(summary.metadata.processingTime / 1e3).toFixed(1)}s</div>
|
|
1932
|
-
</div>
|
|
1933
|
-
</div>
|
|
1934
|
-
|
|
1935
|
-
<div class="content">
|
|
1936
|
-
${errors.length > 0 ? `
|
|
1937
|
-
<div class="errors-section">
|
|
1938
|
-
<h3>Parsing Errors & Warnings (${errors.length})</h3>
|
|
1939
|
-
<ul>
|
|
1940
|
-
${errors.map((err) => `<li>${err.message}</li>`).join("\n ")}
|
|
1941
|
-
</ul>
|
|
1942
|
-
</div>
|
|
1943
|
-
` : ""}
|
|
1944
|
-
|
|
1945
|
-
${discoveryResult.sitemaps.length > 0 ? `
|
|
1946
|
-
<div class="sitemaps">
|
|
1947
|
-
<h3 class="collapsed" onclick="toggleSection(this)">Sitemaps Discovered (${discoveryResult.sitemaps.length})</h3>
|
|
1948
|
-
<div class="sitemaps-content collapsed">
|
|
1949
|
-
<ul>
|
|
1950
|
-
${discoveryResult.sitemaps.map((s) => `<li>\u2022 ${s}</li>`).join("\n ")}
|
|
1951
|
-
</ul>
|
|
695
|
+
<header>
|
|
696
|
+
<div class="container">
|
|
697
|
+
<h1>Sitemap Analysis</h1>
|
|
698
|
+
<div class="meta">
|
|
699
|
+
<div>${esc(data.rootUrl)}</div>
|
|
700
|
+
<div>${esc(timestamp)}</div>
|
|
701
|
+
</div>
|
|
1952
702
|
</div>
|
|
1953
|
-
|
|
1954
|
-
` : ""}
|
|
703
|
+
</header>
|
|
1955
704
|
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
</div>
|
|
1961
|
-
` : ""}
|
|
1962
|
-
|
|
1963
|
-
${highSeverity.length > 0 ? `
|
|
1964
|
-
<div class="severity-section">
|
|
1965
|
-
<h2 class="severity-high" onclick="toggleSection(this)">High Severity (${highSeverity.reduce((sum, g) => sum + g.count, 0)} URLs)</h2>
|
|
1966
|
-
<div class="severity-content">
|
|
1967
|
-
${highSeverity.map((group) => renderRiskGroup(group, maxUrls)).join("\n ")}
|
|
705
|
+
<div class="summary-grid">
|
|
706
|
+
<div class="summary-card">
|
|
707
|
+
<h3>Sitemaps</h3>
|
|
708
|
+
<p>${data.discoveredSitemaps.length}</p>
|
|
1968
709
|
</div>
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
${mediumSeverity.length > 0 ? `
|
|
1973
|
-
<div class="severity-section">
|
|
1974
|
-
<h2 class="severity-medium" onclick="toggleSection(this)">Medium Severity (${mediumSeverity.reduce((sum, g) => sum + g.count, 0)} URLs)</h2>
|
|
1975
|
-
<div class="severity-content">
|
|
1976
|
-
${mediumSeverity.map((group) => renderRiskGroup(group, maxUrls)).join("\n ")}
|
|
710
|
+
<div class="summary-card">
|
|
711
|
+
<h3>URLs Analyzed</h3>
|
|
712
|
+
<p>${data.totalUrls.toLocaleString()}</p>
|
|
1977
713
|
</div>
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
|
|
714
|
+
<div class="summary-card highlight">
|
|
715
|
+
<h3>Issues Found</h3>
|
|
716
|
+
<p>${data.totalRisks}</p>
|
|
717
|
+
</div>
|
|
718
|
+
<div class="summary-card">
|
|
719
|
+
<h3>URLs Ignored</h3>
|
|
720
|
+
<p>${data.ignoredUrls.length}</p>
|
|
721
|
+
</div>
|
|
722
|
+
<div class="summary-card">
|
|
723
|
+
<h3>Scan Time</h3>
|
|
724
|
+
<p>${duration}s</p>
|
|
1986
725
|
</div>
|
|
1987
|
-
</div>
|
|
1988
|
-
` : ""}
|
|
1989
726
|
</div>
|
|
1990
727
|
|
|
1991
|
-
<div class="
|
|
1992
|
-
|
|
728
|
+
<div class="container">
|
|
729
|
+
<details>
|
|
730
|
+
<summary>Sitemaps Discovered (${data.discoveredSitemaps.length})</summary>
|
|
731
|
+
<div style="padding: 20px; background: var(--bg-light);">
|
|
732
|
+
${data.discoveredSitemaps.map((s) => `<div class="url-item">${esc(s)}</div>`).join("")}
|
|
733
|
+
</div>
|
|
734
|
+
</details>
|
|
735
|
+
|
|
736
|
+
${data.ignoredUrls.length > 0 ? `
|
|
737
|
+
<details>
|
|
738
|
+
<summary>Ignored URLs (${data.ignoredUrls.length})</summary>
|
|
739
|
+
<div style="padding: 20px; background: var(--bg-light);">
|
|
740
|
+
${data.ignoredUrls.map((u) => {
|
|
741
|
+
const suppressedRisks = u.risks.length > 0 ? ` <span style="color: var(--danger); font-size: 11px; font-weight: bold;">[Suppressed Risks: ${[...new Set(u.risks.map((r) => r.category))].map(esc).join(", ")}]</span>` : "";
|
|
742
|
+
const ignoredBy = u.ignoredBy ?? "Unknown";
|
|
743
|
+
return `<div class="url-item" title="Ignored by: ${esc(ignoredBy)}">${esc(u.loc)} <span style="color: var(--text-muted); font-size: 11px;">(by ${esc(ignoredBy)})</span>${suppressedRisks}</div>`;
|
|
744
|
+
}).join("")}
|
|
745
|
+
</div>
|
|
746
|
+
</details>
|
|
747
|
+
` : ""}
|
|
748
|
+
|
|
749
|
+
${Object.entries(categories).map(([category, findings]) => {
|
|
750
|
+
const totalCategoryUrls = Object.values(findings).reduce((acc, f) => acc + f.urls.length, 0);
|
|
751
|
+
return `
|
|
752
|
+
<div class="category-section">
|
|
753
|
+
<div class="category-header">
|
|
754
|
+
<span>${esc(category)} (${totalCategoryUrls} URLs)</span>
|
|
755
|
+
<span>\u25BC</span>
|
|
756
|
+
</div>
|
|
757
|
+
<div class="category-content">
|
|
758
|
+
${Object.entries(findings).map(([pattern, finding]) => `
|
|
759
|
+
<div class="finding-group">
|
|
760
|
+
<div class="finding-header">
|
|
761
|
+
<h4>${esc(pattern)}</h4>
|
|
762
|
+
<span class="badge">${finding.urls.length} URLs</span>
|
|
763
|
+
</div>
|
|
764
|
+
<div class="finding-description">
|
|
765
|
+
${esc(finding.reason)}
|
|
766
|
+
</div>
|
|
767
|
+
<div class="url-list">
|
|
768
|
+
${finding.urls.slice(0, 3).map((url) => `
|
|
769
|
+
<div class="url-item">${esc(url)}</div>
|
|
770
|
+
`).join("")}
|
|
771
|
+
</div>
|
|
772
|
+
${finding.urls.length > 3 ? `
|
|
773
|
+
<div class="more-count">... and ${finding.urls.length - 3} more</div>
|
|
774
|
+
` : ""}
|
|
775
|
+
<a href="#" class="btn" onclick="downloadUrls(${JSON.stringify(pattern).replace(/"/g, "&quot;")}, ${JSON.stringify(finding.urls).replace(/"/g, "&quot;")})">
|
|
776
|
+
\u{1F4E5} Download All ${finding.urls.length} URLs
|
|
777
|
+
</a>
|
|
778
|
+
</div>
|
|
779
|
+
`).join("")}
|
|
780
|
+
</div>
|
|
781
|
+
</div>
|
|
782
|
+
`;
|
|
783
|
+
}).join("")}
|
|
1993
784
|
</div>
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
// Create blob and download
|
|
2013
|
-
const blob = new Blob([textContent], { type: 'text/plain' });
|
|
2014
|
-
const url = URL.createObjectURL(blob);
|
|
2015
|
-
const a = document.createElement('a');
|
|
2016
|
-
a.href = url;
|
|
2017
|
-
a.download = categorySlug + '_urls.txt';
|
|
2018
|
-
document.body.appendChild(a);
|
|
2019
|
-
a.click();
|
|
2020
|
-
document.body.removeChild(a);
|
|
2021
|
-
URL.revokeObjectURL(url);
|
|
2022
|
-
}
|
|
2023
|
-
</script>
|
|
785
|
+
|
|
786
|
+
<footer>
|
|
787
|
+
Generated by sitemap-qa v1.0.0
|
|
788
|
+
</footer>
|
|
789
|
+
|
|
790
|
+
<script>
|
|
791
|
+
function downloadUrls(name, urls) {
|
|
792
|
+
const blob = new Blob([urls.join('\\n')], { type: 'text/plain' });
|
|
793
|
+
const url = window.URL.createObjectURL(blob);
|
|
794
|
+
const a = document.createElement('a');
|
|
795
|
+
a.href = url;
|
|
796
|
+
a.download = \`\${name.replace(/[^a-z0-9]/gi, '_').toLowerCase()}_urls.txt\`;
|
|
797
|
+
document.body.appendChild(a);
|
|
798
|
+
a.click();
|
|
799
|
+
window.URL.revokeObjectURL(url);
|
|
800
|
+
document.body.removeChild(a);
|
|
801
|
+
}
|
|
802
|
+
</script>
|
|
2024
803
|
</body>
|
|
2025
|
-
</html
|
|
2026
|
-
|
|
2027
|
-
}
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
|
|
2032
|
-
const categorySlug = group.category.toLowerCase();
|
|
2033
|
-
const allUrlsJson = JSON.stringify(group.allUrls);
|
|
2034
|
-
const encodedUrls = escapeHtml(allUrlsJson);
|
|
2035
|
-
return `<div class="risk-group">
|
|
2036
|
-
<h3>${categoryTitle} <span class="count">${group.count} URLs</span></h3>
|
|
2037
|
-
<div class="impact">${group.summary}</div>
|
|
2038
|
-
<div class="urls">
|
|
2039
|
-
<h4>Sample URLs</h4>
|
|
2040
|
-
<ul>
|
|
2041
|
-
${urlsToShow.map((url) => `<li>${escapeHtml(url)}</li>`).join("\n ")}
|
|
2042
|
-
</ul>
|
|
2043
|
-
${remaining > 0 ? `<div class="more">... and ${remaining} more</div>` : ""}
|
|
2044
|
-
<button class="download-btn" onclick="downloadUrls('${categorySlug}', '${encodedUrls}')">\u{1F4E5} Download All ${group.count} URLs</button>
|
|
2045
|
-
</div>
|
|
2046
|
-
</div>`;
|
|
2047
|
-
}
|
|
2048
|
-
function escapeHtml(text) {
|
|
2049
|
-
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;");
|
|
2050
|
-
}
|
|
2051
|
-
async function writeHtmlReport(summary, discoveryResult, totalUrls, config, outputPath, errors, options = {}) {
|
|
2052
|
-
const htmlContent = generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options);
|
|
2053
|
-
await fs.writeFile(outputPath, htmlContent, "utf-8");
|
|
2054
|
-
}
|
|
804
|
+
</html>
|
|
805
|
+
`;
|
|
806
|
+
}
|
|
807
|
+
escapeHtml(str) {
|
|
808
|
+
return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;");
|
|
809
|
+
}
|
|
810
|
+
};
|
|
2055
811
|
|
|
2056
812
|
// src/commands/analyze.ts
|
|
2057
|
-
var analyzeCommand = new Command("analyze").description("Analyze sitemap for
|
|
2058
|
-
|
|
813
|
+
var analyzeCommand = new Command("analyze").description("Analyze a sitemap for potential risks").argument("<url>", "Root sitemap URL").option("-c, --config <path>", "Path to sitemap-qa.yaml").option("-o, --output <format>", "Output format (json, html, all)").option("-d, --out-dir <path>", "Directory to save reports").action(async (url, options) => {
|
|
814
|
+
const startTime = /* @__PURE__ */ new Date();
|
|
815
|
+
const config = ConfigLoader.load(options.config);
|
|
816
|
+
const outDir = options.outDir || config.outDir || ".";
|
|
817
|
+
const outputFormat = options.output || config.outputFormat || "all";
|
|
818
|
+
const extractor = new ExtractorService();
|
|
819
|
+
const matcher = new MatcherService(config, url);
|
|
820
|
+
const urlsWithRisks = [];
|
|
821
|
+
const ignoredUrls = [];
|
|
822
|
+
let totalUrls = 0;
|
|
823
|
+
let totalRisks = 0;
|
|
824
|
+
console.log(chalk3.blue(`
|
|
825
|
+
\u{1F680} Starting analysis of ${url}...`));
|
|
2059
826
|
try {
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
});
|
|
2073
|
-
config = loadedConfig;
|
|
2074
|
-
console.log(`
|
|
2075
|
-
\u{1F50D} Analyzing ${url}...
|
|
2076
|
-
`);
|
|
2077
|
-
const result = await runAnalysisPipeline(url, config);
|
|
2078
|
-
await fs2.mkdir(config.outputDir, { recursive: true });
|
|
2079
|
-
if (options.output === "json") {
|
|
2080
|
-
const jsonReport = generateJsonReport(
|
|
2081
|
-
result.summary,
|
|
2082
|
-
result.discoveryResult,
|
|
2083
|
-
{ totalCount: result.totalUrls, uniqueUrls: [], errors: [] },
|
|
2084
|
-
result.riskGroups,
|
|
2085
|
-
config,
|
|
2086
|
-
result.executionTime,
|
|
2087
|
-
{ pretty: true, indent: 2 }
|
|
2088
|
-
);
|
|
2089
|
-
console.log("\n" + jsonReport);
|
|
2090
|
-
if (options.outputFile) {
|
|
2091
|
-
const jsonFilePath = `${config.outputDir}/${options.outputFile}`;
|
|
2092
|
-
await fs2.writeFile(jsonFilePath, jsonReport, "utf-8");
|
|
2093
|
-
console.log(`
|
|
2094
|
-
\u{1F4C4} JSON report saved to: ${chalk.cyan(jsonFilePath)}`);
|
|
827
|
+
for await (const urlObj of extractor.extract(url)) {
|
|
828
|
+
totalUrls++;
|
|
829
|
+
const risks = matcher.match(urlObj);
|
|
830
|
+
if (risks.length > 0) {
|
|
831
|
+
urlObj.risks = risks;
|
|
832
|
+
urlsWithRisks.push(urlObj);
|
|
833
|
+
totalRisks += risks.length;
|
|
834
|
+
} else if (urlObj.ignored) {
|
|
835
|
+
ignoredUrls.push(urlObj);
|
|
836
|
+
}
|
|
837
|
+
if (totalUrls % 100 === 0) {
|
|
838
|
+
process.stdout.write(chalk3.gray(`\rProcessed ${totalUrls} URLs...`));
|
|
2095
839
|
}
|
|
2096
|
-
} else {
|
|
2097
|
-
showCliSummary(result);
|
|
2098
|
-
const htmlFileName = options.outputFile || `sitemap-qa-report-${Date.now()}.html`;
|
|
2099
|
-
const htmlFilePath = `${config.outputDir}/${htmlFileName}`;
|
|
2100
|
-
await writeHtmlReport(
|
|
2101
|
-
result.summary,
|
|
2102
|
-
result.discoveryResult,
|
|
2103
|
-
result.totalUrls,
|
|
2104
|
-
config,
|
|
2105
|
-
htmlFilePath,
|
|
2106
|
-
result.errors,
|
|
2107
|
-
{ maxUrlsPerGroup: 10 }
|
|
2108
|
-
);
|
|
2109
|
-
console.log(`
|
|
2110
|
-
\u{1F4C4} Full report saved to: ${chalk.cyan(htmlFilePath)}`);
|
|
2111
840
|
}
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
);
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
}
|
|
2130
|
-
}
|
|
2131
|
-
function showCliSummary(result) {
|
|
2132
|
-
const riskyUrlCount = result.summary.categoryInsights.reduce((sum, g) => sum + g.count, 0);
|
|
2133
|
-
console.log(chalk.dim("\u2500".repeat(50)));
|
|
2134
|
-
if (riskyUrlCount === 0) {
|
|
2135
|
-
console.log(chalk.green("No issues found - sitemap looks clean!"));
|
|
2136
|
-
} else {
|
|
2137
|
-
const { high, medium, low } = result.summary.severityBreakdown;
|
|
2138
|
-
const severityParts = [];
|
|
2139
|
-
if (high > 0) severityParts.push(chalk.red(`High: ${high}`));
|
|
2140
|
-
if (medium > 0) severityParts.push(chalk.yellow(`Medium: ${medium}`));
|
|
2141
|
-
if (low > 0) severityParts.push(chalk.blue(`Low: ${low}`));
|
|
2142
|
-
const severitySummary = severityParts.length > 0 ? ` (${severityParts.join(", ")})` : "";
|
|
2143
|
-
console.log(chalk.yellow(`\u26A0\uFE0F ${riskyUrlCount} risky URLs found${severitySummary}`));
|
|
2144
|
-
}
|
|
2145
|
-
console.log("");
|
|
2146
|
-
}
|
|
2147
|
-
async function runAnalysisPipeline(url, config) {
|
|
2148
|
-
const overallStartTime = Date.now();
|
|
2149
|
-
const phaseTimings = [];
|
|
2150
|
-
const errors = [];
|
|
2151
|
-
const showProgress = !config.silent && config.progressBar !== false && process.stdout.isTTY;
|
|
2152
|
-
let phaseStart = Date.now();
|
|
2153
|
-
const discoverySpinner = showProgress ? ora({ text: "Discovering sitemaps...", color: "cyan" }).start() : null;
|
|
2154
|
-
const discoveryResult = await discoverSitemaps(url, config);
|
|
2155
|
-
if (discoverySpinner) {
|
|
2156
|
-
discoverySpinner.stop();
|
|
2157
|
-
}
|
|
2158
|
-
phaseTimings.push({
|
|
2159
|
-
name: "Discovery",
|
|
2160
|
-
startTime: phaseStart,
|
|
2161
|
-
endTime: Date.now(),
|
|
2162
|
-
duration: Date.now() - phaseStart
|
|
2163
|
-
});
|
|
2164
|
-
if (discoveryResult.accessIssues.length > 0) {
|
|
2165
|
-
if (!config.silent) {
|
|
2166
|
-
console.warn(chalk.yellow(`\u26A0\uFE0F Warning: ${discoveryResult.accessIssues.length} sitemap(s) are access-blocked`));
|
|
841
|
+
process.stdout.write("\n");
|
|
842
|
+
const endTime = /* @__PURE__ */ new Date();
|
|
843
|
+
const reportData = {
|
|
844
|
+
rootUrl: url,
|
|
845
|
+
discoveredSitemaps: extractor.getDiscoveredSitemaps(),
|
|
846
|
+
totalUrls,
|
|
847
|
+
totalRisks,
|
|
848
|
+
urlsWithRisks,
|
|
849
|
+
ignoredUrls,
|
|
850
|
+
startTime,
|
|
851
|
+
endTime
|
|
852
|
+
};
|
|
853
|
+
const reporters = [new ConsoleReporter()];
|
|
854
|
+
await fs4.mkdir(outDir, { recursive: true });
|
|
855
|
+
if (outputFormat === "json" || outputFormat === "all") {
|
|
856
|
+
const jsonPath = path2.join(outDir, "sitemap-qa-report.json");
|
|
857
|
+
reporters.push(new JsonReporter(jsonPath));
|
|
2167
858
|
}
|
|
2168
|
-
|
|
2169
|
-
|
|
859
|
+
if (outputFormat === "html" || outputFormat === "all") {
|
|
860
|
+
const htmlPath = path2.join(outDir, "sitemap-qa-report.html");
|
|
861
|
+
reporters.push(new HtmlReporter(htmlPath));
|
|
2170
862
|
}
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
throw new Error(`No sitemaps found at ${url}. Tried: /sitemap.xml, /sitemap_index.xml, /robots.txt`);
|
|
2174
|
-
}
|
|
2175
|
-
phaseStart = Date.now();
|
|
2176
|
-
let extractionResult;
|
|
2177
|
-
if (showProgress && discoveryResult.sitemaps.length > 10) {
|
|
2178
|
-
const parseBar = new cliProgress.SingleBar({
|
|
2179
|
-
format: "{bar} {percentage}% | {value}/{total} | ETA: {eta}s | {speed} sitemaps/sec",
|
|
2180
|
-
barCompleteChar: "\u2588",
|
|
2181
|
-
barIncompleteChar: "\u2591",
|
|
2182
|
-
hideCursor: true
|
|
2183
|
-
});
|
|
2184
|
-
parseBar.start(discoveryResult.sitemaps.length, 0, { speed: "0" });
|
|
2185
|
-
extractionResult = await extractAllUrls(
|
|
2186
|
-
discoveryResult.sitemaps,
|
|
2187
|
-
config,
|
|
2188
|
-
(completed, total) => {
|
|
2189
|
-
const elapsed = (Date.now() - phaseStart) / 1e3;
|
|
2190
|
-
const speed = elapsed > 0 ? (completed / elapsed).toFixed(1) : "0";
|
|
2191
|
-
parseBar.update(completed, { speed });
|
|
2192
|
-
}
|
|
2193
|
-
);
|
|
2194
|
-
parseBar.stop();
|
|
2195
|
-
} else {
|
|
2196
|
-
extractionResult = await extractAllUrls(discoveryResult.sitemaps, config);
|
|
2197
|
-
}
|
|
2198
|
-
phaseTimings.push({
|
|
2199
|
-
name: "Parsing",
|
|
2200
|
-
startTime: phaseStart,
|
|
2201
|
-
endTime: Date.now(),
|
|
2202
|
-
duration: Date.now() - phaseStart
|
|
2203
|
-
});
|
|
2204
|
-
if (extractionResult.errors.length > 0) {
|
|
2205
|
-
for (const err of extractionResult.errors) {
|
|
2206
|
-
if (typeof err === "string") {
|
|
2207
|
-
errors.push(new Error(err));
|
|
2208
|
-
} else {
|
|
2209
|
-
errors.push(err);
|
|
2210
|
-
}
|
|
863
|
+
for (const reporter of reporters) {
|
|
864
|
+
await reporter.generate(reportData);
|
|
2211
865
|
}
|
|
2212
|
-
|
|
2213
|
-
|
|
2214
|
-
throw new Error("No URLs extracted from sitemaps");
|
|
2215
|
-
}
|
|
2216
|
-
phaseStart = Date.now();
|
|
2217
|
-
const consolidatedResult = consolidateUrls(extractionResult.allUrls);
|
|
2218
|
-
phaseTimings.push({
|
|
2219
|
-
name: "Deduplication",
|
|
2220
|
-
startTime: phaseStart,
|
|
2221
|
-
endTime: Date.now(),
|
|
2222
|
-
duration: Date.now() - phaseStart
|
|
2223
|
-
});
|
|
2224
|
-
const duplicatesRemoved = extractionResult.allUrls.length - consolidatedResult.uniqueUrls.length;
|
|
2225
|
-
const duplicatePercentage = duplicatesRemoved / extractionResult.allUrls.length * 100;
|
|
2226
|
-
if (!config.silent) {
|
|
2227
|
-
if (duplicatesRemoved > 100 || duplicatePercentage > 1) {
|
|
2228
|
-
console.log(chalk.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
|
|
866
|
+
if (totalRisks > 0) {
|
|
867
|
+
process.exit(1);
|
|
2229
868
|
} else {
|
|
2230
|
-
|
|
869
|
+
process.exit(0);
|
|
2231
870
|
}
|
|
871
|
+
} catch (error) {
|
|
872
|
+
console.error(chalk3.red("\nAnalysis failed:"), error);
|
|
873
|
+
process.exit(1);
|
|
2232
874
|
}
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
console.error("\nSuggestions:");
|
|
2299
|
-
console.error(" \u2022 Check your internet connection");
|
|
2300
|
-
console.error(" \u2022 Verify the URL is accessible");
|
|
2301
|
-
console.error(" \u2022 Try increasing the timeout with --timeout option");
|
|
2302
|
-
}
|
|
2303
|
-
} else {
|
|
2304
|
-
console.error("Unknown error occurred");
|
|
2305
|
-
console.error(String(error));
|
|
875
|
+
});
|
|
876
|
+
|
|
877
|
+
// src/commands/init.ts
|
|
878
|
+
import { Command as Command2 } from "commander";
|
|
879
|
+
import fs5 from "fs";
|
|
880
|
+
import path3 from "path";
|
|
881
|
+
import chalk4 from "chalk";
|
|
882
|
+
var DEFAULT_CONFIG = `# sitemap-qa configuration
|
|
883
|
+
# This file defines the risk categories and patterns to monitor.
|
|
884
|
+
|
|
885
|
+
# Tool Settings
|
|
886
|
+
outDir: "./sitemap-qa/report"
|
|
887
|
+
outputFormat: "all" # Options: json, html, all
|
|
888
|
+
enforceDomainConsistency: true
|
|
889
|
+
|
|
890
|
+
# Risk Categories
|
|
891
|
+
# Each category contains a list of patterns to match against URLs found in sitemaps.
|
|
892
|
+
# Patterns can be:
|
|
893
|
+
# - literal: Exact string match
|
|
894
|
+
# - glob: Glob pattern (e.g., **/admin/**)
|
|
895
|
+
# - regex: Regular expression (e.g., /\\/v[0-9]+\\//)
|
|
896
|
+
|
|
897
|
+
# Acceptable Patterns
|
|
898
|
+
# URLs matching these patterns will be ignored and not flagged as risks.
|
|
899
|
+
acceptable_patterns:
|
|
900
|
+
- type: "literal"
|
|
901
|
+
value: "/acceptable-path"
|
|
902
|
+
reason: "Example of an acceptable path that should not be flagged."
|
|
903
|
+
- type: "glob"
|
|
904
|
+
value: "**/public-docs/**"
|
|
905
|
+
reason: "Public documentation is always acceptable."
|
|
906
|
+
|
|
907
|
+
policies:
|
|
908
|
+
- category: "Security & Admin"
|
|
909
|
+
patterns:
|
|
910
|
+
- type: "glob"
|
|
911
|
+
value: "**/admin/**"
|
|
912
|
+
reason: "Administrative interfaces should not be publicly indexed."
|
|
913
|
+
- type: "glob"
|
|
914
|
+
value: "**/.env*"
|
|
915
|
+
reason: "Environment files contain sensitive secrets."
|
|
916
|
+
- type: "literal"
|
|
917
|
+
value: "/wp-admin"
|
|
918
|
+
reason: "WordPress admin paths are common attack vectors."
|
|
919
|
+
|
|
920
|
+
- category: "Environment Leakage"
|
|
921
|
+
patterns:
|
|
922
|
+
- type: "glob"
|
|
923
|
+
value: "**/staging.**"
|
|
924
|
+
reason: "Staging environments should be restricted."
|
|
925
|
+
- type: "glob"
|
|
926
|
+
value: "**/dev.**"
|
|
927
|
+
reason: "Development subdomains detected in production sitemap."
|
|
928
|
+
|
|
929
|
+
- category: "Sensitive Files"
|
|
930
|
+
patterns:
|
|
931
|
+
- type: "glob"
|
|
932
|
+
value: "**/*.{sql,bak,zip,tar.gz}"
|
|
933
|
+
reason: "Archive or database backup files exposed."
|
|
934
|
+
`;
|
|
935
|
+
var initCommand = new Command2("init").description("Initialize a default sitemap-qa.yaml configuration file").action(() => {
|
|
936
|
+
const configPath = path3.join(process.cwd(), "sitemap-qa.yaml");
|
|
937
|
+
if (fs5.existsSync(configPath)) {
|
|
938
|
+
console.error(chalk4.red(`Error: ${configPath} already exists.`));
|
|
939
|
+
process.exit(1);
|
|
2306
940
|
}
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
for (const timing of timings) {
|
|
2314
|
-
const seconds = (timing.duration / 1e3).toFixed(1);
|
|
2315
|
-
const percentage = (timing.duration / totalTime * 100).toFixed(1);
|
|
2316
|
-
const bar = "\u2022";
|
|
2317
|
-
console.log(` ${bar} ${timing.name.padEnd(15)}: ${seconds.padStart(5)}s (${percentage.padStart(5)}%)`);
|
|
941
|
+
try {
|
|
942
|
+
fs5.writeFileSync(configPath, DEFAULT_CONFIG, "utf8");
|
|
943
|
+
console.log(chalk4.green(`Successfully created ${configPath}`));
|
|
944
|
+
} catch (error) {
|
|
945
|
+
console.error(chalk4.red("Failed to create configuration file:"), error);
|
|
946
|
+
process.exit(1);
|
|
2318
947
|
}
|
|
2319
|
-
|
|
2320
|
-
}
|
|
2321
|
-
async function saveBenchmark(timings, url, totalTime, sitemapCount, urlCount, config) {
|
|
2322
|
-
const benchmark = {
|
|
2323
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2324
|
-
url,
|
|
2325
|
-
total_duration_ms: totalTime,
|
|
2326
|
-
phases: timings.map((t) => ({
|
|
2327
|
-
name: t.name.toLowerCase(),
|
|
2328
|
-
start_ms: t.startTime,
|
|
2329
|
-
end_ms: t.endTime,
|
|
2330
|
-
duration_ms: t.duration
|
|
2331
|
-
})),
|
|
2332
|
-
metrics: {
|
|
2333
|
-
sitemaps_processed: sitemapCount,
|
|
2334
|
-
urls_analyzed: urlCount,
|
|
2335
|
-
throughput: {
|
|
2336
|
-
urls_per_second: Math.round(urlCount / totalTime * 1e3),
|
|
2337
|
-
sitemaps_per_second: (sitemapCount / totalTime * 1e3).toFixed(2)
|
|
2338
|
-
}
|
|
2339
|
-
},
|
|
2340
|
-
system_info: {
|
|
2341
|
-
cpu_count: os2.cpus().length,
|
|
2342
|
-
node_version: process.version,
|
|
2343
|
-
platform: process.platform,
|
|
2344
|
-
memory_total_mb: Math.round(os2.totalmem() / 1024 / 1024)
|
|
2345
|
-
},
|
|
2346
|
-
config: {
|
|
2347
|
-
discovery_concurrency: config.discoveryConcurrency,
|
|
2348
|
-
parsing_concurrency: config.parsingConcurrency,
|
|
2349
|
-
risk_detection_concurrency: config.riskDetectionConcurrency,
|
|
2350
|
-
risk_detection_batch_size: config.riskDetectionBatchSize
|
|
2351
|
-
}
|
|
2352
|
-
};
|
|
2353
|
-
const filename = `performance-profile-${Date.now()}.json`;
|
|
2354
|
-
await fs2.writeFile(filename, JSON.stringify(benchmark, null, 2));
|
|
2355
|
-
console.log(chalk.blue(`\u{1F4CA} Benchmark saved to: ${filename}`));
|
|
2356
|
-
}
|
|
948
|
+
});
|
|
2357
949
|
|
|
2358
950
|
// src/index.ts
|
|
2359
|
-
var program = new
|
|
951
|
+
var program = new Command3();
|
|
2360
952
|
program.name("sitemap-qa").version("1.0.0").description("sitemap analysis for QA teams");
|
|
2361
953
|
program.addCommand(analyzeCommand);
|
|
954
|
+
program.addCommand(initCommand);
|
|
2362
955
|
process.on("unhandledRejection", (reason, promise) => {
|
|
2363
956
|
console.error("Unhandled Rejection at:", promise, "reason:", reason);
|
|
2364
957
|
process.exit(1);
|