@akotliar/sitemap-qa 1.0.0-alpha.4 → 1.0.0-alpha.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -43
- package/dist/index.js +318 -357
- package/dist/index.js.map +1 -1
- package/dist/reporters/templates/partials/finding.hbs +20 -0
- package/dist/reporters/templates/partials/header.hbs +9 -0
- package/dist/reporters/templates/partials/summary.hbs +22 -0
- package/dist/reporters/templates/report.hbs +293 -0
- package/package.json +4 -1
package/dist/index.js
CHANGED
|
@@ -6,7 +6,7 @@ import { Command as Command3 } from "commander";
|
|
|
6
6
|
// src/commands/analyze.ts
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
import chalk3 from "chalk";
|
|
9
|
-
import
|
|
9
|
+
import path3 from "path";
|
|
10
10
|
import fs4 from "fs/promises";
|
|
11
11
|
|
|
12
12
|
// src/config/loader.ts
|
|
@@ -27,13 +27,18 @@ var PolicySchema = z.object({
|
|
|
27
27
|
patterns: z.array(PatternSchema).min(1, "At least one pattern is required per category")
|
|
28
28
|
});
|
|
29
29
|
var ConfigSchema = z.object({
|
|
30
|
+
acceptable_patterns: z.array(PatternSchema).default([]),
|
|
30
31
|
policies: z.array(PolicySchema).default([]),
|
|
31
32
|
outDir: z.string().optional(),
|
|
32
|
-
outputFormat: z.enum(["json", "html", "all"]).default("all")
|
|
33
|
+
outputFormat: z.enum(["json", "html", "all"]).default("all"),
|
|
34
|
+
enforceDomainConsistency: z.boolean().default(true)
|
|
33
35
|
});
|
|
34
36
|
|
|
35
37
|
// src/config/defaults.ts
|
|
36
38
|
var DEFAULT_POLICIES = {
|
|
39
|
+
acceptable_patterns: [],
|
|
40
|
+
outputFormat: "all",
|
|
41
|
+
enforceDomainConsistency: true,
|
|
37
42
|
policies: [
|
|
38
43
|
{
|
|
39
44
|
category: "Security & Admin",
|
|
@@ -89,7 +94,12 @@ var ConfigLoader = class {
|
|
|
89
94
|
static DEFAULT_CONFIG_PATH = "sitemap-qa.yaml";
|
|
90
95
|
static load(configPath) {
|
|
91
96
|
const targetPath = configPath || path.join(process.cwd(), this.DEFAULT_CONFIG_PATH);
|
|
92
|
-
let userConfig = {
|
|
97
|
+
let userConfig = {
|
|
98
|
+
acceptable_patterns: [],
|
|
99
|
+
policies: [],
|
|
100
|
+
outputFormat: "all",
|
|
101
|
+
enforceDomainConsistency: true
|
|
102
|
+
};
|
|
93
103
|
if (fs.existsSync(targetPath)) {
|
|
94
104
|
try {
|
|
95
105
|
const fileContent = fs.readFileSync(targetPath, "utf8");
|
|
@@ -101,15 +111,18 @@ var ConfigLoader = class {
|
|
|
101
111
|
console.error(chalk.yellow(` - ${issue.path.join(".")}: ${issue.message}`));
|
|
102
112
|
});
|
|
103
113
|
process.exit(2);
|
|
114
|
+
return DEFAULT_POLICIES;
|
|
104
115
|
}
|
|
105
116
|
userConfig = result.data;
|
|
106
117
|
} catch (error) {
|
|
107
118
|
console.error(chalk.red("Failed to load configuration:"), error);
|
|
108
119
|
process.exit(2);
|
|
120
|
+
return DEFAULT_POLICIES;
|
|
109
121
|
}
|
|
110
122
|
} else if (configPath) {
|
|
111
123
|
console.error(chalk.red(`Error: Configuration file not found at ${targetPath}`));
|
|
112
124
|
process.exit(2);
|
|
125
|
+
return DEFAULT_POLICIES;
|
|
113
126
|
}
|
|
114
127
|
return this.mergeConfigs(DEFAULT_POLICIES, userConfig);
|
|
115
128
|
}
|
|
@@ -125,6 +138,7 @@ var ConfigLoader = class {
|
|
|
125
138
|
});
|
|
126
139
|
const merged = {
|
|
127
140
|
...defaults,
|
|
141
|
+
acceptable_patterns: [...defaults.acceptable_patterns || [], ...user.acceptable_patterns || []],
|
|
128
142
|
policies: mergedPolicies
|
|
129
143
|
};
|
|
130
144
|
if (user.outDir !== void 0) {
|
|
@@ -133,15 +147,90 @@ var ConfigLoader = class {
|
|
|
133
147
|
if (user.outputFormat !== void 0) {
|
|
134
148
|
merged.outputFormat = user.outputFormat;
|
|
135
149
|
}
|
|
150
|
+
if (user.enforceDomainConsistency !== void 0) {
|
|
151
|
+
merged.enforceDomainConsistency = user.enforceDomainConsistency;
|
|
152
|
+
}
|
|
136
153
|
return merged;
|
|
137
154
|
}
|
|
138
155
|
};
|
|
139
156
|
|
|
140
157
|
// src/core/discovery.ts
|
|
141
158
|
import { fetch } from "undici";
|
|
159
|
+
import { Readable } from "stream";
|
|
160
|
+
|
|
161
|
+
// src/core/xml-parser.ts
|
|
142
162
|
import { XMLParser } from "fast-xml-parser";
|
|
143
|
-
|
|
163
|
+
import { gunzipSync } from "zlib";
|
|
164
|
+
var StreamingXmlParser = class {
  /** fast-xml-parser instance configured in the constructor. */
  parser;
  /** Decoded XML text of the document most recently read by parse(); undefined if none or if the last read failed. */
  lastParsedXml;
  constructor() {
    this.parser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: "@_",
      // Ensure we always get arrays for sitemap and url tags
      isArray: (name) => name === "sitemap" || name === "url",
      removeNSPrefix: true
    });
  }
  /**
   * Parses a sitemap document and yields one typed entry per element that has a `loc`:
   * `{ type: "sitemap", loc }` for sitemap-index children, then
   * `{ type: "url", loc, lastmod, changefreq, priority }` for urlset children.
   *
   * Note: the input is read completely into memory and parsed in one pass before
   * the first entry is yielded; the generator interface only lets consumers iterate
   * without building their own list.
   *
   * @param stream - Either the XML text itself (string) or an async-iterable of
   *   byte chunks (gzip-compressed content is detected and decompressed).
   * @throws {Error} If gzip decompression fails (propagated from streamToString).
   */
  async *parse(stream) {
    // Drop the previous document first so a failed read cannot leave stale XML
    // behind for getLastParsedXml() callers.
    this.lastParsedXml = void 0;
    const xmlData = typeof stream === "string" ? stream : await this.streamToString(stream);
    this.lastParsedXml = xmlData;
    const jsonObj = this.parser.parse(xmlData);
    for (const sitemap of jsonObj.sitemapindex?.sitemap || []) {
      if (sitemap?.loc) {
        yield { type: "sitemap", loc: sitemap.loc };
      }
    }
    for (const url of jsonObj.urlset?.url || []) {
      if (url?.loc) {
        yield {
          type: "url",
          loc: url.loc,
          lastmod: url.lastmod,
          changefreq: url.changefreq,
          priority: url.priority
        };
      }
    }
  }
  /**
   * Get the last parsed XML data (useful to avoid re-fetching).
   * Returns undefined when nothing has been parsed yet or the most recent read failed.
   */
  getLastParsedXml() {
    return this.lastParsedXml;
  }
  /**
   * Collects every chunk of an async-iterable into one buffer and decodes it as UTF-8.
   * Content starting with the gzip magic bytes (0x1f 0x8b) is gunzipped first.
   *
   * @param stream - Async-iterable of chunks accepted by Buffer.from.
   * @returns The decoded text.
   * @throws {Error} "Failed to decompress gzipped content: ..." with the zlib error as `cause`.
   */
  async streamToString(stream) {
    const chunks = [];
    for await (const chunk of stream) {
      chunks.push(Buffer.from(chunk));
    }
    const buffer = Buffer.concat(chunks);
    const isGzip = buffer.length >= 2 && buffer[0] === 0x1f && buffer[1] === 0x8b;
    if (!isGzip) {
      return buffer.toString("utf8");
    }
    try {
      return gunzipSync(buffer).toString("utf8");
    } catch (error) {
      // Keep the original message format; attach the zlib error for debugging.
      throw new Error(`Failed to decompress gzipped content: ${error}`, { cause: error });
    }
  }
};
|
|
230
|
+
|
|
231
|
+
// src/core/discovery.ts
|
|
232
|
+
var DiscoveryService = class {
|
|
233
|
+
xmlParser;
|
|
145
234
|
visited = /* @__PURE__ */ new Set();
|
|
146
235
|
STANDARD_PATHS = [
|
|
147
236
|
"/sitemap.xml",
|
|
@@ -151,10 +240,7 @@ var DiscoveryService = class {
|
|
|
151
240
|
"/sitemap.xml.gz"
|
|
152
241
|
];
|
|
153
242
|
constructor() {
|
|
154
|
-
this.
|
|
155
|
-
ignoreAttributes: false,
|
|
156
|
-
attributeNamePrefix: "@_"
|
|
157
|
-
});
|
|
243
|
+
this.xmlParser = new StreamingXmlParser();
|
|
158
244
|
}
|
|
159
245
|
/**
|
|
160
246
|
* Attempts to find sitemaps for a given base website URL.
|
|
@@ -176,9 +262,9 @@ var DiscoveryService = class {
|
|
|
176
262
|
} catch (e) {
|
|
177
263
|
}
|
|
178
264
|
if (sitemaps.size === 0) {
|
|
179
|
-
for (const
|
|
265
|
+
for (const path5 of this.STANDARD_PATHS) {
|
|
180
266
|
try {
|
|
181
|
-
const sitemapUrl = `${origin}${
|
|
267
|
+
const sitemapUrl = `${origin}${path5}`;
|
|
182
268
|
const response = await fetch(sitemapUrl, { method: "HEAD" });
|
|
183
269
|
if (response.status === 200) {
|
|
184
270
|
sitemaps.add(sitemapUrl);
|
|
@@ -191,6 +277,7 @@ var DiscoveryService = class {
|
|
|
191
277
|
}
|
|
192
278
|
/**
|
|
193
279
|
* Recursively discovers all leaf sitemaps from a root URL.
|
|
280
|
+
* Returns both the sitemap URL and its XML data to avoid duplicate fetches.
|
|
194
281
|
*/
|
|
195
282
|
async *discover(rootUrl) {
|
|
196
283
|
const queue = [rootUrl];
|
|
@@ -201,17 +288,39 @@ var DiscoveryService = class {
|
|
|
201
288
|
try {
|
|
202
289
|
const response = await fetch(currentUrl);
|
|
203
290
|
if (response.status !== 200) continue;
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
291
|
+
let isIndex = false;
|
|
292
|
+
let isLeaf = false;
|
|
293
|
+
const childSitemaps = [];
|
|
294
|
+
let xmlData;
|
|
295
|
+
let source;
|
|
296
|
+
if (response.body) {
|
|
297
|
+
const nodeStream = Readable.fromWeb(response.body);
|
|
298
|
+
source = nodeStream;
|
|
299
|
+
} else {
|
|
300
|
+
xmlData = await response.text();
|
|
301
|
+
source = xmlData;
|
|
302
|
+
}
|
|
303
|
+
for await (const entry of this.xmlParser.parse(source)) {
|
|
304
|
+
if (entry.type === "sitemap") {
|
|
305
|
+
isIndex = true;
|
|
306
|
+
childSitemaps.push(entry.loc);
|
|
307
|
+
} else if (entry.type === "url") {
|
|
308
|
+
isLeaf = true;
|
|
212
309
|
}
|
|
213
|
-
}
|
|
214
|
-
|
|
310
|
+
}
|
|
311
|
+
if (isIndex) {
|
|
312
|
+
for (const loc of childSitemaps) {
|
|
313
|
+
queue.push(loc);
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
if (!xmlData) {
|
|
317
|
+
xmlData = this.xmlParser.getLastParsedXml() || "";
|
|
318
|
+
}
|
|
319
|
+
if (isLeaf || !isIndex && xmlData.includes("<urlset")) {
|
|
320
|
+
yield {
|
|
321
|
+
url: currentUrl,
|
|
322
|
+
xmlData
|
|
323
|
+
};
|
|
215
324
|
}
|
|
216
325
|
} catch (error) {
|
|
217
326
|
console.error(`Failed to fetch or parse sitemap at ${currentUrl}:`, error);
|
|
@@ -221,47 +330,78 @@ var DiscoveryService = class {
|
|
|
221
330
|
};
|
|
222
331
|
|
|
223
332
|
// src/core/parser.ts
|
|
224
|
-
import {
|
|
333
|
+
import { Readable as Readable2 } from "stream";
|
|
225
334
|
import { fetch as fetch2 } from "undici";
|
|
226
335
|
var SitemapParser = class {
  /** Shared XML parser used to turn sitemap text/streams into typed entries. */
  xmlParser;
  constructor() {
    this.xmlParser = new StreamingXmlParser();
  }
  /**
   * Parses a leaf sitemap and yields SitemapUrl objects.
   * Uses the shared StreamingXmlParser for consistent parsing.
   *
   * @param sitemapUrlOrData - Accepts one of three input types:
   *   - `string`: A URL string. The sitemap is fetched from this URL; any status
   *     other than 200 is treated as a failure.
   *   - `{ type: 'xmlData'; url: string; xmlData: string }`: A URL plus pre-fetched
   *     XML text, so no HTTP request is made.
   *   - `{ type: 'stream'; url: string; stream: ReadableStream | Readable }`: A URL
   *     plus a stream. Node.js Readable streams are used as-is; anything else is
   *     converted with `Readable.fromWeb`.
   *
   * @yields {SitemapUrl} One object per `url` entry with `loc`, `source` (the sitemap
   *   URL), `lastmod`, `changefreq`, `priority`, and an empty `risks` array.
   *   Non-`url` entries (e.g. nested sitemap references) are skipped.
   *
   * Failures (fetch error, non-200 status, parse error) are logged with
   * `console.error` and end the iteration; they are not rethrown.
   */
  async *parse(sitemapUrlOrData) {
    const sitemapUrl = typeof sitemapUrlOrData === "string" ? sitemapUrlOrData : sitemapUrlOrData.url;
    try {
      let source;
      if (typeof sitemapUrlOrData === "string") {
        const response = await fetch2(sitemapUrl);
        if (response.status !== 200) throw new Error(`Failed to fetch sitemap at ${sitemapUrl}: HTTP ${response.status}`);
        // Prefer the body stream; fall back to text when no body is exposed.
        source = response.body ? Readable2.fromWeb(response.body) : await response.text();
      } else if (sitemapUrlOrData.type === "stream") {
        const { stream } = sitemapUrlOrData;
        source = stream instanceof Readable2 ? stream : Readable2.fromWeb(stream);
      } else {
        source = sitemapUrlOrData.xmlData;
      }
      for await (const entry of this.xmlParser.parse(source)) {
        if (entry.type === "url") {
          yield {
            loc: entry.loc,
            source: sitemapUrl,
            lastmod: entry.lastmod,
            changefreq: entry.changefreq,
            priority: entry.priority,
            risks: []
          };
        }
      }
    } catch (error) {
      console.error(`Failed to parse sitemap at ${sitemapUrl}:`, error);
    }
  }
  /**
   * Concatenates all chunks of an async-iterable and decodes them as UTF-8.
   * Not called by parse() in this class; unlike StreamingXmlParser.streamToString
   * it performs no gzip detection.
   */
  async streamToString(stream) {
    const chunks = [];
    for await (const chunk of stream) {
      chunks.push(Buffer.from(chunk));
    }
    return Buffer.concat(chunks).toString("utf8");
  }
};
|
|
266
406
|
|
|
267
407
|
// src/core/extractor.ts
|
|
@@ -308,9 +448,10 @@ var ExtractorService = class {
|
|
|
308
448
|
}
|
|
309
449
|
}
|
|
310
450
|
for (const startUrl of startUrls) {
|
|
311
|
-
for await (const
|
|
312
|
-
this.discoveredSitemaps.add(
|
|
313
|
-
|
|
451
|
+
for await (const discovered of this.discovery.discover(startUrl)) {
|
|
452
|
+
this.discoveredSitemaps.add(discovered.url);
|
|
453
|
+
const parserInput = { type: "xmlData", url: discovered.url, xmlData: discovered.xmlData };
|
|
454
|
+
for await (const urlObj of this.parser.parse(parserInput)) {
|
|
314
455
|
const normalized = this.normalizeUrl(urlObj.loc);
|
|
315
456
|
if (!this.seenUrls.has(normalized)) {
|
|
316
457
|
this.seenUrls.add(normalized);
|
|
@@ -326,14 +467,42 @@ var ExtractorService = class {
|
|
|
326
467
|
import micromatch from "micromatch";
|
|
327
468
|
var MatcherService = class {
|
|
328
469
|
config;
|
|
329
|
-
|
|
470
|
+
rootDomain;
|
|
471
|
+
constructor(config, rootUrl) {
|
|
330
472
|
this.config = config;
|
|
473
|
+
if (rootUrl) {
|
|
474
|
+
try {
|
|
475
|
+
this.rootDomain = new URL(rootUrl).hostname.replace(/^www\./, "");
|
|
476
|
+
} catch {
|
|
477
|
+
}
|
|
478
|
+
}
|
|
331
479
|
}
|
|
332
480
|
/**
|
|
333
481
|
* Matches a URL against all policies and returns detected risks.
|
|
334
482
|
*/
|
|
335
483
|
match(urlObj) {
|
|
336
484
|
const risks = [];
|
|
485
|
+
if (this.config.enforceDomainConsistency && this.rootDomain) {
|
|
486
|
+
try {
|
|
487
|
+
const currentDomain = new URL(urlObj.loc).hostname.replace(/^www\./, "");
|
|
488
|
+
if (currentDomain !== this.rootDomain) {
|
|
489
|
+
risks.push({
|
|
490
|
+
category: "Domain Consistency",
|
|
491
|
+
pattern: this.rootDomain,
|
|
492
|
+
type: "literal",
|
|
493
|
+
reason: `URL domain mismatch: expected ${this.rootDomain} (or www.${this.rootDomain}), but found ${currentDomain}.`
|
|
494
|
+
});
|
|
495
|
+
}
|
|
496
|
+
} catch {
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
for (const pattern of this.config.acceptable_patterns) {
|
|
500
|
+
if (this.isMatch(urlObj.loc, pattern)) {
|
|
501
|
+
urlObj.ignored = true;
|
|
502
|
+
urlObj.ignoredBy = pattern.reason;
|
|
503
|
+
return risks;
|
|
504
|
+
}
|
|
505
|
+
}
|
|
337
506
|
for (const policy of this.config.policies) {
|
|
338
507
|
for (const pattern of policy.patterns) {
|
|
339
508
|
if (this.isMatch(urlObj.loc, pattern)) {
|
|
@@ -375,6 +544,7 @@ var ConsoleReporter = class {
|
|
|
375
544
|
console.log(`Total URLs Scanned: ${data.totalUrls}`);
|
|
376
545
|
console.log(`Total Risks Found: ${data.totalRisks > 0 ? chalk2.red(data.totalRisks) : chalk2.green(0)}`);
|
|
377
546
|
console.log(`URLs with Risks: ${data.urlsWithRisks.length}`);
|
|
547
|
+
console.log(`URLs Ignored: ${data.ignoredUrls.length > 0 ? chalk2.yellow(data.ignoredUrls.length) : 0}`);
|
|
378
548
|
console.log(`Duration: ${((data.endTime.getTime() - data.startTime.getTime()) / 1e3).toFixed(2)}s`);
|
|
379
549
|
if (data.urlsWithRisks.length > 0) {
|
|
380
550
|
console.log("\n" + chalk2.bold.yellow("Top Findings:"));
|
|
@@ -410,9 +580,11 @@ var JsonReporter = class {
|
|
|
410
580
|
summary: {
|
|
411
581
|
totalUrls: data.totalUrls,
|
|
412
582
|
totalRisks: data.totalRisks,
|
|
413
|
-
urlsWithRisksCount: data.urlsWithRisks.length
|
|
583
|
+
urlsWithRisksCount: data.urlsWithRisks.length,
|
|
584
|
+
ignoredUrlsCount: data.ignoredUrls.length
|
|
414
585
|
},
|
|
415
|
-
findings: data.urlsWithRisks
|
|
586
|
+
findings: data.urlsWithRisks,
|
|
587
|
+
ignored: data.ignoredUrls
|
|
416
588
|
};
|
|
417
589
|
await fs2.writeFile(this.outputPath, JSON.stringify(report, null, 2), "utf8");
|
|
418
590
|
console.log(`JSON report generated at ${this.outputPath}`);
|
|
@@ -421,322 +593,92 @@ var JsonReporter = class {
|
|
|
421
593
|
|
|
422
594
|
// src/reporters/html-reporter.ts
|
|
423
595
|
import fs3 from "fs/promises";
|
|
596
|
+
import path2 from "path";
|
|
597
|
+
import { fileURLToPath } from "url";
|
|
598
|
+
import Handlebars from "handlebars";
|
|
599
|
+
var __filename2 = fileURLToPath(import.meta.url);
|
|
600
|
+
var __dirname2 = path2.dirname(__filename2);
|
|
424
601
|
var HtmlReporter = class {
  /** File path the rendered HTML report is written to. */
  outputPath;
  /**
   * @param outputPath - Destination file for the report (default "sitemap-qa-report.html").
   * Also registers a `json` helper on the imported Handlebars instance that
   * returns `JSON.stringify(context)`.
   */
  constructor(outputPath = "sitemap-qa-report.html") {
    this.outputPath = outputPath;
    Handlebars.registerHelper("json", (context) => {
      return JSON.stringify(context);
    });
  }
  /**
   * Renders the report template with the analysis data and writes it to `outputPath`.
   * Every `*.hbs` file in `templates/partials` (next to this module) is registered as
   * a partial named after the file; a failure to load partials is only warned about.
   * A missing or unreadable `templates/report.hbs` rejects.
   *
   * @param data - Analysis result passed through prepareTemplateData().
   */
  async generate(data) {
    const partialsDir = path2.join(__dirname2, "templates", "partials");
    try {
      const partialFiles = await fs3.readdir(partialsDir);
      for (const file of partialFiles) {
        if (file.endsWith(".hbs")) {
          const partialName = path2.basename(file, ".hbs");
          const partialSource = await fs3.readFile(path2.join(partialsDir, file), "utf8");
          Handlebars.registerPartial(partialName, partialSource);
        }
      }
    } catch (error) {
      console.warn("Could not load partials:", error);
    }
    const templatePath = path2.join(__dirname2, "templates", "report.hbs");
    const templateSource = await fs3.readFile(templatePath, "utf8");
    const template = Handlebars.compile(templateSource);
    const templateData = this.prepareTemplateData(data);
    const html = template(templateData);
    await fs3.writeFile(this.outputPath, html, "utf8");
    console.log(`HTML report generated at ${this.outputPath}`);
  }
  /**
   * Builds the view model handed to the report template.
   *
   * Findings are grouped category -> pattern -> { reason, urls }. Each finding also
   * gets `displayUrls` (first 3 URLs) and `moreCount` (how many were left out).
   * Ignored URLs are mapped to `{ loc, ignoredBy, suppressedCategories }`, where
   * `suppressedCategories` is the de-duplicated, comma-joined list of risk
   * categories on that URL (undefined when it has none).
   *
   * @param data - Object with `rootUrl`, `startTime`/`endTime` (Date),
   *   `discoveredSitemaps`, `totalUrls`, `totalRisks`, `urlsWithRisks`, `ignoredUrls`.
   * @returns Plain object with `rootUrl`, `timestamp`, `discoveredSitemaps`,
   *   `totalUrls` (locale-formatted string), `totalRisks`, `ignoredUrls`,
   *   `duration` (seconds, one decimal) and `categories`.
   */
  prepareTemplateData(data) {
    const duration = ((data.endTime.getTime() - data.startTime.getTime()) / 1e3).toFixed(1);
    const timestamp = data.endTime.toLocaleString();
    // Category and pattern names come from configuration. Null-prototype
    // dictionaries make sure names such as "constructor", "toString" or
    // "__proto__" are treated as ordinary keys instead of resolving through
    // (or writing to) Object.prototype.
    const categoriesMap = Object.create(null);
    for (const urlObj of data.urlsWithRisks) {
      for (const risk of urlObj.risks) {
        if (!categoriesMap[risk.category]) {
          categoriesMap[risk.category] = Object.create(null);
        }
        if (!categoriesMap[risk.category][risk.pattern]) {
          categoriesMap[risk.category][risk.pattern] = {
            reason: risk.reason,
            urls: []
          };
        }
        categoriesMap[risk.category][risk.pattern].urls.push(urlObj.loc);
      }
    }
    const categories = Object.entries(categoriesMap).map(([name, findingsMap]) => {
      const findings = Object.entries(findingsMap).map(([pattern, finding]) => ({
        pattern,
        urls: finding.urls,
        reason: finding.reason,
        displayUrls: finding.urls.slice(0, 3),
        moreCount: finding.urls.length > 3 ? finding.urls.length - 3 : 0
      }));
      const totalUrls = findings.reduce((acc, f) => acc + f.urls.length, 0);
      return {
        name,
        totalUrls,
        findings
      };
    });
    const ignoredUrls = data.ignoredUrls.map((u) => {
      const suppressedCategories = u.risks.length > 0 ? [...new Set(u.risks.map((r) => r.category))].join(", ") : void 0;
      return {
        loc: u.loc,
        ignoredBy: u.ignoredBy ?? "Unknown",
        suppressedCategories
      };
    });
    return {
      rootUrl: data.rootUrl,
      timestamp,
      discoveredSitemaps: data.discoveredSitemaps,
      totalUrls: data.totalUrls.toLocaleString(),
      totalRisks: data.totalRisks,
      ignoredUrls,
      duration,
      categories
    };
  }
};
|
|
742
684
|
|
|
@@ -747,12 +689,13 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
|
|
|
747
689
|
const outDir = options.outDir || config.outDir || ".";
|
|
748
690
|
const outputFormat = options.output || config.outputFormat || "all";
|
|
749
691
|
const extractor = new ExtractorService();
|
|
750
|
-
const matcher = new MatcherService(config);
|
|
692
|
+
const matcher = new MatcherService(config, url);
|
|
751
693
|
const urlsWithRisks = [];
|
|
694
|
+
const ignoredUrls = [];
|
|
752
695
|
let totalUrls = 0;
|
|
753
696
|
let totalRisks = 0;
|
|
754
697
|
console.log(chalk3.blue(`
|
|
755
|
-
\
|
|
698
|
+
\u{1F680} Starting analysis of ${url}...`));
|
|
756
699
|
try {
|
|
757
700
|
for await (const urlObj of extractor.extract(url)) {
|
|
758
701
|
totalUrls++;
|
|
@@ -761,6 +704,8 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
|
|
|
761
704
|
urlObj.risks = risks;
|
|
762
705
|
urlsWithRisks.push(urlObj);
|
|
763
706
|
totalRisks += risks.length;
|
|
707
|
+
} else if (urlObj.ignored) {
|
|
708
|
+
ignoredUrls.push(urlObj);
|
|
764
709
|
}
|
|
765
710
|
if (totalUrls % 100 === 0) {
|
|
766
711
|
process.stdout.write(chalk3.gray(`\rProcessed ${totalUrls} URLs...`));
|
|
@@ -774,17 +719,18 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
|
|
|
774
719
|
totalUrls,
|
|
775
720
|
totalRisks,
|
|
776
721
|
urlsWithRisks,
|
|
722
|
+
ignoredUrls,
|
|
777
723
|
startTime,
|
|
778
724
|
endTime
|
|
779
725
|
};
|
|
780
726
|
const reporters = [new ConsoleReporter()];
|
|
781
727
|
await fs4.mkdir(outDir, { recursive: true });
|
|
782
728
|
if (outputFormat === "json" || outputFormat === "all") {
|
|
783
|
-
const jsonPath =
|
|
729
|
+
const jsonPath = path3.join(outDir, "sitemap-qa-report.json");
|
|
784
730
|
reporters.push(new JsonReporter(jsonPath));
|
|
785
731
|
}
|
|
786
732
|
if (outputFormat === "html" || outputFormat === "all") {
|
|
787
|
-
const htmlPath =
|
|
733
|
+
const htmlPath = path3.join(outDir, "sitemap-qa-report.html");
|
|
788
734
|
reporters.push(new HtmlReporter(htmlPath));
|
|
789
735
|
}
|
|
790
736
|
for (const reporter of reporters) {
|
|
@@ -804,11 +750,16 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
|
|
|
804
750
|
// src/commands/init.ts
|
|
805
751
|
import { Command as Command2 } from "commander";
|
|
806
752
|
import fs5 from "fs";
|
|
807
|
-
import
|
|
753
|
+
import path4 from "path";
|
|
808
754
|
import chalk4 from "chalk";
|
|
809
755
|
var DEFAULT_CONFIG = `# sitemap-qa configuration
|
|
810
756
|
# This file defines the risk categories and patterns to monitor.
|
|
811
757
|
|
|
758
|
+
# Tool Settings
|
|
759
|
+
outDir: "./sitemap-qa/report"
|
|
760
|
+
outputFormat: "all" # Options: json, html, all
|
|
761
|
+
enforceDomainConsistency: true
|
|
762
|
+
|
|
812
763
|
# Risk Categories
|
|
813
764
|
# Each category contains a list of patterns to match against URLs found in sitemaps.
|
|
814
765
|
# Patterns can be:
|
|
@@ -816,6 +767,16 @@ var DEFAULT_CONFIG = `# sitemap-qa configuration
|
|
|
816
767
|
# - glob: Glob pattern (e.g., **/admin/**)
|
|
817
768
|
# - regex: Regular expression (e.g., /\\/v[0-9]+\\//)
|
|
818
769
|
|
|
770
|
+
# Acceptable Patterns
|
|
771
|
+
# URLs matching these patterns will be ignored and not flagged as risks.
|
|
772
|
+
acceptable_patterns:
|
|
773
|
+
- type: "literal"
|
|
774
|
+
value: "/acceptable-path"
|
|
775
|
+
reason: "Example of an acceptable path that should not be flagged."
|
|
776
|
+
- type: "glob"
|
|
777
|
+
value: "**/public-docs/**"
|
|
778
|
+
reason: "Public documentation is always acceptable."
|
|
779
|
+
|
|
819
780
|
policies:
|
|
820
781
|
- category: "Security & Admin"
|
|
821
782
|
patterns:
|
|
@@ -845,7 +806,7 @@ policies:
|
|
|
845
806
|
reason: "Archive or database backup files exposed."
|
|
846
807
|
`;
|
|
847
808
|
var initCommand = new Command2("init").description("Initialize a default sitemap-qa.yaml configuration file").action(() => {
|
|
848
|
-
const configPath =
|
|
809
|
+
const configPath = path4.join(process.cwd(), "sitemap-qa.yaml");
|
|
849
810
|
if (fs5.existsSync(configPath)) {
|
|
850
811
|
console.error(chalk4.red(`Error: ${configPath} already exists.`));
|
|
851
812
|
process.exit(1);
|