@akotliar/sitemap-qa 1.0.0-alpha.4 → 1.0.0-alpha.6

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -6,7 +6,7 @@ import { Command as Command3 } from "commander";
6
6
  // src/commands/analyze.ts
7
7
  import { Command } from "commander";
8
8
  import chalk3 from "chalk";
9
- import path2 from "path";
9
+ import path3 from "path";
10
10
  import fs4 from "fs/promises";
11
11
 
12
12
  // src/config/loader.ts
@@ -27,13 +27,18 @@ var PolicySchema = z.object({
27
27
  patterns: z.array(PatternSchema).min(1, "At least one pattern is required per category")
28
28
  });
29
29
  var ConfigSchema = z.object({
30
+ acceptable_patterns: z.array(PatternSchema).default([]),
30
31
  policies: z.array(PolicySchema).default([]),
31
32
  outDir: z.string().optional(),
32
- outputFormat: z.enum(["json", "html", "all"]).default("all")
33
+ outputFormat: z.enum(["json", "html", "all"]).default("all"),
34
+ enforceDomainConsistency: z.boolean().default(true)
33
35
  });
34
36
 
35
37
  // src/config/defaults.ts
36
38
  var DEFAULT_POLICIES = {
39
+ acceptable_patterns: [],
40
+ outputFormat: "all",
41
+ enforceDomainConsistency: true,
37
42
  policies: [
38
43
  {
39
44
  category: "Security & Admin",
@@ -89,7 +94,12 @@ var ConfigLoader = class {
89
94
  static DEFAULT_CONFIG_PATH = "sitemap-qa.yaml";
90
95
  static load(configPath) {
91
96
  const targetPath = configPath || path.join(process.cwd(), this.DEFAULT_CONFIG_PATH);
92
- let userConfig = { policies: [] };
97
+ let userConfig = {
98
+ acceptable_patterns: [],
99
+ policies: [],
100
+ outputFormat: "all",
101
+ enforceDomainConsistency: true
102
+ };
93
103
  if (fs.existsSync(targetPath)) {
94
104
  try {
95
105
  const fileContent = fs.readFileSync(targetPath, "utf8");
@@ -101,15 +111,18 @@ var ConfigLoader = class {
101
111
  console.error(chalk.yellow(` - ${issue.path.join(".")}: ${issue.message}`));
102
112
  });
103
113
  process.exit(2);
114
+ return DEFAULT_POLICIES;
104
115
  }
105
116
  userConfig = result.data;
106
117
  } catch (error) {
107
118
  console.error(chalk.red("Failed to load configuration:"), error);
108
119
  process.exit(2);
120
+ return DEFAULT_POLICIES;
109
121
  }
110
122
  } else if (configPath) {
111
123
  console.error(chalk.red(`Error: Configuration file not found at ${targetPath}`));
112
124
  process.exit(2);
125
+ return DEFAULT_POLICIES;
113
126
  }
114
127
  return this.mergeConfigs(DEFAULT_POLICIES, userConfig);
115
128
  }
@@ -125,6 +138,7 @@ var ConfigLoader = class {
125
138
  });
126
139
  const merged = {
127
140
  ...defaults,
141
+ acceptable_patterns: [...defaults.acceptable_patterns || [], ...user.acceptable_patterns || []],
128
142
  policies: mergedPolicies
129
143
  };
130
144
  if (user.outDir !== void 0) {
@@ -133,15 +147,90 @@ var ConfigLoader = class {
133
147
  if (user.outputFormat !== void 0) {
134
148
  merged.outputFormat = user.outputFormat;
135
149
  }
150
+ if (user.enforceDomainConsistency !== void 0) {
151
+ merged.enforceDomainConsistency = user.enforceDomainConsistency;
152
+ }
136
153
  return merged;
137
154
  }
138
155
  };
139
156
 
140
157
  // src/core/discovery.ts
141
158
  import { fetch } from "undici";
159
+ import { Readable } from "stream";
160
+
161
+ // src/core/xml-parser.ts
142
162
  import { XMLParser } from "fast-xml-parser";
143
- var DiscoveryService = class {
163
+ import { gunzipSync } from "zlib";
164
+ var StreamingXmlParser = class {
144
165
  parser;
166
+ lastParsedXml;
167
+ constructor() {
168
+ this.parser = new XMLParser({
169
+ ignoreAttributes: false,
170
+ attributeNamePrefix: "@_",
171
+ // Ensure we always get arrays for sitemap and url tags
172
+ isArray: (name) => name === "sitemap" || name === "url",
173
+ removeNSPrefix: true
174
+ });
175
+ }
176
+ /**
177
+ * Parses an XML stream and yields typed entries as they are found.
178
+ * Generator-first design allows consumers to process entries without pre-collecting.
179
+ */
180
+ async *parse(stream) {
181
+ const xmlData = typeof stream === "string" ? stream : await this.streamToString(stream);
182
+ this.lastParsedXml = xmlData;
183
+ const jsonObj = this.parser.parse(xmlData);
184
+ if (jsonObj.sitemapindex?.sitemap) {
185
+ const sitemaps = jsonObj.sitemapindex.sitemap;
186
+ for (const sitemap of sitemaps) {
187
+ if (sitemap?.loc) {
188
+ yield { type: "sitemap", loc: sitemap.loc };
189
+ }
190
+ }
191
+ }
192
+ if (jsonObj.urlset?.url) {
193
+ const urls = jsonObj.urlset.url;
194
+ for (const url of urls) {
195
+ if (url?.loc) {
196
+ yield {
197
+ type: "url",
198
+ loc: url.loc,
199
+ lastmod: url.lastmod,
200
+ changefreq: url.changefreq,
201
+ priority: url.priority
202
+ };
203
+ }
204
+ }
205
+ }
206
+ }
207
+ /**
208
+ * Get the last parsed XML data (useful to avoid re-fetching).
209
+ */
210
+ getLastParsedXml() {
211
+ return this.lastParsedXml;
212
+ }
213
+ async streamToString(stream) {
214
+ const chunks = [];
215
+ for await (const chunk of stream) {
216
+ chunks.push(Buffer.from(chunk));
217
+ }
218
+ const buffer = Buffer.concat(chunks);
219
+ if (buffer.length >= 2 && buffer[0] === 31 && buffer[1] === 139) {
220
+ try {
221
+ const decompressed = gunzipSync(buffer);
222
+ return decompressed.toString("utf8");
223
+ } catch (error) {
224
+ throw new Error(`Failed to decompress gzipped content: ${error}`);
225
+ }
226
+ }
227
+ return buffer.toString("utf8");
228
+ }
229
+ };
230
+
231
+ // src/core/discovery.ts
232
+ var DiscoveryService = class {
233
+ xmlParser;
145
234
  visited = /* @__PURE__ */ new Set();
146
235
  STANDARD_PATHS = [
147
236
  "/sitemap.xml",
@@ -151,10 +240,7 @@ var DiscoveryService = class {
151
240
  "/sitemap.xml.gz"
152
241
  ];
153
242
  constructor() {
154
- this.parser = new XMLParser({
155
- ignoreAttributes: false,
156
- attributeNamePrefix: "@_"
157
- });
243
+ this.xmlParser = new StreamingXmlParser();
158
244
  }
159
245
  /**
160
246
  * Attempts to find sitemaps for a given base website URL.
@@ -176,9 +262,9 @@ var DiscoveryService = class {
176
262
  } catch (e) {
177
263
  }
178
264
  if (sitemaps.size === 0) {
179
- for (const path4 of this.STANDARD_PATHS) {
265
+ for (const path5 of this.STANDARD_PATHS) {
180
266
  try {
181
- const sitemapUrl = `${origin}${path4}`;
267
+ const sitemapUrl = `${origin}${path5}`;
182
268
  const response = await fetch(sitemapUrl, { method: "HEAD" });
183
269
  if (response.status === 200) {
184
270
  sitemaps.add(sitemapUrl);
@@ -191,6 +277,7 @@ var DiscoveryService = class {
191
277
  }
192
278
  /**
193
279
  * Recursively discovers all leaf sitemaps from a root URL.
280
+ * Returns both the sitemap URL and its XML data to avoid duplicate fetches.
194
281
  */
195
282
  async *discover(rootUrl) {
196
283
  const queue = [rootUrl];
@@ -201,17 +288,39 @@ var DiscoveryService = class {
201
288
  try {
202
289
  const response = await fetch(currentUrl);
203
290
  if (response.status !== 200) continue;
204
- const xmlData = await response.text();
205
- const jsonObj = this.parser.parse(xmlData);
206
- if (jsonObj.sitemapindex) {
207
- const sitemaps = Array.isArray(jsonObj.sitemapindex.sitemap) ? jsonObj.sitemapindex.sitemap : [jsonObj.sitemapindex.sitemap];
208
- for (const sitemap of sitemaps) {
209
- if (sitemap?.loc) {
210
- queue.push(sitemap.loc);
211
- }
291
+ let isIndex = false;
292
+ let isLeaf = false;
293
+ const childSitemaps = [];
294
+ let xmlData;
295
+ let source;
296
+ if (response.body) {
297
+ const nodeStream = Readable.fromWeb(response.body);
298
+ source = nodeStream;
299
+ } else {
300
+ xmlData = await response.text();
301
+ source = xmlData;
302
+ }
303
+ for await (const entry of this.xmlParser.parse(source)) {
304
+ if (entry.type === "sitemap") {
305
+ isIndex = true;
306
+ childSitemaps.push(entry.loc);
307
+ } else if (entry.type === "url") {
308
+ isLeaf = true;
212
309
  }
213
- } else if (jsonObj.urlset) {
214
- yield currentUrl;
310
+ }
311
+ if (isIndex) {
312
+ for (const loc of childSitemaps) {
313
+ queue.push(loc);
314
+ }
315
+ }
316
+ if (!xmlData) {
317
+ xmlData = this.xmlParser.getLastParsedXml() || "";
318
+ }
319
+ if (isLeaf || !isIndex && xmlData.includes("<urlset")) {
320
+ yield {
321
+ url: currentUrl,
322
+ xmlData
323
+ };
215
324
  }
216
325
  } catch (error) {
217
326
  console.error(`Failed to fetch or parse sitemap at ${currentUrl}:`, error);
@@ -221,47 +330,78 @@ var DiscoveryService = class {
221
330
  };
222
331
 
223
332
  // src/core/parser.ts
224
- import { XMLParser as XMLParser2 } from "fast-xml-parser";
333
+ import { Readable as Readable2 } from "stream";
225
334
  import { fetch as fetch2 } from "undici";
226
335
  var SitemapParser = class {
227
- parser;
336
+ xmlParser;
228
337
  constructor() {
229
- this.parser = new XMLParser2({
230
- ignoreAttributes: false,
231
- attributeNamePrefix: "@_"
232
- });
338
+ this.xmlParser = new StreamingXmlParser();
233
339
  }
234
340
  /**
235
341
  * Parses a leaf sitemap and yields SitemapUrl objects.
236
- * Note: For true streaming of massive files, we'd use a SAX-like approach.
237
- * fast-xml-parser's parse() is fast but loads the whole string.
238
- * Given the 50k URL requirement, we'll use a more memory-efficient approach if needed,
239
- * but let's start with a clean AsyncGenerator interface.
342
+ * Uses the shared StreamingXmlParser for consistent and efficient parsing.
343
+ *
344
+ * @param sitemapUrlOrData - Accepts one of three input types:
345
+ * - `string`: A URL string. The method will fetch the sitemap from this URL.
346
+ * Use this when you need to fetch a sitemap from a remote location.
347
+ * - `{ type: 'xmlData'; url: string; xmlData: string }`: An object with a URL and pre-fetched XML data.
348
+ * Use this when you already have the XML content (e.g., from a cache or file)
349
+ * and want to avoid an additional HTTP request.
350
+ * - `{ type: 'stream'; url: string; stream: ReadableStream | Readable }`: An object with a URL and a stream.
351
+ * Accepts either a Web ReadableStream or Node.js Readable stream.
352
+ * Use this when you have a stream source (e.g., from a streaming HTTP response)
353
+ * that should be consumed and parsed. Web streams are converted to Node.js Readable internally.
354
+ *
355
+ * @yields {SitemapUrl} Parsed sitemap URL entries containing `loc` (URL), `source` (sitemap URL),
356
+ * optional metadata (`lastmod`, `changefreq`, `priority`), and a `risks` array (initialized as empty,
357
+ * populated later in the processing pipeline). Other properties like `ignored`/`ignoredBy` are not
358
+ * set by this method and may be added by downstream processors.
240
359
  */
241
- async *parse(sitemapUrl) {
360
+ async *parse(sitemapUrlOrData) {
361
+ const sitemapUrl = typeof sitemapUrlOrData === "string" ? sitemapUrlOrData : sitemapUrlOrData.url;
242
362
  try {
243
- const response = await fetch2(sitemapUrl);
244
- const xmlData = await response.text();
245
- const jsonObj = this.parser.parse(xmlData);
246
- if (jsonObj.urlset && jsonObj.urlset.url) {
247
- const urls = Array.isArray(jsonObj.urlset.url) ? jsonObj.urlset.url : [jsonObj.urlset.url];
248
- for (const url of urls) {
249
- if (url.loc) {
250
- yield {
251
- loc: url.loc,
252
- source: sitemapUrl,
253
- lastmod: url.lastmod,
254
- changefreq: url.changefreq,
255
- priority: url.priority,
256
- risks: []
257
- };
258
- }
363
+ let source;
364
+ if (typeof sitemapUrlOrData === "string") {
365
+ const response = await fetch2(sitemapUrl);
366
+ if (response.status !== 200) throw new Error(`Failed to fetch sitemap at ${sitemapUrl}: HTTP ${response.status}`);
367
+ if (response.body) {
368
+ source = Readable2.fromWeb(response.body);
369
+ } else {
370
+ source = await response.text();
371
+ }
372
+ } else if (sitemapUrlOrData.type === "stream") {
373
+ if (sitemapUrlOrData.stream instanceof Readable2) {
374
+ source = sitemapUrlOrData.stream;
375
+ } else {
376
+ source = Readable2.fromWeb(sitemapUrlOrData.stream);
377
+ }
378
+ } else {
379
+ source = sitemapUrlOrData.xmlData;
380
+ source = sitemapUrlOrData.xmlData;
381
+ }
382
+ for await (const entry of this.xmlParser.parse(source)) {
383
+ if (entry.type === "url") {
384
+ yield {
385
+ loc: entry.loc,
386
+ source: sitemapUrl,
387
+ lastmod: entry.lastmod,
388
+ changefreq: entry.changefreq,
389
+ priority: entry.priority,
390
+ risks: []
391
+ };
259
392
  }
260
393
  }
261
394
  } catch (error) {
262
395
  console.error(`Failed to parse sitemap at ${sitemapUrl}:`, error);
263
396
  }
264
397
  }
398
+ async streamToString(stream) {
399
+ const chunks = [];
400
+ for await (const chunk of stream) {
401
+ chunks.push(Buffer.from(chunk));
402
+ }
403
+ return Buffer.concat(chunks).toString("utf8");
404
+ }
265
405
  };
266
406
 
267
407
  // src/core/extractor.ts
@@ -308,9 +448,10 @@ var ExtractorService = class {
308
448
  }
309
449
  }
310
450
  for (const startUrl of startUrls) {
311
- for await (const sitemapUrl of this.discovery.discover(startUrl)) {
312
- this.discoveredSitemaps.add(sitemapUrl);
313
- for await (const urlObj of this.parser.parse(sitemapUrl)) {
451
+ for await (const discovered of this.discovery.discover(startUrl)) {
452
+ this.discoveredSitemaps.add(discovered.url);
453
+ const parserInput = { type: "xmlData", url: discovered.url, xmlData: discovered.xmlData };
454
+ for await (const urlObj of this.parser.parse(parserInput)) {
314
455
  const normalized = this.normalizeUrl(urlObj.loc);
315
456
  if (!this.seenUrls.has(normalized)) {
316
457
  this.seenUrls.add(normalized);
@@ -326,14 +467,42 @@ var ExtractorService = class {
326
467
  import micromatch from "micromatch";
327
468
  var MatcherService = class {
328
469
  config;
329
- constructor(config) {
470
+ rootDomain;
471
+ constructor(config, rootUrl) {
330
472
  this.config = config;
473
+ if (rootUrl) {
474
+ try {
475
+ this.rootDomain = new URL(rootUrl).hostname.replace(/^www\./, "");
476
+ } catch {
477
+ }
478
+ }
331
479
  }
332
480
  /**
333
481
  * Matches a URL against all policies and returns detected risks.
334
482
  */
335
483
  match(urlObj) {
336
484
  const risks = [];
485
+ if (this.config.enforceDomainConsistency && this.rootDomain) {
486
+ try {
487
+ const currentDomain = new URL(urlObj.loc).hostname.replace(/^www\./, "");
488
+ if (currentDomain !== this.rootDomain) {
489
+ risks.push({
490
+ category: "Domain Consistency",
491
+ pattern: this.rootDomain,
492
+ type: "literal",
493
+ reason: `URL domain mismatch: expected ${this.rootDomain} (or www.${this.rootDomain}), but found ${currentDomain}.`
494
+ });
495
+ }
496
+ } catch {
497
+ }
498
+ }
499
+ for (const pattern of this.config.acceptable_patterns) {
500
+ if (this.isMatch(urlObj.loc, pattern)) {
501
+ urlObj.ignored = true;
502
+ urlObj.ignoredBy = pattern.reason;
503
+ return risks;
504
+ }
505
+ }
337
506
  for (const policy of this.config.policies) {
338
507
  for (const pattern of policy.patterns) {
339
508
  if (this.isMatch(urlObj.loc, pattern)) {
@@ -375,6 +544,7 @@ var ConsoleReporter = class {
375
544
  console.log(`Total URLs Scanned: ${data.totalUrls}`);
376
545
  console.log(`Total Risks Found: ${data.totalRisks > 0 ? chalk2.red(data.totalRisks) : chalk2.green(0)}`);
377
546
  console.log(`URLs with Risks: ${data.urlsWithRisks.length}`);
547
+ console.log(`URLs Ignored: ${data.ignoredUrls.length > 0 ? chalk2.yellow(data.ignoredUrls.length) : 0}`);
378
548
  console.log(`Duration: ${((data.endTime.getTime() - data.startTime.getTime()) / 1e3).toFixed(2)}s`);
379
549
  if (data.urlsWithRisks.length > 0) {
380
550
  console.log("\n" + chalk2.bold.yellow("Top Findings:"));
@@ -410,9 +580,11 @@ var JsonReporter = class {
410
580
  summary: {
411
581
  totalUrls: data.totalUrls,
412
582
  totalRisks: data.totalRisks,
413
- urlsWithRisksCount: data.urlsWithRisks.length
583
+ urlsWithRisksCount: data.urlsWithRisks.length,
584
+ ignoredUrlsCount: data.ignoredUrls.length
414
585
  },
415
- findings: data.urlsWithRisks
586
+ findings: data.urlsWithRisks,
587
+ ignored: data.ignoredUrls
416
588
  };
417
589
  await fs2.writeFile(this.outputPath, JSON.stringify(report, null, 2), "utf8");
418
590
  console.log(`JSON report generated at ${this.outputPath}`);
@@ -421,322 +593,92 @@ var JsonReporter = class {
421
593
 
422
594
  // src/reporters/html-reporter.ts
423
595
  import fs3 from "fs/promises";
596
+ import path2 from "path";
597
+ import { fileURLToPath } from "url";
598
+ import Handlebars from "handlebars";
599
+ var __filename2 = fileURLToPath(import.meta.url);
600
+ var __dirname2 = path2.dirname(__filename2);
424
601
  var HtmlReporter = class {
425
602
  outputPath;
426
603
  constructor(outputPath = "sitemap-qa-report.html") {
427
604
  this.outputPath = outputPath;
605
+ Handlebars.registerHelper("json", (context) => {
606
+ return JSON.stringify(context);
607
+ });
428
608
  }
429
609
  async generate(data) {
430
- const categories = this.groupRisks(data);
431
- const html = this.generateHtml(data, categories);
610
+ const partialsDir = path2.join(__dirname2, "templates", "partials");
611
+ try {
612
+ const partialFiles = await fs3.readdir(partialsDir);
613
+ for (const file of partialFiles) {
614
+ if (file.endsWith(".hbs")) {
615
+ const partialName = path2.basename(file, ".hbs");
616
+ const partialSource = await fs3.readFile(path2.join(partialsDir, file), "utf8");
617
+ Handlebars.registerPartial(partialName, partialSource);
618
+ }
619
+ }
620
+ } catch (error) {
621
+ console.warn("Could not load partials:", error);
622
+ }
623
+ const templatePath = path2.join(__dirname2, "templates", "report.hbs");
624
+ const templateSource = await fs3.readFile(templatePath, "utf8");
625
+ const template = Handlebars.compile(templateSource);
626
+ const templateData = this.prepareTemplateData(data);
627
+ const html = template(templateData);
432
628
  await fs3.writeFile(this.outputPath, html, "utf8");
433
629
  console.log(`HTML report generated at ${this.outputPath}`);
434
630
  }
435
- groupRisks(data) {
436
- const categories = {};
631
+ prepareTemplateData(data) {
632
+ const duration = ((data.endTime.getTime() - data.startTime.getTime()) / 1e3).toFixed(1);
633
+ const timestamp = data.endTime.toLocaleString();
634
+ const categoriesMap = {};
437
635
  for (const urlObj of data.urlsWithRisks) {
438
636
  for (const risk of urlObj.risks) {
439
- if (!categories[risk.category]) {
440
- categories[risk.category] = {};
637
+ if (!categoriesMap[risk.category]) {
638
+ categoriesMap[risk.category] = {};
441
639
  }
442
- if (!categories[risk.category][risk.pattern]) {
443
- categories[risk.category][risk.pattern] = {
640
+ if (!categoriesMap[risk.category][risk.pattern]) {
641
+ categoriesMap[risk.category][risk.pattern] = {
444
642
  reason: risk.reason,
445
643
  urls: []
446
644
  };
447
645
  }
448
- categories[risk.category][risk.pattern].urls.push(urlObj.loc);
646
+ categoriesMap[risk.category][risk.pattern].urls.push(urlObj.loc);
449
647
  }
450
648
  }
451
- return categories;
452
- }
453
- generateHtml(data, categories) {
454
- const duration = ((data.endTime.getTime() - data.startTime.getTime()) / 1e3).toFixed(1);
455
- const timestamp = data.endTime.toLocaleString();
456
- return `
457
- <!DOCTYPE html>
458
- <html lang="en">
459
- <head>
460
- <meta charset="UTF-8">
461
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
462
- <title>Sitemap Analysis - ${data.rootUrl}</title>
463
- <style>
464
- :root {
465
- --bg-dark: #0f172a;
466
- --bg-light: #f8fafc;
467
- --text-main: #1e293b;
468
- --text-muted: #64748b;
469
- --primary: #3b82f6;
470
- --danger: #ef4444;
471
- --warning: #f59e0b;
472
- --border: #e2e8f0;
473
- }
474
- body {
475
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
476
- line-height: 1.5;
477
- color: var(--text-main);
478
- background-color: #fff;
479
- margin: 0;
480
- padding: 0;
481
- }
482
- header {
483
- background-color: var(--bg-dark);
484
- color: white;
485
- padding: 40px 20px;
486
- text-align: left;
487
- }
488
- .container {
489
- max-width: 1200px;
490
- margin: 0 auto;
491
- padding: 0 20px;
492
- }
493
- header h1 { margin: 0; font-size: 24px; }
494
- header .meta { margin-top: 10px; color: #94a3b8; font-size: 14px; }
495
-
496
- .summary-grid {
497
- display: grid;
498
- grid-template-columns: repeat(4, 1fr);
499
- border-bottom: 1px solid var(--border);
500
- margin-bottom: 40px;
501
- }
502
- .summary-card {
503
- padding: 30px 20px;
504
- text-align: center;
505
- border-right: 1px solid var(--border);
506
- }
507
- .summary-card:last-child { border-right: none; }
508
- .summary-card h3 {
509
- margin: 0;
510
- font-size: 12px;
511
- text-transform: uppercase;
512
- color: var(--text-muted);
513
- letter-spacing: 0.05em;
514
- }
515
- .summary-card p {
516
- margin: 10px 0 0;
517
- font-size: 32px;
518
- font-weight: 700;
519
- color: var(--text-main);
520
- }
521
- .summary-card.highlight p { color: var(--danger); }
522
-
523
- details {
524
- margin-bottom: 20px;
525
- border: 1px solid var(--border);
526
- border-radius: 8px;
527
- overflow: hidden;
528
- }
529
- summary {
530
- padding: 15px 20px;
531
- background-color: #fff;
532
- cursor: pointer;
533
- font-weight: 600;
534
- display: flex;
535
- justify-content: space-between;
536
- align-items: center;
537
- list-style: none;
538
- }
539
- summary::-webkit-details-marker { display: none; }
540
- summary::after {
541
- content: '\u25B6';
542
- font-size: 12px;
543
- color: var(--text-muted);
544
- transition: transform 0.2s;
545
- }
546
- details[open] summary::after { transform: rotate(90deg); }
547
-
548
- .category-section {
549
- border: 1px solid var(--warning);
550
- border-radius: 8px;
551
- margin-bottom: 20px;
552
- }
553
- .category-header {
554
- padding: 15px 20px;
555
- background-color: #fffbeb;
556
- color: var(--warning);
557
- font-weight: 600;
558
- cursor: pointer;
559
- display: flex;
560
- justify-content: space-between;
561
- align-items: center;
562
- }
563
- .category-content {
564
- padding: 20px;
565
- background-color: #fff;
566
- }
567
-
568
- .finding-group {
569
- border: 1px solid var(--border);
570
- border-radius: 8px;
571
- padding: 20px;
572
- margin-bottom: 20px;
573
- }
574
- .finding-header {
575
- display: flex;
576
- align-items: center;
577
- gap: 10px;
578
- margin-bottom: 10px;
579
- }
580
- .finding-header h4 { margin: 0; font-size: 16px; }
581
- .badge {
582
- background-color: var(--primary);
583
- color: white;
584
- padding: 2px 8px;
585
- border-radius: 12px;
586
- font-size: 12px;
587
- }
588
- .finding-description {
589
- color: var(--text-muted);
590
- font-size: 14px;
591
- margin-bottom: 20px;
592
- }
593
-
594
- .url-list {
595
- background-color: var(--bg-light);
596
- border-radius: 4px;
597
- padding: 15px;
598
- margin-bottom: 15px;
599
- }
600
- .url-item {
601
- font-family: monospace;
602
- font-size: 13px;
603
- padding: 8px 12px;
604
- background: white;
605
- border: 1px solid var(--border);
606
- border-radius: 4px;
607
- margin-bottom: 8px;
608
- white-space: nowrap;
609
- overflow: hidden;
610
- text-overflow: ellipsis;
611
- }
612
- .url-item:last-child { margin-bottom: 0; }
613
-
614
- .more-count {
615
- font-size: 12px;
616
- color: var(--text-muted);
617
- font-style: italic;
618
- margin-bottom: 15px;
619
- }
620
-
621
- .btn {
622
- display: inline-flex;
623
- align-items: center;
624
- gap: 8px;
625
- background-color: var(--primary);
626
- color: white;
627
- padding: 8px 16px;
628
- border-radius: 6px;
629
- text-decoration: none;
630
- font-size: 13px;
631
- font-weight: 500;
632
- }
633
- .btn:hover { opacity: 0.9; }
634
-
635
- footer {
636
- text-align: center;
637
- padding: 40px;
638
- color: var(--text-muted);
639
- font-size: 12px;
640
- border-top: 1px solid var(--border);
641
- margin-top: 40px;
642
- }
643
- </style>
644
- </head>
645
- <body>
646
- <header>
647
- <div class="container">
648
- <h1>Sitemap Analysis</h1>
649
- <div class="meta">
650
- <div>${data.rootUrl}</div>
651
- <div>${timestamp}</div>
652
- </div>
653
- </div>
654
- </header>
655
-
656
- <div class="summary-grid">
657
- <div class="summary-card">
658
- <h3>Sitemaps</h3>
659
- <p>${data.discoveredSitemaps.length}</p>
660
- </div>
661
- <div class="summary-card">
662
- <h3>URLs Analyzed</h3>
663
- <p>${data.totalUrls.toLocaleString()}</p>
664
- </div>
665
- <div class="summary-card highlight">
666
- <h3>Issues Found</h3>
667
- <p>${data.totalRisks}</p>
668
- </div>
669
- <div class="summary-card">
670
- <h3>Scan Time</h3>
671
- <p>${duration}s</p>
672
- </div>
673
- </div>
674
-
675
- <div class="container">
676
- <details>
677
- <summary>Sitemaps Discovered (${data.discoveredSitemaps.length})</summary>
678
- <div style="padding: 20px; background: var(--bg-light);">
679
- ${data.discoveredSitemaps.map((s) => `<div class="url-item">${s}</div>`).join("")}
680
- </div>
681
- </details>
682
-
683
- ${Object.entries(categories).map(([category, findings]) => {
684
- const totalCategoryUrls = Object.values(findings).reduce((acc, f) => acc + f.urls.length, 0);
685
- return `
686
- <div class="category-section">
687
- <div class="category-header">
688
- <span>${category} (${totalCategoryUrls} URLs)</span>
689
- <span>\u25BC</span>
690
- </div>
691
- <div class="category-content">
692
- ${Object.entries(findings).map(([pattern, finding]) => `
693
- <div class="finding-group">
694
- <div class="finding-header">
695
- <h4>${pattern}</h4>
696
- <span class="badge">${finding.urls.length} URLs</span>
697
- </div>
698
- <div class="finding-description">
699
- ${finding.reason}
700
- </div>
701
- <div class="url-list">
702
- ${finding.urls.slice(0, 3).map((url) => `
703
- <div class="url-item">${url}</div>
704
- `).join("")}
705
- </div>
706
- ${finding.urls.length > 3 ? `
707
- <div class="more-count">... and ${finding.urls.length - 3} more</div>
708
- ` : ""}
709
- <a href="#" class="btn" onclick="downloadUrls('${pattern}', ${JSON.stringify(finding.urls).replace(/"/g, "&quot;")})">
710
- \u{1F4E5} Download All ${finding.urls.length} URLs
711
- </a>
712
- </div>
713
- `).join("")}
714
- </div>
715
- </div>
716
- `;
717
- }).join("")}
718
- </div>
719
-
720
- <footer>
721
- Generated by sitemap-qa v1.0.0
722
- </footer>
723
-
724
- <script>
725
- function downloadUrls(name, urls) {
726
- const blob = new Blob([urls.join('\\n')], { type: 'text/plain' });
727
- const url = window.URL.createObjectURL(blob);
728
- const a = document.createElement('a');
729
- a.href = url;
730
- a.download = \`\${name.replace(/[^a-z0-9]/gi, '_').toLowerCase()}_urls.txt\`;
731
- document.body.appendChild(a);
732
- a.click();
733
- window.URL.revokeObjectURL(url);
734
- document.body.removeChild(a);
735
- }
736
- </script>
737
- </body>
738
- </html>
739
- `;
649
+ const categories = Object.entries(categoriesMap).map(([name, findingsMap]) => {
650
+ const findings = Object.entries(findingsMap).map(([pattern, finding]) => ({
651
+ pattern,
652
+ urls: finding.urls,
653
+ reason: finding.reason,
654
+ displayUrls: finding.urls.slice(0, 3),
655
+ moreCount: finding.urls.length > 3 ? finding.urls.length - 3 : 0
656
+ }));
657
+ const totalUrls = findings.reduce((acc, f) => acc + f.urls.length, 0);
658
+ return {
659
+ name,
660
+ totalUrls,
661
+ findings
662
+ };
663
+ });
664
+ const ignoredUrls = data.ignoredUrls.map((u) => {
665
+ const suppressedCategories = u.risks.length > 0 ? [...new Set(u.risks.map((r) => r.category))].join(", ") : void 0;
666
+ return {
667
+ loc: u.loc,
668
+ ignoredBy: u.ignoredBy ?? "Unknown",
669
+ suppressedCategories
670
+ };
671
+ });
672
+ return {
673
+ rootUrl: data.rootUrl,
674
+ timestamp,
675
+ discoveredSitemaps: data.discoveredSitemaps,
676
+ totalUrls: data.totalUrls.toLocaleString(),
677
+ totalRisks: data.totalRisks,
678
+ ignoredUrls,
679
+ duration,
680
+ categories
681
+ };
740
682
  }
741
683
  };
742
684
 
@@ -747,12 +689,13 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
747
689
  const outDir = options.outDir || config.outDir || ".";
748
690
  const outputFormat = options.output || config.outputFormat || "all";
749
691
  const extractor = new ExtractorService();
750
- const matcher = new MatcherService(config);
692
+ const matcher = new MatcherService(config, url);
751
693
  const urlsWithRisks = [];
694
+ const ignoredUrls = [];
752
695
  let totalUrls = 0;
753
696
  let totalRisks = 0;
754
697
  console.log(chalk3.blue(`
755
- \uFFFD\uFFFD\uFFFD Starting analysis of ${url}...`));
698
+ \u{1F680} Starting analysis of ${url}...`));
756
699
  try {
757
700
  for await (const urlObj of extractor.extract(url)) {
758
701
  totalUrls++;
@@ -761,6 +704,8 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
761
704
  urlObj.risks = risks;
762
705
  urlsWithRisks.push(urlObj);
763
706
  totalRisks += risks.length;
707
+ } else if (urlObj.ignored) {
708
+ ignoredUrls.push(urlObj);
764
709
  }
765
710
  if (totalUrls % 100 === 0) {
766
711
  process.stdout.write(chalk3.gray(`\rProcessed ${totalUrls} URLs...`));
@@ -774,17 +719,18 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
774
719
  totalUrls,
775
720
  totalRisks,
776
721
  urlsWithRisks,
722
+ ignoredUrls,
777
723
  startTime,
778
724
  endTime
779
725
  };
780
726
  const reporters = [new ConsoleReporter()];
781
727
  await fs4.mkdir(outDir, { recursive: true });
782
728
  if (outputFormat === "json" || outputFormat === "all") {
783
- const jsonPath = path2.join(outDir, "sitemap-qa-report.json");
729
+ const jsonPath = path3.join(outDir, "sitemap-qa-report.json");
784
730
  reporters.push(new JsonReporter(jsonPath));
785
731
  }
786
732
  if (outputFormat === "html" || outputFormat === "all") {
787
- const htmlPath = path2.join(outDir, "sitemap-qa-report.html");
733
+ const htmlPath = path3.join(outDir, "sitemap-qa-report.html");
788
734
  reporters.push(new HtmlReporter(htmlPath));
789
735
  }
790
736
  for (const reporter of reporters) {
@@ -804,11 +750,16 @@ var analyzeCommand = new Command("analyze").description("Analyze a sitemap for p
804
750
  // src/commands/init.ts
805
751
  import { Command as Command2 } from "commander";
806
752
  import fs5 from "fs";
807
- import path3 from "path";
753
+ import path4 from "path";
808
754
  import chalk4 from "chalk";
809
755
  var DEFAULT_CONFIG = `# sitemap-qa configuration
810
756
  # This file defines the risk categories and patterns to monitor.
811
757
 
758
+ # Tool Settings
759
+ outDir: "./sitemap-qa/report"
760
+ outputFormat: "all" # Options: json, html, all
761
+ enforceDomainConsistency: true
762
+
812
763
  # Risk Categories
813
764
  # Each category contains a list of patterns to match against URLs found in sitemaps.
814
765
  # Patterns can be:
@@ -816,6 +767,16 @@ var DEFAULT_CONFIG = `# sitemap-qa configuration
816
767
  # - glob: Glob pattern (e.g., **/admin/**)
817
768
  # - regex: Regular expression (e.g., /\\/v[0-9]+\\//)
818
769
 
770
+ # Acceptable Patterns
771
+ # URLs matching these patterns will be ignored and not flagged as risks.
772
+ acceptable_patterns:
773
+ - type: "literal"
774
+ value: "/acceptable-path"
775
+ reason: "Example of an acceptable path that should not be flagged."
776
+ - type: "glob"
777
+ value: "**/public-docs/**"
778
+ reason: "Public documentation is always acceptable."
779
+
819
780
  policies:
820
781
  - category: "Security & Admin"
821
782
  patterns:
@@ -845,7 +806,7 @@ policies:
845
806
  reason: "Archive or database backup files exposed."
846
807
  `;
847
808
  var initCommand = new Command2("init").description("Initialize a default sitemap-qa.yaml configuration file").action(() => {
848
- const configPath = path3.join(process.cwd(), "sitemap-qa.yaml");
809
+ const configPath = path4.join(process.cwd(), "sitemap-qa.yaml");
849
810
  if (fs5.existsSync(configPath)) {
850
811
  console.error(chalk4.red(`Error: ${configPath} already exists.`));
851
812
  process.exit(1);