@akotliar/sitemap-qa 1.0.0-alpha.1 → 1.0.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -312,31 +312,6 @@ async function fetchUrl(url, options = {}) {
312
312
  }
313
313
 
314
314
  // src/core/discovery.ts
315
- function normalizeBaseUrl(url) {
316
- const parsed = new URL(url);
317
- return parsed.origin;
318
- }
319
- async function detectCanonicalDomain(baseUrl, config) {
320
- const urlObj = new URL(baseUrl);
321
- const hasWww = urlObj.hostname.startsWith("www.");
322
- const alternateHostname = hasWww ? urlObj.hostname.substring(4) : `www.${urlObj.hostname}`;
323
- const alternateUrl = `${urlObj.protocol}//${alternateHostname}/robots.txt`;
324
- try {
325
- const result = await fetchUrl(alternateUrl, {
326
- timeout: config.timeout,
327
- maxRetries: 1
328
- });
329
- if (result.statusCode === 200 || result.statusCode === 404) {
330
- return alternateHostname;
331
- }
332
- return urlObj.hostname;
333
- } catch (error) {
334
- if (error instanceof HttpError && error.statusCode === 301) {
335
- return urlObj.hostname;
336
- }
337
- return urlObj.hostname;
338
- }
339
- }
340
315
  async function tryStandardPaths(baseUrl, config) {
341
316
  const baseDomain = new URL(baseUrl).origin;
342
317
  const accessIssues = [];
@@ -480,13 +455,11 @@ function extractSitemapIndexUrls(xmlContent) {
480
455
  }
481
456
  return urls;
482
457
  }
483
- async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDomain, _maxDepth = 10) {
458
+ async function discoverAllSitemaps(initialSitemaps, config) {
484
459
  const finalSitemaps = [];
485
460
  const toProcess = [...initialSitemaps];
486
461
  const processed = /* @__PURE__ */ new Set();
487
- const failed = /* @__PURE__ */ new Set();
488
- const redirected = /* @__PURE__ */ new Set();
489
- let detectedCanonical = canonicalDomain;
462
+ const inaccessible = /* @__PURE__ */ new Set();
490
463
  const BATCH_SIZE = config.discoveryConcurrency || 50;
491
464
  while (toProcess.length > 0) {
492
465
  const batch = toProcess.splice(0, Math.min(BATCH_SIZE, toProcess.length));
@@ -519,40 +492,12 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
519
492
  return { type: "sitemap", url: sitemapUrl };
520
493
  }
521
494
  } catch (error) {
522
- if (error instanceof HttpError && error.statusCode === 301) {
523
- redirected.add(sitemapUrl);
524
- if (config.verbose) {
525
- if (!detectedCanonical) {
526
- detectedCanonical = await detectCanonicalDomain(baseUrl, config);
527
- if (config.verbose) {
528
- console.log(`Canonical domain detected: ${detectedCanonical}`);
529
- }
530
- }
531
- try {
532
- const sitemapUrlObj = new URL(sitemapUrl);
533
- if (sitemapUrlObj.hostname !== detectedCanonical) {
534
- console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
535
- console.warn(` Problem: The sitemap index contains a URL that redirects.`);
536
- console.warn(` Likely issue: Domain mismatch - expected "${detectedCanonical}" but got "${sitemapUrlObj.hostname}"`);
537
- console.warn(` Fix: Update sitemap index to use "https://${detectedCanonical}${sitemapUrlObj.pathname}"`);
538
- } else {
539
- console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
540
- console.warn(` Fix: Update the sitemap index to reference the final URL after redirect.`);
541
- }
542
- } catch {
543
- const message = error instanceof Error ? error.message : String(error);
544
- console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
545
- }
546
- }
547
- return { type: "redirect" };
548
- } else {
549
- failed.add(sitemapUrl);
550
- if (config.verbose) {
551
- const message = error instanceof Error ? error.message : String(error);
552
- console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
553
- }
554
- return { type: "failed" };
495
+ inaccessible.add(sitemapUrl);
496
+ if (config.verbose) {
497
+ const message = error instanceof Error ? error.message : String(error);
498
+ console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
555
499
  }
500
+ return { type: "failed" };
556
501
  }
557
502
  }));
558
503
  for (const result of batchResults) {
@@ -567,117 +512,50 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
567
512
  break;
568
513
  }
569
514
  }
570
- const totalProcessed = processed.size;
571
- const totalFailed = failed.size;
572
- const totalRedirected = redirected.size;
573
- const sitemapIndexCount = totalProcessed - finalSitemaps.length - totalFailed - totalRedirected;
574
- if (finalSitemaps.length === 0 && totalProcessed > 0) {
515
+ if (finalSitemaps.length === 0 && inaccessible.size > 0) {
575
516
  console.warn(`
576
- \u26A0\uFE0F SITEMAP DISCOVERY ISSUE`);
577
- if (sitemapIndexCount > 0 && (totalFailed > 0 || totalRedirected > 0)) {
578
- console.warn(`Found ${sitemapIndexCount} sitemap index(es) containing ${totalFailed + totalRedirected} child sitemap(s):`);
579
- if (totalRedirected > 0) {
580
- console.warn(` - ${totalRedirected} sitemap(s) return 301 redirects (content not accessible without following redirect)`);
581
- }
582
- if (totalFailed > 0) {
583
- console.warn(` - ${totalFailed} sitemap(s) returned errors (404, 403, 500, or network issues)`);
584
- }
585
- } else if (totalRedirected > 0) {
586
- console.warn(`All ${totalRedirected} sitemap(s) return 301 redirects.`);
587
- } else if (totalFailed > 0) {
588
- console.warn(`All ${totalFailed} sitemap(s) returned errors.`);
589
- console.warn(`
590
- Common causes:`);
591
- console.warn(` - 403 Forbidden: Bot protection (Cloudflare, etc.) or IP blocking`);
592
- console.warn(` - 404 Not Found: Sitemaps don't exist at these URLs`);
593
- console.warn(` - 500/502/503: Server errors or maintenance`);
594
- console.warn(`
595
- If sitemaps work in your browser but not here, the site likely has bot protection.`);
596
- console.warn(`Try: Check if sitemaps load without JavaScript, or contact site administrator.`);
597
- } else {
598
- console.warn(`Processed ${totalProcessed} URL(s) but found no accessible sitemaps.`);
599
- }
600
- console.warn(`
601
- Note: This tool does not follow redirects for sitemap URLs.`);
602
- if (totalRedirected > 0) {
603
- console.warn(`
604
- Possible causes of redirects:`);
605
- console.warn(` - Sitemap index uses non-canonical domain (e.g., missing 'www' or vice versa)`);
606
- console.warn(` - Sitemap URLs redirect from HTTP to HTTPS`);
607
- console.warn(` - Intentional redirects in your site configuration`);
608
- console.warn(`
609
- Recommendation: Update sitemap index URLs to match the final destination (no redirects).`);
610
- }
611
- console.warn(``);
612
- }
613
- return { sitemaps: finalSitemaps, canonicalDomain: detectedCanonical };
517
+ \u26A0\uFE0F All ${inaccessible.size} sitemap(s) were inaccessible`);
518
+ console.warn(`Common causes: 403/404 errors, network issues, or bot protection`);
519
+ }
520
+ return finalSitemaps;
614
521
  }
615
522
  async function discoverSitemaps(baseUrl, config) {
616
- const normalizedUrl = normalizeBaseUrl(baseUrl);
617
- let allAccessIssues = [];
618
- let canonicalDomain;
523
+ const normalizedUrl = new URL(baseUrl).origin;
619
524
  if (config.verbose) {
620
- console.log("Strategy 1: Checking robots.txt for sitemap directives...");
525
+ console.log("Checking robots.txt for sitemap directives...");
621
526
  }
622
527
  const robotsSitemaps = await parseRobotsTxt(normalizedUrl, config);
623
528
  if (robotsSitemaps.length > 0) {
624
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(robotsSitemaps, config, normalizedUrl, canonicalDomain);
625
- canonicalDomain = detected;
529
+ const sitemaps = await discoverAllSitemaps(robotsSitemaps, config);
626
530
  return {
627
- sitemaps: allSitemaps,
531
+ sitemaps,
628
532
  source: "robots-txt",
629
- accessIssues: [],
630
- // Clear access issues since we found working sitemaps
631
- canonicalDomain
533
+ accessIssues: []
632
534
  };
633
535
  }
634
536
  if (config.verbose) {
635
- console.log("Strategy 2: Trying standard sitemap paths...");
537
+ console.log("Trying standard sitemap paths...");
636
538
  }
637
- const { sitemaps: standardSitemaps, issues, redirectedToCanonical } = await tryStandardPaths(normalizedUrl, config);
638
- allAccessIssues = issues;
539
+ const { sitemaps: standardSitemaps, issues } = await tryStandardPaths(normalizedUrl, config);
639
540
  if (standardSitemaps.length > 0) {
640
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(standardSitemaps, config, normalizedUrl, canonicalDomain);
641
- canonicalDomain = detected;
642
- return {
643
- sitemaps: allSitemaps,
644
- source: "standard-path",
645
- accessIssues: [],
646
- // Clear access issues since we found working sitemaps
647
- canonicalDomain
648
- };
649
- }
650
- if (redirectedToCanonical) {
651
- const canonicalUrl = `https://${redirectedToCanonical}`;
652
- console.log(`
653
- \u{1F4A1} All requests redirected. Retrying with canonical domain: ${redirectedToCanonical}
654
- `);
655
- const canonicalRobotsSitemaps = await parseRobotsTxt(canonicalUrl, config);
656
- if (canonicalRobotsSitemaps.length > 0) {
657
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalRobotsSitemaps, config, canonicalUrl, redirectedToCanonical);
541
+ const sitemaps = await discoverAllSitemaps(standardSitemaps, config);
542
+ if (sitemaps.length > 0) {
658
543
  return {
659
- sitemaps: allSitemaps,
660
- source: "robots-txt",
661
- accessIssues: [],
662
- canonicalDomain: detected || redirectedToCanonical
663
- };
664
- }
665
- const { sitemaps: canonicalStandardSitemaps } = await tryStandardPaths(canonicalUrl, config);
666
- if (canonicalStandardSitemaps.length > 0) {
667
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalStandardSitemaps, config, canonicalUrl, redirectedToCanonical);
668
- return {
669
- sitemaps: allSitemaps,
544
+ sitemaps,
670
545
  source: "standard-path",
671
- accessIssues: [],
672
- canonicalDomain: detected || redirectedToCanonical
546
+ accessIssues: []
673
547
  };
674
548
  }
549
+ return {
550
+ sitemaps: [],
551
+ source: "standard-path",
552
+ accessIssues: issues
553
+ };
675
554
  }
676
555
  return {
677
556
  sitemaps: [],
678
557
  source: "none",
679
- accessIssues: allAccessIssues,
680
- canonicalDomain
558
+ accessIssues: issues
681
559
  };
682
560
  }
683
561
 
@@ -999,9 +877,6 @@ Top duplicates:`);
999
877
 
1000
878
  // src/core/patterns/risk-patterns.ts
1001
879
  var RISK_PATTERNS = [
1002
- // Note: Environment leakage patterns moved to domain-patterns.ts
1003
- // Note: Admin path patterns moved to admin-patterns.ts
1004
- // to avoid duplication and improve maintainability
1005
880
  // Sensitive Parameter Patterns (HIGH)
1006
881
  {
1007
882
  name: "Authentication Parameter",
@@ -1555,7 +1430,7 @@ function summarizeRisks(request) {
1555
1430
  }
1556
1431
 
1557
1432
  // src/reporters/json-reporter.ts
1558
- var TOOL_VERSION = true ? "1.0.0-alpha.1" : "dev";
1433
+ var TOOL_VERSION = true ? "1.0.0-alpha.2" : "dev";
1559
1434
  function generateJsonReport(summary, discoveryResult, parseResult, riskGroups, config, startTime, options = {}) {
1560
1435
  const {
1561
1436
  pretty = true,
@@ -1725,7 +1600,7 @@ function transformError(error) {
1725
1600
 
1726
1601
  // src/reporters/html-reporter.ts
1727
1602
  var import_fs2 = require("fs");
1728
- var TOOL_VERSION2 = "1.0.0-alpha.1";
1603
+ var TOOL_VERSION2 = "1.0.0-alpha.2";
1729
1604
  function generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options = {}) {
1730
1605
  const maxUrls = options.maxUrlsPerGroup ?? 10;
1731
1606
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
@@ -2280,7 +2155,7 @@ function showCliSummary(result) {
2280
2155
  const riskyUrlCount = result.summary.categoryInsights.reduce((sum, g) => sum + g.count, 0);
2281
2156
  console.log(import_chalk.default.dim("\u2500".repeat(50)));
2282
2157
  if (riskyUrlCount === 0) {
2283
- console.log(import_chalk.default.green("\u2705 No issues found - sitemap looks clean!"));
2158
+ console.log(import_chalk.default.green("No issues found - sitemap looks clean!"));
2284
2159
  } else {
2285
2160
  const { high, medium, low } = result.summary.severityBreakdown;
2286
2161
  const severityParts = [];
@@ -2373,9 +2248,9 @@ async function runAnalysisPipeline(url, config) {
2373
2248
  const duplicatePercentage = duplicatesRemoved / extractionResult.allUrls.length * 100;
2374
2249
  if (!config.silent) {
2375
2250
  if (duplicatesRemoved > 100 || duplicatePercentage > 1) {
2376
- console.log(import_chalk.default.green(`\u2713 Analyzed ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
2251
+ console.log(import_chalk.default.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
2377
2252
  } else {
2378
- console.log(import_chalk.default.green(`\u2713 Analyzed ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs`));
2253
+ console.log(import_chalk.default.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs`));
2379
2254
  }
2380
2255
  }
2381
2256
  phaseStart = Date.now();
@@ -2406,7 +2281,7 @@ async function runAnalysisPipeline(url, config) {
2406
2281
  } else if (!config.silent) {
2407
2282
  const parsingPhase = phaseTimings.find((p) => p.name === "Parsing");
2408
2283
  const sitemapsPerSec = parsingPhase ? (discoveryResult.sitemaps.length / (parsingPhase.duration / 1e3)).toFixed(1) : "0";
2409
- console.log(import_chalk.default.green(`\u2705 Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)
2284
+ console.log(import_chalk.default.green(`Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)
2410
2285
  `));
2411
2286
  }
2412
2287
  if (config.benchmark) {
@@ -2455,7 +2330,7 @@ function handleAnalysisError(error, config) {
2455
2330
  }
2456
2331
  function displayPhaseSummary(timings, totalTime) {
2457
2332
  console.log(import_chalk.default.green(`
2458
- \u2705 Analysis Complete (Total: ${(totalTime / 1e3).toFixed(1)}s)
2333
+ Analysis Complete (Total: ${(totalTime / 1e3).toFixed(1)}s)
2459
2334
  `));
2460
2335
  console.log(import_chalk.default.cyan("Phase Breakdown:"));
2461
2336
  for (const timing of timings) {