@akotliar/sitemap-qa 1.0.0-alpha.1 → 1.0.0-alpha.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -289,31 +289,6 @@ async function fetchUrl(url, options = {}) {
289
289
  }
290
290
 
291
291
  // src/core/discovery.ts
292
- function normalizeBaseUrl(url) {
293
- const parsed = new URL(url);
294
- return parsed.origin;
295
- }
296
- async function detectCanonicalDomain(baseUrl, config) {
297
- const urlObj = new URL(baseUrl);
298
- const hasWww = urlObj.hostname.startsWith("www.");
299
- const alternateHostname = hasWww ? urlObj.hostname.substring(4) : `www.${urlObj.hostname}`;
300
- const alternateUrl = `${urlObj.protocol}//${alternateHostname}/robots.txt`;
301
- try {
302
- const result = await fetchUrl(alternateUrl, {
303
- timeout: config.timeout,
304
- maxRetries: 1
305
- });
306
- if (result.statusCode === 200 || result.statusCode === 404) {
307
- return alternateHostname;
308
- }
309
- return urlObj.hostname;
310
- } catch (error) {
311
- if (error instanceof HttpError && error.statusCode === 301) {
312
- return urlObj.hostname;
313
- }
314
- return urlObj.hostname;
315
- }
316
- }
317
292
  async function tryStandardPaths(baseUrl, config) {
318
293
  const baseDomain = new URL(baseUrl).origin;
319
294
  const accessIssues = [];
@@ -457,13 +432,11 @@ function extractSitemapIndexUrls(xmlContent) {
457
432
  }
458
433
  return urls;
459
434
  }
460
- async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDomain, _maxDepth = 10) {
435
+ async function discoverAllSitemaps(initialSitemaps, config) {
461
436
  const finalSitemaps = [];
462
437
  const toProcess = [...initialSitemaps];
463
438
  const processed = /* @__PURE__ */ new Set();
464
- const failed = /* @__PURE__ */ new Set();
465
- const redirected = /* @__PURE__ */ new Set();
466
- let detectedCanonical = canonicalDomain;
439
+ const inaccessible = /* @__PURE__ */ new Set();
467
440
  const BATCH_SIZE = config.discoveryConcurrency || 50;
468
441
  while (toProcess.length > 0) {
469
442
  const batch = toProcess.splice(0, Math.min(BATCH_SIZE, toProcess.length));
@@ -496,40 +469,12 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
496
469
  return { type: "sitemap", url: sitemapUrl };
497
470
  }
498
471
  } catch (error) {
499
- if (error instanceof HttpError && error.statusCode === 301) {
500
- redirected.add(sitemapUrl);
501
- if (config.verbose) {
502
- if (!detectedCanonical) {
503
- detectedCanonical = await detectCanonicalDomain(baseUrl, config);
504
- if (config.verbose) {
505
- console.log(`Canonical domain detected: ${detectedCanonical}`);
506
- }
507
- }
508
- try {
509
- const sitemapUrlObj = new URL(sitemapUrl);
510
- if (sitemapUrlObj.hostname !== detectedCanonical) {
511
- console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
512
- console.warn(` Problem: The sitemap index contains a URL that redirects.`);
513
- console.warn(` Likely issue: Domain mismatch - expected "${detectedCanonical}" but got "${sitemapUrlObj.hostname}"`);
514
- console.warn(` Fix: Update sitemap index to use "https://${detectedCanonical}${sitemapUrlObj.pathname}"`);
515
- } else {
516
- console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
517
- console.warn(` Fix: Update the sitemap index to reference the final URL after redirect.`);
518
- }
519
- } catch {
520
- const message = error instanceof Error ? error.message : String(error);
521
- console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
522
- }
523
- }
524
- return { type: "redirect" };
525
- } else {
526
- failed.add(sitemapUrl);
527
- if (config.verbose) {
528
- const message = error instanceof Error ? error.message : String(error);
529
- console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
530
- }
531
- return { type: "failed" };
472
+ inaccessible.add(sitemapUrl);
473
+ if (config.verbose) {
474
+ const message = error instanceof Error ? error.message : String(error);
475
+ console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
532
476
  }
477
+ return { type: "failed" };
533
478
  }
534
479
  }));
535
480
  for (const result of batchResults) {
@@ -544,117 +489,50 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
544
489
  break;
545
490
  }
546
491
  }
547
- const totalProcessed = processed.size;
548
- const totalFailed = failed.size;
549
- const totalRedirected = redirected.size;
550
- const sitemapIndexCount = totalProcessed - finalSitemaps.length - totalFailed - totalRedirected;
551
- if (finalSitemaps.length === 0 && totalProcessed > 0) {
492
+ if (finalSitemaps.length === 0 && inaccessible.size > 0) {
552
493
  console.warn(`
553
- \u26A0\uFE0F SITEMAP DISCOVERY ISSUE`);
554
- if (sitemapIndexCount > 0 && (totalFailed > 0 || totalRedirected > 0)) {
555
- console.warn(`Found ${sitemapIndexCount} sitemap index(es) containing ${totalFailed + totalRedirected} child sitemap(s):`);
556
- if (totalRedirected > 0) {
557
- console.warn(` - ${totalRedirected} sitemap(s) return 301 redirects (content not accessible without following redirect)`);
558
- }
559
- if (totalFailed > 0) {
560
- console.warn(` - ${totalFailed} sitemap(s) returned errors (404, 403, 500, or network issues)`);
561
- }
562
- } else if (totalRedirected > 0) {
563
- console.warn(`All ${totalRedirected} sitemap(s) return 301 redirects.`);
564
- } else if (totalFailed > 0) {
565
- console.warn(`All ${totalFailed} sitemap(s) returned errors.`);
566
- console.warn(`
567
- Common causes:`);
568
- console.warn(` - 403 Forbidden: Bot protection (Cloudflare, etc.) or IP blocking`);
569
- console.warn(` - 404 Not Found: Sitemaps don't exist at these URLs`);
570
- console.warn(` - 500/502/503: Server errors or maintenance`);
571
- console.warn(`
572
- If sitemaps work in your browser but not here, the site likely has bot protection.`);
573
- console.warn(`Try: Check if sitemaps load without JavaScript, or contact site administrator.`);
574
- } else {
575
- console.warn(`Processed ${totalProcessed} URL(s) but found no accessible sitemaps.`);
576
- }
577
- console.warn(`
578
- Note: This tool does not follow redirects for sitemap URLs.`);
579
- if (totalRedirected > 0) {
580
- console.warn(`
581
- Possible causes of redirects:`);
582
- console.warn(` - Sitemap index uses non-canonical domain (e.g., missing 'www' or vice versa)`);
583
- console.warn(` - Sitemap URLs redirect from HTTP to HTTPS`);
584
- console.warn(` - Intentional redirects in your site configuration`);
585
- console.warn(`
586
- Recommendation: Update sitemap index URLs to match the final destination (no redirects).`);
587
- }
588
- console.warn(``);
589
- }
590
- return { sitemaps: finalSitemaps, canonicalDomain: detectedCanonical };
494
+ \u26A0\uFE0F All ${inaccessible.size} sitemap(s) were inaccessible`);
495
+ console.warn(`Common causes: 403/404 errors, network issues, or bot protection`);
496
+ }
497
+ return finalSitemaps;
591
498
  }
592
499
  async function discoverSitemaps(baseUrl, config) {
593
- const normalizedUrl = normalizeBaseUrl(baseUrl);
594
- let allAccessIssues = [];
595
- let canonicalDomain;
500
+ const normalizedUrl = new URL(baseUrl).origin;
596
501
  if (config.verbose) {
597
- console.log("Strategy 1: Checking robots.txt for sitemap directives...");
502
+ console.log("Checking robots.txt for sitemap directives...");
598
503
  }
599
504
  const robotsSitemaps = await parseRobotsTxt(normalizedUrl, config);
600
505
  if (robotsSitemaps.length > 0) {
601
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(robotsSitemaps, config, normalizedUrl, canonicalDomain);
602
- canonicalDomain = detected;
506
+ const sitemaps = await discoverAllSitemaps(robotsSitemaps, config);
603
507
  return {
604
- sitemaps: allSitemaps,
508
+ sitemaps,
605
509
  source: "robots-txt",
606
- accessIssues: [],
607
- // Clear access issues since we found working sitemaps
608
- canonicalDomain
510
+ accessIssues: []
609
511
  };
610
512
  }
611
513
  if (config.verbose) {
612
- console.log("Strategy 2: Trying standard sitemap paths...");
514
+ console.log("Trying standard sitemap paths...");
613
515
  }
614
- const { sitemaps: standardSitemaps, issues, redirectedToCanonical } = await tryStandardPaths(normalizedUrl, config);
615
- allAccessIssues = issues;
516
+ const { sitemaps: standardSitemaps, issues } = await tryStandardPaths(normalizedUrl, config);
616
517
  if (standardSitemaps.length > 0) {
617
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(standardSitemaps, config, normalizedUrl, canonicalDomain);
618
- canonicalDomain = detected;
619
- return {
620
- sitemaps: allSitemaps,
621
- source: "standard-path",
622
- accessIssues: [],
623
- // Clear access issues since we found working sitemaps
624
- canonicalDomain
625
- };
626
- }
627
- if (redirectedToCanonical) {
628
- const canonicalUrl = `https://${redirectedToCanonical}`;
629
- console.log(`
630
- \u{1F4A1} All requests redirected. Retrying with canonical domain: ${redirectedToCanonical}
631
- `);
632
- const canonicalRobotsSitemaps = await parseRobotsTxt(canonicalUrl, config);
633
- if (canonicalRobotsSitemaps.length > 0) {
634
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalRobotsSitemaps, config, canonicalUrl, redirectedToCanonical);
518
+ const sitemaps = await discoverAllSitemaps(standardSitemaps, config);
519
+ if (sitemaps.length > 0) {
635
520
  return {
636
- sitemaps: allSitemaps,
637
- source: "robots-txt",
638
- accessIssues: [],
639
- canonicalDomain: detected || redirectedToCanonical
640
- };
641
- }
642
- const { sitemaps: canonicalStandardSitemaps } = await tryStandardPaths(canonicalUrl, config);
643
- if (canonicalStandardSitemaps.length > 0) {
644
- const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalStandardSitemaps, config, canonicalUrl, redirectedToCanonical);
645
- return {
646
- sitemaps: allSitemaps,
521
+ sitemaps,
647
522
  source: "standard-path",
648
- accessIssues: [],
649
- canonicalDomain: detected || redirectedToCanonical
523
+ accessIssues: []
650
524
  };
651
525
  }
526
+ return {
527
+ sitemaps: [],
528
+ source: "standard-path",
529
+ accessIssues: issues
530
+ };
652
531
  }
653
532
  return {
654
533
  sitemaps: [],
655
534
  source: "none",
656
- accessIssues: allAccessIssues,
657
- canonicalDomain
535
+ accessIssues: issues
658
536
  };
659
537
  }
660
538
 
@@ -976,9 +854,6 @@ Top duplicates:`);
976
854
 
977
855
  // src/core/patterns/risk-patterns.ts
978
856
  var RISK_PATTERNS = [
979
- // Note: Environment leakage patterns moved to domain-patterns.ts
980
- // Note: Admin path patterns moved to admin-patterns.ts
981
- // to avoid duplication and improve maintainability
982
857
  // Sensitive Parameter Patterns (HIGH)
983
858
  {
984
859
  name: "Authentication Parameter",
@@ -1532,7 +1407,7 @@ function summarizeRisks(request) {
1532
1407
  }
1533
1408
 
1534
1409
  // src/reporters/json-reporter.ts
1535
- var TOOL_VERSION = true ? "1.0.0-alpha.1" : "dev";
1410
+ var TOOL_VERSION = true ? "1.0.0-alpha.2" : "dev";
1536
1411
  function generateJsonReport(summary, discoveryResult, parseResult, riskGroups, config, startTime, options = {}) {
1537
1412
  const {
1538
1413
  pretty = true,
@@ -1702,7 +1577,7 @@ function transformError(error) {
1702
1577
 
1703
1578
  // src/reporters/html-reporter.ts
1704
1579
  import { promises as fs } from "fs";
1705
- var TOOL_VERSION2 = "1.0.0-alpha.1";
1580
+ var TOOL_VERSION2 = "1.0.0-alpha.2";
1706
1581
  function generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options = {}) {
1707
1582
  const maxUrls = options.maxUrlsPerGroup ?? 10;
1708
1583
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
@@ -2257,7 +2132,7 @@ function showCliSummary(result) {
2257
2132
  const riskyUrlCount = result.summary.categoryInsights.reduce((sum, g) => sum + g.count, 0);
2258
2133
  console.log(chalk.dim("\u2500".repeat(50)));
2259
2134
  if (riskyUrlCount === 0) {
2260
- console.log(chalk.green("\u2705 No issues found - sitemap looks clean!"));
2135
+ console.log(chalk.green("No issues found - sitemap looks clean!"));
2261
2136
  } else {
2262
2137
  const { high, medium, low } = result.summary.severityBreakdown;
2263
2138
  const severityParts = [];
@@ -2350,9 +2225,9 @@ async function runAnalysisPipeline(url, config) {
2350
2225
  const duplicatePercentage = duplicatesRemoved / extractionResult.allUrls.length * 100;
2351
2226
  if (!config.silent) {
2352
2227
  if (duplicatesRemoved > 100 || duplicatePercentage > 1) {
2353
- console.log(chalk.green(`\u2713 Analyzed ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
2228
+ console.log(chalk.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
2354
2229
  } else {
2355
- console.log(chalk.green(`\u2713 Analyzed ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs`));
2230
+ console.log(chalk.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs`));
2356
2231
  }
2357
2232
  }
2358
2233
  phaseStart = Date.now();
@@ -2383,7 +2258,7 @@ async function runAnalysisPipeline(url, config) {
2383
2258
  } else if (!config.silent) {
2384
2259
  const parsingPhase = phaseTimings.find((p) => p.name === "Parsing");
2385
2260
  const sitemapsPerSec = parsingPhase ? (discoveryResult.sitemaps.length / (parsingPhase.duration / 1e3)).toFixed(1) : "0";
2386
- console.log(chalk.green(`\u2705 Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)
2261
+ console.log(chalk.green(`Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)
2387
2262
  `));
2388
2263
  }
2389
2264
  if (config.benchmark) {
@@ -2432,7 +2307,7 @@ function handleAnalysisError(error, config) {
2432
2307
  }
2433
2308
  function displayPhaseSummary(timings, totalTime) {
2434
2309
  console.log(chalk.green(`
2435
- \u2705 Analysis Complete (Total: ${(totalTime / 1e3).toFixed(1)}s)
2310
+ Analysis Complete (Total: ${(totalTime / 1e3).toFixed(1)}s)
2436
2311
  `));
2437
2312
  console.log(chalk.cyan("Phase Breakdown:"));
2438
2313
  for (const timing of timings) {