@akotliar/sitemap-qa 1.0.0-alpha.1 → 1.0.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +36 -161
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +36 -161
- package/dist/index.js.map +1 -1
- package/package.json +7 -9
package/dist/index.cjs
CHANGED
|
@@ -312,31 +312,6 @@ async function fetchUrl(url, options = {}) {
|
|
|
312
312
|
}
|
|
313
313
|
|
|
314
314
|
// src/core/discovery.ts
|
|
315
|
-
function normalizeBaseUrl(url) {
|
|
316
|
-
const parsed = new URL(url);
|
|
317
|
-
return parsed.origin;
|
|
318
|
-
}
|
|
319
|
-
async function detectCanonicalDomain(baseUrl, config) {
|
|
320
|
-
const urlObj = new URL(baseUrl);
|
|
321
|
-
const hasWww = urlObj.hostname.startsWith("www.");
|
|
322
|
-
const alternateHostname = hasWww ? urlObj.hostname.substring(4) : `www.${urlObj.hostname}`;
|
|
323
|
-
const alternateUrl = `${urlObj.protocol}//${alternateHostname}/robots.txt`;
|
|
324
|
-
try {
|
|
325
|
-
const result = await fetchUrl(alternateUrl, {
|
|
326
|
-
timeout: config.timeout,
|
|
327
|
-
maxRetries: 1
|
|
328
|
-
});
|
|
329
|
-
if (result.statusCode === 200 || result.statusCode === 404) {
|
|
330
|
-
return alternateHostname;
|
|
331
|
-
}
|
|
332
|
-
return urlObj.hostname;
|
|
333
|
-
} catch (error) {
|
|
334
|
-
if (error instanceof HttpError && error.statusCode === 301) {
|
|
335
|
-
return urlObj.hostname;
|
|
336
|
-
}
|
|
337
|
-
return urlObj.hostname;
|
|
338
|
-
}
|
|
339
|
-
}
|
|
340
315
|
async function tryStandardPaths(baseUrl, config) {
|
|
341
316
|
const baseDomain = new URL(baseUrl).origin;
|
|
342
317
|
const accessIssues = [];
|
|
@@ -480,13 +455,11 @@ function extractSitemapIndexUrls(xmlContent) {
|
|
|
480
455
|
}
|
|
481
456
|
return urls;
|
|
482
457
|
}
|
|
483
|
-
async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDomain) {
|
|
458
|
+
async function discoverAllSitemaps(initialSitemaps, config) {
|
|
484
459
|
const finalSitemaps = [];
|
|
485
460
|
const toProcess = [...initialSitemaps];
|
|
486
461
|
const processed = /* @__PURE__ */ new Set();
|
|
487
|
-
const
|
|
488
|
-
const redirected = /* @__PURE__ */ new Set();
|
|
489
|
-
let detectedCanonical = canonicalDomain;
|
|
462
|
+
const inaccessible = /* @__PURE__ */ new Set();
|
|
490
463
|
const BATCH_SIZE = config.discoveryConcurrency || 50;
|
|
491
464
|
while (toProcess.length > 0) {
|
|
492
465
|
const batch = toProcess.splice(0, Math.min(BATCH_SIZE, toProcess.length));
|
|
@@ -519,40 +492,12 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
|
|
|
519
492
|
return { type: "sitemap", url: sitemapUrl };
|
|
520
493
|
}
|
|
521
494
|
} catch (error) {
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
detectedCanonical = await detectCanonicalDomain(baseUrl, config);
|
|
527
|
-
if (config.verbose) {
|
|
528
|
-
console.log(`Canonical domain detected: ${detectedCanonical}`);
|
|
529
|
-
}
|
|
530
|
-
}
|
|
531
|
-
try {
|
|
532
|
-
const sitemapUrlObj = new URL(sitemapUrl);
|
|
533
|
-
if (sitemapUrlObj.hostname !== detectedCanonical) {
|
|
534
|
-
console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
|
|
535
|
-
console.warn(` Problem: The sitemap index contains a URL that redirects.`);
|
|
536
|
-
console.warn(` Likely issue: Domain mismatch - expected "${detectedCanonical}" but got "${sitemapUrlObj.hostname}"`);
|
|
537
|
-
console.warn(` Fix: Update sitemap index to use "https://${detectedCanonical}${sitemapUrlObj.pathname}"`);
|
|
538
|
-
} else {
|
|
539
|
-
console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
|
|
540
|
-
console.warn(` Fix: Update the sitemap index to reference the final URL after redirect.`);
|
|
541
|
-
}
|
|
542
|
-
} catch {
|
|
543
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
544
|
-
console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
|
|
545
|
-
}
|
|
546
|
-
}
|
|
547
|
-
return { type: "redirect" };
|
|
548
|
-
} else {
|
|
549
|
-
failed.add(sitemapUrl);
|
|
550
|
-
if (config.verbose) {
|
|
551
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
552
|
-
console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
|
|
553
|
-
}
|
|
554
|
-
return { type: "failed" };
|
|
495
|
+
inaccessible.add(sitemapUrl);
|
|
496
|
+
if (config.verbose) {
|
|
497
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
498
|
+
console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
|
|
555
499
|
}
|
|
500
|
+
return { type: "failed" };
|
|
556
501
|
}
|
|
557
502
|
}));
|
|
558
503
|
for (const result of batchResults) {
|
|
@@ -567,117 +512,50 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
|
|
|
567
512
|
break;
|
|
568
513
|
}
|
|
569
514
|
}
|
|
570
|
-
|
|
571
|
-
const totalFailed = failed.size;
|
|
572
|
-
const totalRedirected = redirected.size;
|
|
573
|
-
const sitemapIndexCount = totalProcessed - finalSitemaps.length - totalFailed - totalRedirected;
|
|
574
|
-
if (finalSitemaps.length === 0 && totalProcessed > 0) {
|
|
515
|
+
if (finalSitemaps.length === 0 && inaccessible.size > 0) {
|
|
575
516
|
console.warn(`
|
|
576
|
-
\u26A0\uFE0F
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
console.warn(` - ${totalRedirected} sitemap(s) return 301 redirects (content not accessible without following redirect)`);
|
|
581
|
-
}
|
|
582
|
-
if (totalFailed > 0) {
|
|
583
|
-
console.warn(` - ${totalFailed} sitemap(s) returned errors (404, 403, 500, or network issues)`);
|
|
584
|
-
}
|
|
585
|
-
} else if (totalRedirected > 0) {
|
|
586
|
-
console.warn(`All ${totalRedirected} sitemap(s) return 301 redirects.`);
|
|
587
|
-
} else if (totalFailed > 0) {
|
|
588
|
-
console.warn(`All ${totalFailed} sitemap(s) returned errors.`);
|
|
589
|
-
console.warn(`
|
|
590
|
-
Common causes:`);
|
|
591
|
-
console.warn(` - 403 Forbidden: Bot protection (Cloudflare, etc.) or IP blocking`);
|
|
592
|
-
console.warn(` - 404 Not Found: Sitemaps don't exist at these URLs`);
|
|
593
|
-
console.warn(` - 500/502/503: Server errors or maintenance`);
|
|
594
|
-
console.warn(`
|
|
595
|
-
If sitemaps work in your browser but not here, the site likely has bot protection.`);
|
|
596
|
-
console.warn(`Try: Check if sitemaps load without JavaScript, or contact site administrator.`);
|
|
597
|
-
} else {
|
|
598
|
-
console.warn(`Processed ${totalProcessed} URL(s) but found no accessible sitemaps.`);
|
|
599
|
-
}
|
|
600
|
-
console.warn(`
|
|
601
|
-
Note: This tool does not follow redirects for sitemap URLs.`);
|
|
602
|
-
if (totalRedirected > 0) {
|
|
603
|
-
console.warn(`
|
|
604
|
-
Possible causes of redirects:`);
|
|
605
|
-
console.warn(` - Sitemap index uses non-canonical domain (e.g., missing 'www' or vice versa)`);
|
|
606
|
-
console.warn(` - Sitemap URLs redirect from HTTP to HTTPS`);
|
|
607
|
-
console.warn(` - Intentional redirects in your site configuration`);
|
|
608
|
-
console.warn(`
|
|
609
|
-
Recommendation: Update sitemap index URLs to match the final destination (no redirects).`);
|
|
610
|
-
}
|
|
611
|
-
console.warn(``);
|
|
612
|
-
}
|
|
613
|
-
return { sitemaps: finalSitemaps, canonicalDomain: detectedCanonical };
|
|
517
|
+
\u26A0\uFE0F All ${inaccessible.size} sitemap(s) were inaccessible`);
|
|
518
|
+
console.warn(`Common causes: 403/404 errors, network issues, or bot protection`);
|
|
519
|
+
}
|
|
520
|
+
return finalSitemaps;
|
|
614
521
|
}
|
|
615
522
|
async function discoverSitemaps(baseUrl, config) {
|
|
616
|
-
const normalizedUrl = normalizeBaseUrl(baseUrl);
|
|
617
|
-
let allAccessIssues = [];
|
|
618
|
-
let canonicalDomain;
|
|
523
|
+
const normalizedUrl = new URL(baseUrl).origin;
|
|
619
524
|
if (config.verbose) {
|
|
620
|
-
console.log("
|
|
525
|
+
console.log("Checking robots.txt for sitemap directives...");
|
|
621
526
|
}
|
|
622
527
|
const robotsSitemaps = await parseRobotsTxt(normalizedUrl, config);
|
|
623
528
|
if (robotsSitemaps.length > 0) {
|
|
624
|
-
const
|
|
625
|
-
canonicalDomain = detected;
|
|
529
|
+
const sitemaps = await discoverAllSitemaps(robotsSitemaps, config);
|
|
626
530
|
return {
|
|
627
|
-
sitemaps
|
|
531
|
+
sitemaps,
|
|
628
532
|
source: "robots-txt",
|
|
629
|
-
accessIssues: []
|
|
630
|
-
// Clear access issues since we found working sitemaps
|
|
631
|
-
canonicalDomain
|
|
533
|
+
accessIssues: []
|
|
632
534
|
};
|
|
633
535
|
}
|
|
634
536
|
if (config.verbose) {
|
|
635
|
-
console.log("
|
|
537
|
+
console.log("Trying standard sitemap paths...");
|
|
636
538
|
}
|
|
637
|
-
const { sitemaps: standardSitemaps, issues
|
|
638
|
-
allAccessIssues = issues;
|
|
539
|
+
const { sitemaps: standardSitemaps, issues } = await tryStandardPaths(normalizedUrl, config);
|
|
639
540
|
if (standardSitemaps.length > 0) {
|
|
640
|
-
const
|
|
641
|
-
|
|
642
|
-
return {
|
|
643
|
-
sitemaps: allSitemaps,
|
|
644
|
-
source: "standard-path",
|
|
645
|
-
accessIssues: [],
|
|
646
|
-
// Clear access issues since we found working sitemaps
|
|
647
|
-
canonicalDomain
|
|
648
|
-
};
|
|
649
|
-
}
|
|
650
|
-
if (redirectedToCanonical) {
|
|
651
|
-
const canonicalUrl = `https://${redirectedToCanonical}`;
|
|
652
|
-
console.log(`
|
|
653
|
-
\u{1F4A1} All requests redirected. Retrying with canonical domain: ${redirectedToCanonical}
|
|
654
|
-
`);
|
|
655
|
-
const canonicalRobotsSitemaps = await parseRobotsTxt(canonicalUrl, config);
|
|
656
|
-
if (canonicalRobotsSitemaps.length > 0) {
|
|
657
|
-
const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalRobotsSitemaps, config, canonicalUrl, redirectedToCanonical);
|
|
541
|
+
const sitemaps = await discoverAllSitemaps(standardSitemaps, config);
|
|
542
|
+
if (sitemaps.length > 0) {
|
|
658
543
|
return {
|
|
659
|
-
sitemaps
|
|
660
|
-
source: "robots-txt",
|
|
661
|
-
accessIssues: [],
|
|
662
|
-
canonicalDomain: detected || redirectedToCanonical
|
|
663
|
-
};
|
|
664
|
-
}
|
|
665
|
-
const { sitemaps: canonicalStandardSitemaps } = await tryStandardPaths(canonicalUrl, config);
|
|
666
|
-
if (canonicalStandardSitemaps.length > 0) {
|
|
667
|
-
const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalStandardSitemaps, config, canonicalUrl, redirectedToCanonical);
|
|
668
|
-
return {
|
|
669
|
-
sitemaps: allSitemaps,
|
|
544
|
+
sitemaps,
|
|
670
545
|
source: "standard-path",
|
|
671
|
-
accessIssues: []
|
|
672
|
-
canonicalDomain: detected || redirectedToCanonical
|
|
546
|
+
accessIssues: []
|
|
673
547
|
};
|
|
674
548
|
}
|
|
549
|
+
return {
|
|
550
|
+
sitemaps: [],
|
|
551
|
+
source: "standard-path",
|
|
552
|
+
accessIssues: issues
|
|
553
|
+
};
|
|
675
554
|
}
|
|
676
555
|
return {
|
|
677
556
|
sitemaps: [],
|
|
678
557
|
source: "none",
|
|
679
|
-
accessIssues:
|
|
680
|
-
canonicalDomain
|
|
558
|
+
accessIssues: issues
|
|
681
559
|
};
|
|
682
560
|
}
|
|
683
561
|
|
|
@@ -999,9 +877,6 @@ Top duplicates:`);
|
|
|
999
877
|
|
|
1000
878
|
// src/core/patterns/risk-patterns.ts
|
|
1001
879
|
var RISK_PATTERNS = [
|
|
1002
|
-
// Note: Environment leakage patterns moved to domain-patterns.ts
|
|
1003
|
-
// Note: Admin path patterns moved to admin-patterns.ts
|
|
1004
|
-
// to avoid duplication and improve maintainability
|
|
1005
880
|
// Sensitive Parameter Patterns (HIGH)
|
|
1006
881
|
{
|
|
1007
882
|
name: "Authentication Parameter",
|
|
@@ -1555,7 +1430,7 @@ function summarizeRisks(request) {
|
|
|
1555
1430
|
}
|
|
1556
1431
|
|
|
1557
1432
|
// src/reporters/json-reporter.ts
|
|
1558
|
-
var TOOL_VERSION = true ? "1.0.0-alpha.1" : "dev";
|
|
1433
|
+
var TOOL_VERSION = true ? "1.0.0-alpha.2" : "dev";
|
|
1559
1434
|
function generateJsonReport(summary, discoveryResult, parseResult, riskGroups, config, startTime, options = {}) {
|
|
1560
1435
|
const {
|
|
1561
1436
|
pretty = true,
|
|
@@ -1725,7 +1600,7 @@ function transformError(error) {
|
|
|
1725
1600
|
|
|
1726
1601
|
// src/reporters/html-reporter.ts
|
|
1727
1602
|
var import_fs2 = require("fs");
|
|
1728
|
-
var TOOL_VERSION2 = "1.0.0-alpha.1";
|
|
1603
|
+
var TOOL_VERSION2 = "1.0.0-alpha.2";
|
|
1729
1604
|
function generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options = {}) {
|
|
1730
1605
|
const maxUrls = options.maxUrlsPerGroup ?? 10;
|
|
1731
1606
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -2280,7 +2155,7 @@ function showCliSummary(result) {
|
|
|
2280
2155
|
const riskyUrlCount = result.summary.categoryInsights.reduce((sum, g) => sum + g.count, 0);
|
|
2281
2156
|
console.log(import_chalk.default.dim("\u2500".repeat(50)));
|
|
2282
2157
|
if (riskyUrlCount === 0) {
|
|
2283
|
-
console.log(import_chalk.default.green("
|
|
2158
|
+
console.log(import_chalk.default.green("No issues found - sitemap looks clean!"));
|
|
2284
2159
|
} else {
|
|
2285
2160
|
const { high, medium, low } = result.summary.severityBreakdown;
|
|
2286
2161
|
const severityParts = [];
|
|
@@ -2373,9 +2248,9 @@ async function runAnalysisPipeline(url, config) {
|
|
|
2373
2248
|
const duplicatePercentage = duplicatesRemoved / extractionResult.allUrls.length * 100;
|
|
2374
2249
|
if (!config.silent) {
|
|
2375
2250
|
if (duplicatesRemoved > 100 || duplicatePercentage > 1) {
|
|
2376
|
-
console.log(import_chalk.default.green(
|
|
2251
|
+
console.log(import_chalk.default.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
|
|
2377
2252
|
} else {
|
|
2378
|
-
console.log(import_chalk.default.green(
|
|
2253
|
+
console.log(import_chalk.default.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs`));
|
|
2379
2254
|
}
|
|
2380
2255
|
}
|
|
2381
2256
|
phaseStart = Date.now();
|
|
@@ -2406,7 +2281,7 @@ async function runAnalysisPipeline(url, config) {
|
|
|
2406
2281
|
} else if (!config.silent) {
|
|
2407
2282
|
const parsingPhase = phaseTimings.find((p) => p.name === "Parsing");
|
|
2408
2283
|
const sitemapsPerSec = parsingPhase ? (discoveryResult.sitemaps.length / (parsingPhase.duration / 1e3)).toFixed(1) : "0";
|
|
2409
|
-
console.log(import_chalk.default.green(
|
|
2284
|
+
console.log(import_chalk.default.green(`Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)
|
|
2410
2285
|
`));
|
|
2411
2286
|
}
|
|
2412
2287
|
if (config.benchmark) {
|
|
@@ -2455,7 +2330,7 @@ function handleAnalysisError(error, config) {
|
|
|
2455
2330
|
}
|
|
2456
2331
|
function displayPhaseSummary(timings, totalTime) {
|
|
2457
2332
|
console.log(import_chalk.default.green(`
|
|
2458
|
-
|
|
2333
|
+
Analysis Complete (Total: ${(totalTime / 1e3).toFixed(1)}s)
|
|
2459
2334
|
`));
|
|
2460
2335
|
console.log(import_chalk.default.cyan("Phase Breakdown:"));
|
|
2461
2336
|
for (const timing of timings) {
|