@akotliar/sitemap-qa 1.0.0-alpha.1 → 1.0.0-alpha.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +36 -161
- package/dist/index.js.map +1 -1
- package/package.json +7 -9
- package/dist/index.cjs +0 -2523
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.cts +0 -1
package/dist/index.js
CHANGED
|
@@ -289,31 +289,6 @@ async function fetchUrl(url, options = {}) {
|
|
|
289
289
|
}
|
|
290
290
|
|
|
291
291
|
// src/core/discovery.ts
|
|
292
|
-
function normalizeBaseUrl(url) {
|
|
293
|
-
const parsed = new URL(url);
|
|
294
|
-
return parsed.origin;
|
|
295
|
-
}
|
|
296
|
-
async function detectCanonicalDomain(baseUrl, config) {
|
|
297
|
-
const urlObj = new URL(baseUrl);
|
|
298
|
-
const hasWww = urlObj.hostname.startsWith("www.");
|
|
299
|
-
const alternateHostname = hasWww ? urlObj.hostname.substring(4) : `www.${urlObj.hostname}`;
|
|
300
|
-
const alternateUrl = `${urlObj.protocol}//${alternateHostname}/robots.txt`;
|
|
301
|
-
try {
|
|
302
|
-
const result = await fetchUrl(alternateUrl, {
|
|
303
|
-
timeout: config.timeout,
|
|
304
|
-
maxRetries: 1
|
|
305
|
-
});
|
|
306
|
-
if (result.statusCode === 200 || result.statusCode === 404) {
|
|
307
|
-
return alternateHostname;
|
|
308
|
-
}
|
|
309
|
-
return urlObj.hostname;
|
|
310
|
-
} catch (error) {
|
|
311
|
-
if (error instanceof HttpError && error.statusCode === 301) {
|
|
312
|
-
return urlObj.hostname;
|
|
313
|
-
}
|
|
314
|
-
return urlObj.hostname;
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
292
|
async function tryStandardPaths(baseUrl, config) {
|
|
318
293
|
const baseDomain = new URL(baseUrl).origin;
|
|
319
294
|
const accessIssues = [];
|
|
@@ -457,13 +432,11 @@ function extractSitemapIndexUrls(xmlContent) {
|
|
|
457
432
|
}
|
|
458
433
|
return urls;
|
|
459
434
|
}
|
|
460
|
-
async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDomain) {
|
|
435
|
+
async function discoverAllSitemaps(initialSitemaps, config) {
|
|
461
436
|
const finalSitemaps = [];
|
|
462
437
|
const toProcess = [...initialSitemaps];
|
|
463
438
|
const processed = /* @__PURE__ */ new Set();
|
|
464
|
-
const failed = /* @__PURE__ */ new Set();
|
|
465
|
-
const redirected = /* @__PURE__ */ new Set();
|
|
466
|
-
let detectedCanonical = canonicalDomain;
|
|
439
|
+
const inaccessible = /* @__PURE__ */ new Set();
|
|
467
440
|
const BATCH_SIZE = config.discoveryConcurrency || 50;
|
|
468
441
|
while (toProcess.length > 0) {
|
|
469
442
|
const batch = toProcess.splice(0, Math.min(BATCH_SIZE, toProcess.length));
|
|
@@ -496,40 +469,12 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
|
|
|
496
469
|
return { type: "sitemap", url: sitemapUrl };
|
|
497
470
|
}
|
|
498
471
|
} catch (error) {
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
detectedCanonical = await detectCanonicalDomain(baseUrl, config);
|
|
504
|
-
if (config.verbose) {
|
|
505
|
-
console.log(`Canonical domain detected: ${detectedCanonical}`);
|
|
506
|
-
}
|
|
507
|
-
}
|
|
508
|
-
try {
|
|
509
|
-
const sitemapUrlObj = new URL(sitemapUrl);
|
|
510
|
-
if (sitemapUrlObj.hostname !== detectedCanonical) {
|
|
511
|
-
console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
|
|
512
|
-
console.warn(` Problem: The sitemap index contains a URL that redirects.`);
|
|
513
|
-
console.warn(` Likely issue: Domain mismatch - expected "${detectedCanonical}" but got "${sitemapUrlObj.hostname}"`);
|
|
514
|
-
console.warn(` Fix: Update sitemap index to use "https://${detectedCanonical}${sitemapUrlObj.pathname}"`);
|
|
515
|
-
} else {
|
|
516
|
-
console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
|
|
517
|
-
console.warn(` Fix: Update the sitemap index to reference the final URL after redirect.`);
|
|
518
|
-
}
|
|
519
|
-
} catch {
|
|
520
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
521
|
-
console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
|
|
522
|
-
}
|
|
523
|
-
}
|
|
524
|
-
return { type: "redirect" };
|
|
525
|
-
} else {
|
|
526
|
-
failed.add(sitemapUrl);
|
|
527
|
-
if (config.verbose) {
|
|
528
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
529
|
-
console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
|
|
530
|
-
}
|
|
531
|
-
return { type: "failed" };
|
|
472
|
+
inaccessible.add(sitemapUrl);
|
|
473
|
+
if (config.verbose) {
|
|
474
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
475
|
+
console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
|
|
532
476
|
}
|
|
477
|
+
return { type: "failed" };
|
|
533
478
|
}
|
|
534
479
|
}));
|
|
535
480
|
for (const result of batchResults) {
|
|
@@ -544,117 +489,50 @@ async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDo
|
|
|
544
489
|
break;
|
|
545
490
|
}
|
|
546
491
|
}
|
|
547
|
-
|
|
548
|
-
const totalFailed = failed.size;
|
|
549
|
-
const totalRedirected = redirected.size;
|
|
550
|
-
const sitemapIndexCount = totalProcessed - finalSitemaps.length - totalFailed - totalRedirected;
|
|
551
|
-
if (finalSitemaps.length === 0 && totalProcessed > 0) {
|
|
492
|
+
if (finalSitemaps.length === 0 && inaccessible.size > 0) {
|
|
552
493
|
console.warn(`
|
|
553
|
-
\u26A0\uFE0F
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
console.warn(` - ${totalRedirected} sitemap(s) return 301 redirects (content not accessible without following redirect)`);
|
|
558
|
-
}
|
|
559
|
-
if (totalFailed > 0) {
|
|
560
|
-
console.warn(` - ${totalFailed} sitemap(s) returned errors (404, 403, 500, or network issues)`);
|
|
561
|
-
}
|
|
562
|
-
} else if (totalRedirected > 0) {
|
|
563
|
-
console.warn(`All ${totalRedirected} sitemap(s) return 301 redirects.`);
|
|
564
|
-
} else if (totalFailed > 0) {
|
|
565
|
-
console.warn(`All ${totalFailed} sitemap(s) returned errors.`);
|
|
566
|
-
console.warn(`
|
|
567
|
-
Common causes:`);
|
|
568
|
-
console.warn(` - 403 Forbidden: Bot protection (Cloudflare, etc.) or IP blocking`);
|
|
569
|
-
console.warn(` - 404 Not Found: Sitemaps don't exist at these URLs`);
|
|
570
|
-
console.warn(` - 500/502/503: Server errors or maintenance`);
|
|
571
|
-
console.warn(`
|
|
572
|
-
If sitemaps work in your browser but not here, the site likely has bot protection.`);
|
|
573
|
-
console.warn(`Try: Check if sitemaps load without JavaScript, or contact site administrator.`);
|
|
574
|
-
} else {
|
|
575
|
-
console.warn(`Processed ${totalProcessed} URL(s) but found no accessible sitemaps.`);
|
|
576
|
-
}
|
|
577
|
-
console.warn(`
|
|
578
|
-
Note: This tool does not follow redirects for sitemap URLs.`);
|
|
579
|
-
if (totalRedirected > 0) {
|
|
580
|
-
console.warn(`
|
|
581
|
-
Possible causes of redirects:`);
|
|
582
|
-
console.warn(` - Sitemap index uses non-canonical domain (e.g., missing 'www' or vice versa)`);
|
|
583
|
-
console.warn(` - Sitemap URLs redirect from HTTP to HTTPS`);
|
|
584
|
-
console.warn(` - Intentional redirects in your site configuration`);
|
|
585
|
-
console.warn(`
|
|
586
|
-
Recommendation: Update sitemap index URLs to match the final destination (no redirects).`);
|
|
587
|
-
}
|
|
588
|
-
console.warn(``);
|
|
589
|
-
}
|
|
590
|
-
return { sitemaps: finalSitemaps, canonicalDomain: detectedCanonical };
|
|
494
|
+
\u26A0\uFE0F All ${inaccessible.size} sitemap(s) were inaccessible`);
|
|
495
|
+
console.warn(`Common causes: 403/404 errors, network issues, or bot protection`);
|
|
496
|
+
}
|
|
497
|
+
return finalSitemaps;
|
|
591
498
|
}
|
|
592
499
|
async function discoverSitemaps(baseUrl, config) {
|
|
593
|
-
const normalizedUrl = normalizeBaseUrl(baseUrl);
|
|
594
|
-
let allAccessIssues = [];
|
|
595
|
-
let canonicalDomain;
|
|
500
|
+
const normalizedUrl = new URL(baseUrl).origin;
|
|
596
501
|
if (config.verbose) {
|
|
597
|
-
console.log("
|
|
502
|
+
console.log("Checking robots.txt for sitemap directives...");
|
|
598
503
|
}
|
|
599
504
|
const robotsSitemaps = await parseRobotsTxt(normalizedUrl, config);
|
|
600
505
|
if (robotsSitemaps.length > 0) {
|
|
601
|
-
const
|
|
602
|
-
canonicalDomain = detected;
|
|
506
|
+
const sitemaps = await discoverAllSitemaps(robotsSitemaps, config);
|
|
603
507
|
return {
|
|
604
|
-
sitemaps
|
|
508
|
+
sitemaps,
|
|
605
509
|
source: "robots-txt",
|
|
606
|
-
accessIssues: [],
|
|
607
|
-
// Clear access issues since we found working sitemaps
|
|
608
|
-
canonicalDomain
|
|
510
|
+
accessIssues: []
|
|
609
511
|
};
|
|
610
512
|
}
|
|
611
513
|
if (config.verbose) {
|
|
612
|
-
console.log("
|
|
514
|
+
console.log("Trying standard sitemap paths...");
|
|
613
515
|
}
|
|
614
|
-
const { sitemaps: standardSitemaps, issues, redirectedToCanonical } = await tryStandardPaths(normalizedUrl, config);
|
|
615
|
-
allAccessIssues = issues;
|
|
516
|
+
const { sitemaps: standardSitemaps, issues } = await tryStandardPaths(normalizedUrl, config);
|
|
616
517
|
if (standardSitemaps.length > 0) {
|
|
617
|
-
const
|
|
618
|
-
|
|
619
|
-
return {
|
|
620
|
-
sitemaps: allSitemaps,
|
|
621
|
-
source: "standard-path",
|
|
622
|
-
accessIssues: [],
|
|
623
|
-
// Clear access issues since we found working sitemaps
|
|
624
|
-
canonicalDomain
|
|
625
|
-
};
|
|
626
|
-
}
|
|
627
|
-
if (redirectedToCanonical) {
|
|
628
|
-
const canonicalUrl = `https://${redirectedToCanonical}`;
|
|
629
|
-
console.log(`
|
|
630
|
-
\u{1F4A1} All requests redirected. Retrying with canonical domain: ${redirectedToCanonical}
|
|
631
|
-
`);
|
|
632
|
-
const canonicalRobotsSitemaps = await parseRobotsTxt(canonicalUrl, config);
|
|
633
|
-
if (canonicalRobotsSitemaps.length > 0) {
|
|
634
|
-
const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalRobotsSitemaps, config, canonicalUrl, redirectedToCanonical);
|
|
518
|
+
const sitemaps = await discoverAllSitemaps(standardSitemaps, config);
|
|
519
|
+
if (sitemaps.length > 0) {
|
|
635
520
|
return {
|
|
636
|
-
sitemaps
|
|
637
|
-
source: "robots-txt",
|
|
638
|
-
accessIssues: [],
|
|
639
|
-
canonicalDomain: detected || redirectedToCanonical
|
|
640
|
-
};
|
|
641
|
-
}
|
|
642
|
-
const { sitemaps: canonicalStandardSitemaps } = await tryStandardPaths(canonicalUrl, config);
|
|
643
|
-
if (canonicalStandardSitemaps.length > 0) {
|
|
644
|
-
const { sitemaps: allSitemaps, canonicalDomain: detected } = await discoverAllSitemaps(canonicalStandardSitemaps, config, canonicalUrl, redirectedToCanonical);
|
|
645
|
-
return {
|
|
646
|
-
sitemaps: allSitemaps,
|
|
521
|
+
sitemaps,
|
|
647
522
|
source: "standard-path",
|
|
648
|
-
accessIssues: [],
|
|
649
|
-
canonicalDomain: detected || redirectedToCanonical
|
|
523
|
+
accessIssues: []
|
|
650
524
|
};
|
|
651
525
|
}
|
|
526
|
+
return {
|
|
527
|
+
sitemaps: [],
|
|
528
|
+
source: "standard-path",
|
|
529
|
+
accessIssues: issues
|
|
530
|
+
};
|
|
652
531
|
}
|
|
653
532
|
return {
|
|
654
533
|
sitemaps: [],
|
|
655
534
|
source: "none",
|
|
656
|
-
accessIssues: allAccessIssues,
|
|
657
|
-
canonicalDomain
|
|
535
|
+
accessIssues: issues
|
|
658
536
|
};
|
|
659
537
|
}
|
|
660
538
|
|
|
@@ -976,9 +854,6 @@ Top duplicates:`);
|
|
|
976
854
|
|
|
977
855
|
// src/core/patterns/risk-patterns.ts
|
|
978
856
|
var RISK_PATTERNS = [
|
|
979
|
-
// Note: Environment leakage patterns moved to domain-patterns.ts
|
|
980
|
-
// Note: Admin path patterns moved to admin-patterns.ts
|
|
981
|
-
// to avoid duplication and improve maintainability
|
|
982
857
|
// Sensitive Parameter Patterns (HIGH)
|
|
983
858
|
{
|
|
984
859
|
name: "Authentication Parameter",
|
|
@@ -1532,7 +1407,7 @@ function summarizeRisks(request) {
|
|
|
1532
1407
|
}
|
|
1533
1408
|
|
|
1534
1409
|
// src/reporters/json-reporter.ts
|
|
1535
|
-
var TOOL_VERSION = true ? "1.0.0-alpha.
|
|
1410
|
+
var TOOL_VERSION = true ? "1.0.0-alpha.2" : "dev";
|
|
1536
1411
|
function generateJsonReport(summary, discoveryResult, parseResult, riskGroups, config, startTime, options = {}) {
|
|
1537
1412
|
const {
|
|
1538
1413
|
pretty = true,
|
|
@@ -1702,7 +1577,7 @@ function transformError(error) {
|
|
|
1702
1577
|
|
|
1703
1578
|
// src/reporters/html-reporter.ts
|
|
1704
1579
|
import { promises as fs } from "fs";
|
|
1705
|
-
var TOOL_VERSION2 = "1.0.0-alpha.
|
|
1580
|
+
var TOOL_VERSION2 = "1.0.0-alpha.2";
|
|
1706
1581
|
function generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options = {}) {
|
|
1707
1582
|
const maxUrls = options.maxUrlsPerGroup ?? 10;
|
|
1708
1583
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -2257,7 +2132,7 @@ function showCliSummary(result) {
|
|
|
2257
2132
|
const riskyUrlCount = result.summary.categoryInsights.reduce((sum, g) => sum + g.count, 0);
|
|
2258
2133
|
console.log(chalk.dim("\u2500".repeat(50)));
|
|
2259
2134
|
if (riskyUrlCount === 0) {
|
|
2260
|
-
console.log(chalk.green("
|
|
2135
|
+
console.log(chalk.green("No issues found - sitemap looks clean!"));
|
|
2261
2136
|
} else {
|
|
2262
2137
|
const { high, medium, low } = result.summary.severityBreakdown;
|
|
2263
2138
|
const severityParts = [];
|
|
@@ -2350,9 +2225,9 @@ async function runAnalysisPipeline(url, config) {
|
|
|
2350
2225
|
const duplicatePercentage = duplicatesRemoved / extractionResult.allUrls.length * 100;
|
|
2351
2226
|
if (!config.silent) {
|
|
2352
2227
|
if (duplicatesRemoved > 100 || duplicatePercentage > 1) {
|
|
2353
|
-
console.log(chalk.green(
|
|
2228
|
+
console.log(chalk.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
|
|
2354
2229
|
} else {
|
|
2355
|
-
console.log(chalk.green(
|
|
2230
|
+
console.log(chalk.green(`Found ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs`));
|
|
2356
2231
|
}
|
|
2357
2232
|
}
|
|
2358
2233
|
phaseStart = Date.now();
|
|
@@ -2383,7 +2258,7 @@ async function runAnalysisPipeline(url, config) {
|
|
|
2383
2258
|
} else if (!config.silent) {
|
|
2384
2259
|
const parsingPhase = phaseTimings.find((p) => p.name === "Parsing");
|
|
2385
2260
|
const sitemapsPerSec = parsingPhase ? (discoveryResult.sitemaps.length / (parsingPhase.duration / 1e3)).toFixed(1) : "0";
|
|
2386
|
-
console.log(chalk.green(
|
|
2261
|
+
console.log(chalk.green(`Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)
|
|
2387
2262
|
`));
|
|
2388
2263
|
}
|
|
2389
2264
|
if (config.benchmark) {
|
|
@@ -2432,7 +2307,7 @@ function handleAnalysisError(error, config) {
|
|
|
2432
2307
|
}
|
|
2433
2308
|
function displayPhaseSummary(timings, totalTime) {
|
|
2434
2309
|
console.log(chalk.green(`
|
|
2435
|
-
|
|
2310
|
+
Analysis Complete (Total: ${(totalTime / 1e3).toFixed(1)}s)
|
|
2436
2311
|
`));
|
|
2437
2312
|
console.log(chalk.cyan("Phase Breakdown:"));
|
|
2438
2313
|
for (const timing of timings) {
|