@akotliar/sitemap-qa 1.0.0-alpha.0 → 1.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,27 +1,1721 @@
1
1
  #!/usr/bin/env node
2
- import"dotenv/config";import{Command as qe}from"commander";import{Command as Ne}from"commander";import{promises as re}from"fs";import C from"ora";import k from"chalk";import{readFile as I}from"fs/promises";import{existsSync as D}from"fs";import{join as N}from"path";import{homedir as se}from"os";var L={timeout:30,concurrency:10,outputFormat:"html",outputDir:"./sitemap-qa/report",verbose:!1,baseUrl:"https://example.com",acceptedPatterns:[]};async function F(e){let t={...L},r=N(se(),".sitemap-qa","config.json");if(D(r))try{let a=JSON.parse(await I(r,"utf-8"));t={...t,...a}}catch(a){console.warn(`Warning: Failed to load global config: ${a}`)}let s=N(process.cwd(),".sitemap-qa.config.json");if(D(s))try{let a=JSON.parse(await I(s,"utf-8"));t={...t,...a}}catch(a){console.warn(`Warning: Failed to load project config: ${a}`)}let n=ne();return t={...t,...n},t=ie(t,e),e.baseUrl&&(t.baseUrl=e.baseUrl),ae(t),t}function ne(){let e={};return process.env.SITEMAP_VERIFY_TIMEOUT&&(e.timeout=parseInt(process.env.SITEMAP_VERIFY_TIMEOUT,10)),e}function ie(e,t){let r={...e};return t.timeout&&t.timeout!=="30"&&(r.timeout=parseInt(t.timeout,10)),t.output&&(r.outputFormat=t.output),t.outputDir&&(r.outputDir=t.outputDir),t.verbose===!0&&(r.verbose=!0),t.acceptedPatterns&&(r.acceptedPatterns=t.acceptedPatterns.split(",").map(s=>s.trim()).filter(Boolean)),r}function ae(e){if(e.timeout<1||e.timeout>300)throw new Error("Timeout must be between 1 and 300 seconds");if(!["json","html"].includes(e.outputFormat))throw new Error("Output format must be json or html")}var U=class extends Error{constructor(r,s){super(`Network request failed for ${r}: ${s.message}`);this.url=r;this.originalError=s;this.name="NetworkError"}code="NETWORK_ERROR"},x=class extends Error{constructor(r,s,n){let a=`HTTP ${s} error for ${r}`;s===403&&(a+=`
3
- Note: 403 Forbidden often indicates bot protection (Cloudflare, etc.) or access restrictions`);super(a);this.url=r;this.statusCode=s;this.statusText=n;this.name="HttpError"}code="HTTP_ERROR"};import{chromium as ce}from"playwright";async function le(e,t){let r;try{r=await ce.launch({headless:!0,args:["--disable-blink-features=AutomationControlled","--disable-dev-shm-usage","--no-sandbox"]});let n=await(await r.newContext({userAgent:"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",viewport:{width:1920,height:1080},locale:"en-US",timezoneId:"America/New_York",extraHTTPHeaders:{Accept:"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8","Accept-Language":"en-US,en;q=0.9","Accept-Encoding":"gzip, deflate, br",DNT:"1",Connection:"keep-alive","Upgrade-Insecure-Requests":"1"}})).newPage();await n.addInitScript(()=>{Object.defineProperty(navigator,"webdriver",{get:()=>!1}),window.chrome={runtime:{}};let m=window.navigator.permissions.query;window.navigator.permissions.query=l=>l.name==="notifications"?Promise.resolve({state:Notification.permission}):m(l)}),n.setDefaultTimeout(t*1e3);let a=await n.goto(e,{waitUntil:"domcontentloaded",timeout:t*1e3});if(!a)throw new Error("No response received from page");let c=a.status(),o=await n.content(),i=n.url();if(await r.close(),c>=200&&c<300)return{content:o,statusCode:c,url:i};throw new x(i,c)}catch(s){throw r&&await r.close(),s.code==="HTTP_ERROR"?s:new U(e,s)}}async function S(e,t={}){let{timeout:r=30,maxRetries:s=3,retryDelay:n=1e3,useBrowser:a=!1}=t;new URL(e);let c=[408,429,500,502,503,504],o=null,i=!1;for(let m=0;m<=s;m++){try{if(a||i)return await le(e,r);let l=new AbortController,d=setTimeout(()=>l.abort(),r*1e3),p=await fetch(e,{method:"GET",headers:{"User-Agent":"sitemap-qa/1.0.0",Accept:"text/xml,application/xml,text/plain,*/*"},signal:l.signal,redirect:"follow"});clearTimeout(d);let u=p.status,g=await 
p.text();if(u>=200&&u<300)return{content:g,statusCode:u,url:p.url};if(u===403&&!i){i=!0;continue}if(!c.includes(u))throw new x(p.url,u);o=new x(p.url,u)}catch(l){if(l.code==="HTTP_ERROR"){let d=l;if(!c.includes(d.statusCode))throw l;o=l}else o=new U(e,l);if(m===s)break}if(m<s){let l=n*Math.pow(2,m);await new Promise(d=>setTimeout(d,l))}}throw o}function me(e){return new URL(e).origin}async function ue(e,t){let r=new URL(e),n=r.hostname.startsWith("www.")?r.hostname.substring(4):`www.${r.hostname}`,a=`${r.protocol}//${n}/robots.txt`;try{let c=await S(a,{timeout:t.timeout,maxRetries:1});return c.statusCode===200||c.statusCode===404?n:r.hostname}catch(c){return c instanceof x&&c.statusCode===301,r.hostname}}async function M(e,t){let r=new URL(e).origin,s=[],n=["/sitemap.xml","/sitemap_index.xml","/sitemap-index.xml"],a=await Promise.allSettled(n.map(async c=>{let o=`${r}${c}`;try{return(await S(o,{timeout:t.timeout,maxRetries:1})).statusCode===200?(t.verbose&&console.log(`\u2713 Found sitemap at: ${o}`),{found:!0,url:o}):{found:!1}}catch(i){return i instanceof x?i.statusCode===401||i.statusCode===403?(s.push({url:o,statusCode:i.statusCode,error:i.statusCode===401?"Unauthorized":"Access Denied"}),t.verbose&&console.log(`\u26A0 Access denied: ${o} (${i.statusCode})`)):t.verbose&&console.log(`\u2717 Not found: ${o} (${i.statusCode})`):t.verbose&&console.log(`\u2717 Not found: ${o}`),{found:!1}}}));for(let c of a)if(c.status==="fulfilled"&&c.value.found)return{sitemaps:[c.value.url],issues:s};return t.verbose&&console.log("No sitemap found at standard paths"),{sitemaps:[],issues:s}}async function z(e,t){let r=`${new URL(e).origin}/robots.txt`;try{let n=(await S(r,{timeout:t.timeout,maxRetries:1})).content.split(`
4
- `),a=[];for(let c of n){let o=c.match(/^Sitemap:\s*(.+)$/i);if(o){let i=o[1].trim();try{new URL(i),a.push(i)}catch{t.verbose&&console.warn(`Invalid sitemap URL in robots.txt: ${i}`)}}}return t.verbose&&a.length>0&&console.log(`Found ${a.length} sitemap(s) in robots.txt`),a}catch{return t.verbose&&console.log(`No robots.txt found at ${r}`),[]}}function de(e){if(e.includes("<sitemapindex"))return!0;if(e.includes("<urlset")){let t=/<url[^>]*>.*?<loc>([^<]+)<\/loc>.*?<\/url>/gs,r=Array.from(e.matchAll(t)),s=Math.min(5,r.length),n=0;for(let a=0;a<s;a++){let c=r[a][1].trim().toLowerCase();(c.includes("sitemap")||c.endsWith(".xml"))&&n++}return n>s/2}return!1}function pe(e){let t=[];if(e.includes("<sitemapindex")){let r=/<sitemap[^>]*>(.*?)<\/sitemap>/gs,s;for(;(s=r.exec(e))!==null;){let n=/<loc>([^<]+)<\/loc>/i.exec(s[1]);if(n){let a=n[1].trim();try{new URL(a),t.push(a)}catch{}}}}else{let r=/<url[^>]*>(.*?)<\/url>/gs,s;for(;(s=r.exec(e))!==null;){let n=/<loc>([^<]+)<\/loc>/i.exec(s[1]);if(n){let a=n[1].trim();if(a.toLowerCase().includes("sitemap")||a.toLowerCase().endsWith(".xml"))try{new URL(a),t.push(a)}catch{}}}}return t}async function $(e,t,r,s,n=10){let a=[],c=[...e],o=new Set,i=new Set,m=new Set,l=s,d=5;for(;c.length>0;){let w=c.splice(0,Math.min(d,c.length));if(await Promise.all(w.map(async h=>{if(o.has(h)){t.verbose&&console.warn(`Skipping duplicate sitemap: ${h}`);return}o.add(h);try{let y=await S(h,{timeout:t.timeout,maxRetries:2});if(de(y.content)){t.verbose&&console.log(`Found sitemap index: ${h}`);let R=pe(y.content);c.push(...R),t.verbose&&console.log(` \u2514\u2500 Contains ${R.length} child sitemap(s)`)}else a.push(h),t.verbose&&console.log(`\u2713 Discovered sitemap: ${h}`)}catch(y){if(y instanceof x&&y.statusCode===301?m.add(h):i.add(h),t.verbose){let R=y instanceof Error?y.message:String(y);if(y instanceof x&&y.statusCode===301){l||(l=await ue(r,t),t.verbose&&console.log(`Canonical domain detected: ${l}`));try{let T=new 
URL(h);T.hostname!==l?(console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${h}`),console.warn(" Problem: The sitemap index contains a URL that redirects."),console.warn(` Likely issue: Domain mismatch - expected "${l}" but got "${T.hostname}"`),console.warn(` Fix: Update sitemap index to use "https://${l}${T.pathname}"`)):(console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${h}`),console.warn(" Fix: Update the sitemap index to reference the final URL after redirect."))}catch{console.warn(`Failed to fetch sitemap ${h}: ${R}`)}}else console.warn(`Failed to fetch sitemap ${h}: ${R}`)}}})),o.size>1e3){console.warn("\u26A0\uFE0F Processed over 1000 sitemap URLs. Stopping to prevent excessive requests.");break}}let p=o.size,u=i.size,g=m.size,f=p-a.length-u-g;return a.length===0&&p>0&&(console.warn(`
5
- \u26A0\uFE0F SITEMAP DISCOVERY ISSUE`),f>0&&(u>0||g>0)?(console.warn(`Found ${f} sitemap index(es) containing ${u+g} child sitemap(s):`),g>0&&console.warn(` - ${g} sitemap(s) return 301 redirects (content not accessible without following redirect)`),u>0&&console.warn(` - ${u} sitemap(s) returned errors (404, 403, 500, or network issues)`)):g>0?console.warn(`All ${g} sitemap(s) return 301 redirects.`):u>0?(console.warn(`All ${u} sitemap(s) returned errors.`),console.warn(`
6
- Common causes:`),console.warn(" - 403 Forbidden: Bot protection (Cloudflare, etc.) or IP blocking"),console.warn(" - 404 Not Found: Sitemaps don't exist at these URLs"),console.warn(" - 500/502/503: Server errors or maintenance"),console.warn(`
7
- If sitemaps work in your browser but not here, the site likely has bot protection.`),console.warn("Try: Check if sitemaps load without JavaScript, or contact site administrator.")):console.warn(`Processed ${p} URL(s) but found no accessible sitemaps.`),console.warn(`
8
- Note: This tool does not follow redirects for sitemap URLs.`),g>0&&(console.warn(`
9
- Possible causes of redirects:`),console.warn(" - Sitemap index uses non-canonical domain (e.g., missing 'www' or vice versa)"),console.warn(" - Sitemap URLs redirect from HTTP to HTTPS"),console.warn(" - Intentional redirects in your site configuration"),console.warn(`
10
- Recommendation: Update sitemap index URLs to match the final destination (no redirects).`)),console.warn("")),{sitemaps:a,canonicalDomain:l}}async function O(e,t){let r=me(e),s=[],n;t.verbose&&console.log("Strategy 1: Checking robots.txt for sitemap directives...");let a=await z(r,t);if(a.length>0){let{sitemaps:m,canonicalDomain:l}=await $(a,t,r,n);return n=l,{sitemaps:m,source:"robots-txt",accessIssues:[],canonicalDomain:n}}t.verbose&&console.log("Strategy 2: Trying standard sitemap paths...");let{sitemaps:c,issues:o,redirectedToCanonical:i}=await M(r,t);if(s=o,c.length>0){let{sitemaps:m,canonicalDomain:l}=await $(c,t,r,n);return n=l,{sitemaps:m,source:"standard-path",accessIssues:[],canonicalDomain:n}}if(i){let m=`https://${i}`;console.log(`
11
- \u{1F4A1} All requests redirected. Retrying with canonical domain: ${i}
12
- `);let l=await z(m,t);if(l.length>0){let{sitemaps:p,canonicalDomain:u}=await $(l,t,m,i);return{sitemaps:p,source:"robots-txt",accessIssues:[],canonicalDomain:u||i}}let{sitemaps:d}=await M(m,t);if(d.length>0){let{sitemaps:p,canonicalDomain:u}=await $(d,t,m,i);return{sitemaps:p,source:"standard-path",accessIssues:[],canonicalDomain:u||i}}}return{sitemaps:[],source:"none",accessIssues:s,canonicalDomain:n}}import{XMLParser as ge,XMLValidator as fe}from"fast-xml-parser";var he=new ge({ignoreAttributes:!1,attributeNamePrefix:"@_",textNodeName:"_text",parseAttributeValue:!0,trimValues:!0,allowBooleanAttributes:!0,parseTagValue:!1});function ye(e,t){let r=[];if(e.urlset){let s=Array.isArray(e.urlset.url)?e.urlset.url:[e.urlset.url];for(let n of s)!n||!n.loc||r.push({loc:n.loc,lastmod:n.lastmod,changefreq:n.changefreq,priority:n.priority?parseFloat(n.priority):void 0,source:t})}return r}async function G(e,t){let r=[];try{let s=fe.validate(e);if(s!==!0){let o=typeof s=="object"?s.err.msg:"Invalid XML";return{urls:[],errors:[`[${t}] XML parsing failed: ${o}`],totalCount:0,sitemapUrl:t}}let n=he.parse(e),a=ye(n,t),c=[];for(let o of a)try{new URL(o.loc),o.priority!==void 0&&(o.priority<0||o.priority>1)&&(r.push(`Invalid priority ${o.priority} for ${o.loc} - clamping to 0-1`),o.priority=Math.max(0,Math.min(1,o.priority))),o.changefreq&&(["always","hourly","daily","weekly","monthly","yearly","never"].includes(o.changefreq.toLowerCase())||(r.push(`Invalid changefreq "${o.changefreq}" for ${o.loc}`),o.changefreq=void 0)),c.push(o)}catch{r.push(`Invalid URL format: ${o.loc}`)}return{urls:c,errors:r,totalCount:c.length,sitemapUrl:t}}catch(s){let n=s instanceof Error?s.message:String(s);return{urls:[],errors:[`[${t}] XML parsing failed: ${n}`],totalCount:0,sitemapUrl:t}}}async function q(e,t){let r=[],s=[],n=0,a=0;t.verbose&&console.log(`
13
- Extracting URLs from ${e.length} sitemap(s)...`);let o=await ve(e,10,async i=>{try{t.verbose&&console.log(`Extracting URLs from: ${i}`);let m=await S(i,{timeout:t.timeout,maxRetries:2}),l=await G(m.content,i),d=l.urls.map(p=>({...p,extractedAt:new Date().toISOString()}));return t.verbose&&console.log(` \u2713 Extracted ${l.urls.length} URLs from ${i}`),{success:!0,urls:d,errors:l.errors}}catch(m){let l=`Failed to process ${i}: ${m instanceof Error?m.message:String(m)}`;return t.verbose&&console.error(` \u2717 ${l}`),{success:!1,urls:[],errors:[l]}}});for(let i of o)i.success?(n++,r.push(...i.urls)):a++,s.push(...i.errors);return t.verbose&&(console.log(`
14
- Extraction complete:`),console.log(` - Sitemaps processed: ${n}`),console.log(` - Sitemaps failed: ${a}`),console.log(` - Total URLs: ${r.length}`),console.log(` - Errors: ${s.length}`)),{allUrls:r,sitemapsProcessed:n,sitemapsFailed:a,totalUrls:r.length,errors:s}}async function ve(e,t,r){let s=[];for(let n=0;n<e.length;n+=t){let a=e.slice(n,n+t),c=await Promise.all(a.map(r));s.push(...c)}return s}function be(e){try{let t=new URL(e),r=t.pathname;r.endsWith("/")&&r!=="/"&&(r=r.slice(0,-1));let s=Array.from(t.searchParams.entries()).sort(([a],[c])=>a.localeCompare(c)),n=new URLSearchParams(s);return`${t.protocol}//${t.host}${r}${n.toString()?"?"+n.toString():""}${t.hash}`}catch{return e}}function we(e){if(e.length===1)return e[0];let t={...e[0]},r=e.map(o=>o.source);t.source=r.join(", ");let s=e.map(o=>o.lastmod).filter(o=>!!o).map(o=>new Date(o).getTime()).sort((o,i)=>i-o);s.length>0&&(t.lastmod=new Date(s[0]).toISOString());let n=e.map(o=>o.priority).filter(o=>o!==void 0);n.length>0&&(t.priority=Math.max(...n));let a=e.map(o=>o.changefreq).filter(o=>!!o);if(a.length>0){let o=new Map;for(let m of a)o.set(m,(o.get(m)||0)+1);let i=Array.from(o.entries()).sort((m,l)=>l[1]-m[1]);t.changefreq=i[0][0]}let c=e.map(o=>o.extractedAt).filter(o=>!!o).map(o=>new Date(o).getTime()).sort((o,i)=>i-o);return c.length>0&&(t.extractedAt=new Date(c[0]).toISOString()),t}function j(e,t=!1){let r=e.length;t&&console.log(`
15
- Consolidating ${e.length} URL(s)...`);let s=new Map;for(let c of e){let o=be(c.loc);s.has(o)||s.set(o,[]),s.get(o).push(c)}let n=[],a=[];for(let[c,o]of s.entries()){let i=we(o);n.push(i),o.length>1&&a.push({url:c,count:o.length,sources:o.map(m=>m.source)})}if(t&&(console.log("Consolidation complete:"),console.log(` - Input URLs: ${r}`),console.log(` - Unique URLs: ${n.length}`),console.log(` - Duplicates removed: ${r-n.length}`),a.length>0)){console.log(`
16
- Top duplicates:`);let c=a.sort((o,i)=>i.count-o.count).slice(0,5);for(let o of c)console.log(` - ${o.url} (${o.count} times)`)}return{uniqueUrls:n,totalInputUrls:r,duplicatesRemoved:r-n.length,duplicateGroups:a}}var H=[{name:"Authentication Parameter",category:"sensitive_params",severity:"high",regex:/[?&](token|auth|key|password|secret|apikey|session|credentials)=/i,description:"Query parameter may contain sensitive authentication data"},{name:"Debug Parameter",category:"sensitive_params",severity:"medium",regex:/[?&](debug|trace|verbose|test_mode)=/i,description:"Query parameter may contain debug or diagnostic flag"},{name:"HTTP in HTTPS Site",category:"protocol_inconsistency",severity:"medium",regex:/^http:\/\//,description:"HTTP URL in HTTPS sitemap (potential mixed content)"},{name:"Test Content Path",category:"test_content",severity:"medium",regex:/\/(?:test-|demo-|sample-|temp-|temporary-|placeholder-)|\/(test|demo|sample|temp|temporary|placeholder)(?:\/|$)/i,description:"URL path suggests test, demo, or unfinished content that may not be intended for indexing"}];function E(e){return e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&")}function xe(e){let t=e.split(".");return t.length>=2?t.slice(-2).join("."):e}function V(e,t){let r=new URL(e).hostname,s=xe(r);if(t?.allowedSubdomains&&t.allowedSubdomains.length>0){let c=E(s),i=`^https?://(?!(?:(?:${t.allowedSubdomains.map(E).join("|")})\\.)?${c}(?:/|$))`;return{name:"Domain Mismatch",category:"domain_mismatch",severity:"high",regex:new RegExp(i),description:"URL does not match expected domain or allowed subdomains"}}let a=`^https?://(?!(?:www\\.)?${E(s)}(?:/|$))`;return{name:"Domain Mismatch",category:"domain_mismatch",severity:"high",regex:new RegExp(a),description:`URL does not match expected domain: ${s} (including www variant)`}}var B=[{name:"Staging Subdomain",category:"environment_leakage",severity:"high",regex:/^https?:\/\/(staging|stg)\./i,description:"URL uses staging subdomain"},{name:"Development 
Subdomain",category:"environment_leakage",severity:"high",regex:/^https?:\/\/(dev|development)\./i,description:"URL uses development subdomain"},{name:"QA/Test Subdomain",category:"environment_leakage",severity:"high",regex:/^https?:\/\/(qa|test|uat|preprod)\./i,description:"URL uses test environment subdomain"},{name:"Localhost URL",category:"environment_leakage",severity:"high",regex:/^https?:\/\/(localhost|127\.0\.0\.1|0\.0\.0\.0)/,description:"URL points to localhost (development environment)"},{name:"Environment in Path",category:"environment_leakage",severity:"high",regex:/^https?:\/\/[^/]+\/(staging|dev|qa|uat|preprod)\//i,description:"URL path contains environment identifier at root level"}];var W=[{name:"Admin Path",category:"admin_paths",severity:"high",regex:/\/(admin|administrator)(?:\/|$|\?)/i,description:"URL contains /admin or /administrator as a path segment"},{name:"Dashboard Path",category:"admin_paths",severity:"high",regex:/\/dashboard(?:\/|$|\?)/i,description:"URL contains /dashboard as a path segment"},{name:"Config Path",category:"admin_paths",severity:"high",regex:/\/(config|configuration)(?:\/|$|\?)/i,description:"URL contains /config or /configuration as a path segment"},{name:"Console Path",category:"admin_paths",severity:"high",regex:/\/console(?:\/|$|\?)/i,description:"URL contains /console as a path segment"},{name:"Control Panel Path",category:"admin_paths",severity:"high",regex:/\/(cpanel|control-panel)(?:\/|$|\?)/i,description:"URL contains control panel as a path segment"}],J=[{name:"Internal Content Path",category:"internal_content",severity:"medium",regex:/\/internal\b/i,description:"URL contains /internal path segment - may be internal-only content not intended for public indexing"}],Q=[{name:"Authentication Token Parameter",category:"sensitive_params",severity:"high",regex:/[?&](token|auth_token|access_token|api_token)=/i,description:"Query parameter may contain authentication token"},{name:"API Key 
Parameter",category:"sensitive_params",severity:"high",regex:/[?&](apikey|api_key|key)=/i,description:"Query parameter may contain API key"},{name:"Password Parameter",category:"sensitive_params",severity:"high",regex:/[?&](password|passwd|pwd)=/i,description:"Query parameter may contain password"},{name:"Secret Parameter",category:"sensitive_params",severity:"high",regex:/[?&](secret|client_secret)=/i,description:"Query parameter may contain secret value"},{name:"Session Parameter",category:"sensitive_params",severity:"high",regex:/[?&](session|sessionid|sid)=/i,description:"Query parameter may contain session identifier"},{name:"Credentials Parameter",category:"sensitive_params",severity:"high",regex:/[?&]credentials=/i,description:"Query parameter may contain credentials"},{name:"Debug Parameter",category:"sensitive_params",severity:"medium",regex:/[?&](debug|trace|verbose)=/i,description:"Query parameter contains debug or diagnostic flag"},{name:"Test Mode Parameter",category:"sensitive_params",severity:"medium",regex:/[?&](test_mode|test|testing)=/i,description:"Query parameter indicates test mode"}];function K(e){try{let t=new URL(e),r=["token","auth","auth_token","access_token","api_token","apikey","api_key","key","password","passwd","pwd","secret","client_secret","session","sessionid","sid","credentials"];for(let s of r)t.searchParams.has(s)&&t.searchParams.set(s,"[REDACTED]");return t.toString()}catch{return e}}function Re(e,t,r){switch(e){case"environment_leakage":return{rationale:`Production sitemap contains ${r} URL(s) from non-production environments (staging, dev, QA, test). This indicates configuration errors or environment leakage.`,recommendedAction:"Verify sitemap generation excludes non-production environments. Review deployment configuration and environment filtering rules."};case"admin_paths":return{rationale:`${r} administrative path(s) detected in public sitemap (admin, dashboard, config). 
These paths may expose privileged access points.`,recommendedAction:"Confirm if admin paths should be publicly indexed. Consider excluding via robots.txt or removing from sitemap. Verify access controls."};case"internal_content":return{rationale:`${r} URL(s) contain "internal" in the path. These may be internal-facing content not intended for public indexing.`,recommendedAction:"Review URLs to determine if they should be publicly accessible. Consider excluding internal content from sitemap or adding noindex meta tags."};case"test_content":return{rationale:`${r} URL(s) contain test/demo/sample identifiers. These may be placeholder or unfinished content not intended for indexing.`,recommendedAction:"Review and remove test content from production sitemaps. Verify content is production-ready before including in sitemap."};case"sensitive_params":return{rationale:`${r} URL(s) contain sensitive query parameters (token, auth, key, password, session). This may expose authentication credentials or debugging flags.`,recommendedAction:"Review why sensitive parameters are in sitemap URLs. Remove authentication tokens from URLs. Consider POST requests for sensitive data."};case"protocol_inconsistency":return{rationale:`${r} URL(s) use HTTP protocol in HTTPS sitemap. This creates mixed content warnings and potential security issues.`,recommendedAction:"Update URLs to use HTTPS consistently. Verify SSL certificate coverage. Check for hardcoded HTTP URLs in content."};case"domain_mismatch":return{rationale:`${r} URL(s) do not match expected base domain. This may indicate external links, CDN URLs, or configuration errors.`,recommendedAction:"Verify if external domains are intentional. Review sitemap generation logic. 
Confirm CDN or subdomain configuration is correct."};default:return{rationale:`${r} URL(s) flagged in category: ${e}`,recommendedAction:"Review flagged URLs and determine appropriate action."}}}function P(e,t=5){let r=new Map;for(let i of e)r.has(i.category)||r.set(i.category,[]),r.get(i.category).push(i);let s=[];for(let[i,m]of r.entries()){let l=Array.from(new Set(m.map(f=>f.url))),d=m.reduce((f,w)=>{let h=["low","medium","high"];return h.indexOf(w.severity)>h.indexOf(f)?w.severity:f},"low"),p=l.slice(0,t),{rationale:u,recommendedAction:g}=Re(i,d,l.length);s.push({category:i,severity:d,count:l.length,rationale:u,sampleUrls:p,recommendedAction:g,allUrls:l})}s.sort((i,m)=>{let l=["high","medium","low"];return l.indexOf(i.severity)-l.indexOf(m.severity)});let n=new Set(e.map(i=>i.url)).size,a=s.filter(i=>i.severity==="high").reduce((i,m)=>i+m.count,0),c=s.filter(i=>i.severity==="medium").reduce((i,m)=>i+m.count,0),o=s.filter(i=>i.severity==="low").reduce((i,m)=>i+m.count,0);return{groups:s,totalRiskUrls:n,highSeverityCount:a,mediumSeverityCount:c,lowSeverityCount:o}}async function Y(e,t,r){let s=Date.now(),n=[],a=V(t),c=[...H,...B,...W,...Q,...J,a],o=[];if(r.acceptedPatterns&&r.acceptedPatterns.length>0)for(let p of r.acceptedPatterns)try{let u=p.replace(/[.+?^${}()|[\]\\]/g,"\\$&").replace(/\*/g,"[^/]*");!u.endsWith("$")&&!u.includes("(?:")&&(u=u+"(?:/|$|\\?|#)"),o.push(new RegExp(u,"i"))}catch{r.verbose&&console.warn(`Invalid accepted pattern: ${p}`)}if(r.verbose){console.log(`
17
- Analyzing ${e.length} URLs for risk patterns...`);try{console.log(`Base domain: ${new URL(t).hostname}`)}catch{console.log(`Base URL: ${t}`)}o.length>0&&console.log(`Accepted patterns: ${o.length}`)}let i;try{i=new URL(t).protocol}catch{r.verbose&&console.warn(`Invalid base URL: ${t}, defaulting to https:`),i="https:"}let m=0;for(let p of e){let u=p.loc;m++,(m%1e4===0||m===e.length)&&process.stdout.write(`\r\x1B[K Analyzing: ${m.toLocaleString()}/${e.length.toLocaleString()} URLs...`);let g=!1;for(let f of o)if(f.test(u)){g=!0;break}if(!g)for(let f of c)if(f.category==="protocol_inconsistency")try{let w=new URL(u).protocol;i==="https:"&&w==="http:"&&n.push({url:u,category:f.category,severity:f.severity,pattern:f.name,rationale:f.description,matchedValue:"http://"})}catch{r.verbose&&console.warn(`Skipping invalid URL: ${u}`);continue}else try{let w=u.match(f.regex);w&&n.push({url:f.category==="sensitive_params"?K(u):u,category:f.category,severity:f.severity,pattern:f.name,rationale:f.description,matchedValue:w[0]})}catch(w){r.verbose&&console.error(`Pattern matching failed for ${f.name}: ${w instanceof Error?w.message:String(w)}`);continue}}e.length>=1e4&&process.stdout.write("\r\x1B[K");let l=P(n),d=Date.now()-s;if(r.verbose&&(console.log(`
18
- Risk Summary:`),console.log(` - Total URLs analyzed: ${e.length}`),console.log(` - Risk URLs found: ${l.totalRiskUrls}`),console.log(` - HIGH severity: ${l.highSeverityCount}`),console.log(` - MEDIUM severity: ${l.mediumSeverityCount}`),console.log(` - LOW severity: ${l.lowSeverityCount}`),console.log(` - Processing time: ${d}ms`),l.groups.length>0)){console.log(`
19
- Risk Categories Found:`);for(let p of l.groups)console.log(` - ${p.category}: ${p.count} URLs (${p.severity.toUpperCase()})`)}return{findings:n,groups:l.groups,totalUrlsAnalyzed:e.length,riskUrlCount:l.totalRiskUrls,cleanUrlCount:e.length-l.totalRiskUrls,highSeverityCount:l.highSeverityCount,mediumSeverityCount:l.mediumSeverityCount,lowSeverityCount:l.lowSeverityCount,processingTimeMs:d}}function X(e){let t={high:0,medium:0,low:0},r=e.riskGroups.map(c=>{t[c.severity]+=c.count;let o=c.allUrls||c.sampleUrls;return{category:c.category,count:c.count,severity:c.severity,summary:c.rationale,examples:o.slice(0,3),allUrls:o}}),s=e.riskGroups.reduce((c,o)=>c+o.count,0),n=s>0?`Found ${s} potentially risky URLs across ${e.riskGroups.length} categories in ${e.totalUrls} total URLs.`:`Analyzed ${e.totalUrls} URLs. No suspicious patterns detected.`,a=[];return t.high>0&&a.push(`${t.high} high-severity issues require immediate attention`),t.medium>0&&a.push(`${t.medium} medium-severity issues should be reviewed`),t.low>0&&a.push(`${t.low} low-severity items flagged for awareness`),{overview:n,keyFindings:a,categoryInsights:r,severityBreakdown:t,recommendations:[],generatedBy:"rule-based analysis",metadata:{tokensUsed:0,processingTime:e.processingTime||0,model:"pattern-matching"}}}var Se="1.0.0-alpha.0";function Z(e,t,r,s,n,a,c={}){let{pretty:o=!0,indent:i=2}=c,m=ke(e,t,r,s,n,a),l=$e(m);return o?JSON.stringify(l,null,i):JSON.stringify(l)}function ke(e,t,r,s,n,a){let 
c=Ue(n.baseUrl||"unknown",a,e),o=s.map(d=>({category:d.category,severity:d.severity,count:d.count,pattern:d.category,rationale:d.rationale,sampleUrls:d.sampleUrls.slice(0,5),recommendedAction:d.recommendedAction})),i={highSeverityCount:e.severityBreakdown.high,mediumSeverityCount:e.severityBreakdown.medium,lowSeverityCount:e.severityBreakdown.low,totalRiskyUrls:s.reduce((d,p)=>d+p.count,0),overallStatus:Ce(e.severityBreakdown,r.errors)},m={overview:e.overview,keyFindings:e.keyFindings,recommendations:e.recommendations},l=r.errors.map(Ae);return{analysisMetadata:c,sitemapsDiscovered:t.sitemaps,totalUrlCount:r.totalCount,urlsAnalyzed:r.totalCount,suspiciousGroups:o,riskSummary:m,summary:i,errors:l}}function Ue(e,t,r){return{baseUrl:e,analysisTimestamp:new Date().toISOString(),toolVersion:Se,executionTimeMs:Date.now()-t,analysisType:r.generatedBy}}function Ce(e,t){return t.length>0?"errors":e.high+e.medium+e.low>0?"issues_found":"clean"}function $e(e){return{analysis_metadata:Pe(e.analysisMetadata),sitemaps_discovered:e.sitemapsDiscovered,total_url_count:e.totalUrlCount,urls_analyzed:e.urlsAnalyzed,suspicious_groups:e.suspiciousGroups.map(Te),risk_summary:Ee(e.riskSummary),summary:_e(e.summary),errors:e.errors}}function Pe(e){return{base_url:e.baseUrl,analysis_timestamp:e.analysisTimestamp,tool_version:e.toolVersion,execution_time_ms:e.executionTimeMs,analysis_type:e.analysisType}}function Te(e){return{category:e.category,severity:e.severity,count:e.count,pattern:e.pattern,rationale:e.rationale,sample_urls:e.sampleUrls,recommended_action:e.recommendedAction}}function Ee(e){return{overview:e.overview,key_findings:e.keyFindings,recommendations:e.recommendations}}function _e(e){return{high_severity_count:e.highSeverityCount,medium_severity_count:e.mediumSeverityCount,low_severity_count:e.lowSeverityCount,total_risky_urls:e.totalRiskyUrls,overall_status:e.overallStatus}}function Ae(e){if("code"in e){let 
t=e,r={code:t.code||"UNKNOWN_ERROR",message:e.message};return"attemptedPaths"in t?r.context={attempted_paths:t.attemptedPaths}:"sitemapUrl"in t&&"lineNumber"in t?r.context={sitemap_url:t.sitemapUrl,line_number:t.lineNumber}:"url"in t&&(r.context={url:t.url}),r}return{code:"UNKNOWN_ERROR",message:e.message}}import{promises as Le}from"fs";var Ie="1.0.0-alpha.0";function De(e,t,r,s,n,a={}){let c=a.maxUrlsPerGroup??10,o=new Date().toISOString(),i=e.categoryInsights.reduce((u,g)=>u+g.count,0),m=e.categoryInsights.filter(u=>u.severity==="high"),l=e.categoryInsights.filter(u=>u.severity==="medium"),d=e.categoryInsights.filter(u=>u.severity==="low");return`<!DOCTYPE html>
2
+
3
+ // src/index.ts
4
+ import "dotenv/config";
5
+ import { Command as Command2 } from "commander";
6
+
7
+ // src/commands/analyze.ts
8
+ import { Command } from "commander";
9
+ import { promises as fs2 } from "fs";
10
+ import ora from "ora";
11
+ import chalk from "chalk";
12
+ import cliProgress from "cli-progress";
13
+ import os2 from "os";
14
+
15
+ // src/config/config-loader.ts
16
+ import { readFile } from "fs/promises";
17
+ import { existsSync } from "fs";
18
+ import { join } from "path";
19
+ import { homedir } from "os";
20
+
21
+ // src/types/config.ts
22
+ var DEFAULT_CONFIG = { // Baseline configuration; loadConfig starts from a shallow copy of this object.
23
+ timeout: 30, // Seconds; validateConfig rejects values outside the 1-300 range.
24
+ concurrency: 10,
25
+ parsingConcurrency: 50, // NOTE(review): here and below, a standalone comment appears to describe the property above it — confirm.
26
+ // Optimized for network-bound parallel parsing
27
+ discoveryConcurrency: 50,
28
+ // Optimized for recursive sitemap index discovery
29
+ outputFormat: "html", // validateConfig accepts only "json" or "html".
30
+ outputDir: "./sitemap-qa/report",
31
+ verbose: false,
32
+ baseUrl: "https://example.com",
33
+ // Default for tests
34
+ acceptedPatterns: [],
35
+ riskDetectionBatchSize: 1e4,
36
+ riskDetectionConcurrency: void 0, // void 0 evaluates to undefined, i.e. no explicit value is set here.
37
+ // Auto-detect in risk-detector.ts
38
+ progressBar: void 0,
39
+ // Auto-detect TTY
40
+ silent: false,
41
+ benchmark: false
42
+ };
43
+
44
+ // src/config/config-loader.ts
45
/**
 * Resolve the effective configuration for a run.
 *
 * Layers are applied from lowest to highest precedence: built-in defaults,
 * ~/.sitemap-qa/config.json, ./.sitemap-qa.config.json, environment
 * variables, then CLI options. The merged result is validated before it is
 * returned.
 *
 * @param {object} cliOptions - Parsed command-line options.
 * @returns {Promise<object>} The merged, validated configuration.
 * @throws {Error} If the merged configuration fails validation.
 */
async function loadConfig(cliOptions) {
  // Reads one optional JSON file. A missing file contributes nothing; an
  // unreadable or malformed file is reported with a warning and then skipped.
  const readLayer = async (filePath, warningPrefix) => {
    if (!existsSync(filePath)) {
      return void 0;
    }
    try {
      return JSON.parse(await readFile(filePath, "utf-8"));
    } catch (error) {
      console.warn(`${warningPrefix}${error}`);
      return void 0;
    }
  };

  const defaults = { ...DEFAULT_CONFIG };
  const globalLayer = await readLayer(
    join(homedir(), ".sitemap-qa", "config.json"),
    "Warning: Failed to load global config: "
  );
  const projectLayer = await readLayer(
    join(process.cwd(), ".sitemap-qa.config.json"),
    "Warning: Failed to load project config: "
  );
  const envLayer = loadFromEnv();

  // Later spreads win, which is what gives each layer its precedence.
  const fileAndEnv = { ...defaults, ...globalLayer, ...projectLayer, ...envLayer };
  const config = mergeCliOptions(fileAndEnv, cliOptions);

  // baseUrl is applied here rather than inside mergeCliOptions.
  if (cliOptions.baseUrl) {
    config.baseUrl = cliOptions.baseUrl;
  }

  validateConfig(config);
  return config;
}
74
/**
 * Collects configuration overrides from environment variables.
 * Only SITEMAP_VERIFY_TIMEOUT is consulted; it is parsed as a base-10 integer.
 *
 * @returns {{timeout?: number}} An object holding the overrides that were set.
 */
function loadFromEnv() {
  const rawTimeout = process.env.SITEMAP_VERIFY_TIMEOUT;
  return rawTimeout ? { timeout: parseInt(rawTimeout, 10) } : {};
}
81
/**
 * Returns a copy of `config` with the CLI options that were supplied applied on top.
 *
 * @param {object} config - Configuration assembled from lower-priority sources.
 * @param {object} cliOptions - Parsed command-line options (string values).
 * @returns {object} A new configuration object; `config` itself is not modified.
 */
function mergeCliOptions(config, cliOptions) {
  const { timeout, output, outputDir, verbose, acceptedPatterns } = cliOptions;
  const overrides = {};

  // A timeout of "30" is treated as "not specified" and never overrides
  // a value coming from a config file or the environment.
  if (timeout && timeout !== "30") {
    overrides.timeout = parseInt(timeout, 10);
  }
  if (output) {
    overrides.outputFormat = output;
  }
  if (outputDir) {
    overrides.outputDir = outputDir;
  }
  if (verbose === true) {
    overrides.verbose = true;
  }
  if (acceptedPatterns) {
    // Comma-separated list; blank items are dropped.
    overrides.acceptedPatterns = acceptedPatterns
      .split(",")
      .map((pattern) => pattern.trim())
      .filter(Boolean);
  }

  return { ...config, ...overrides };
}
100
/**
 * Validates the merged configuration.
 *
 * @param {object} config - Configuration to check.
 * @throws {Error} When the timeout is not a number between 1 and 300 seconds,
 *   or when the output format is neither "json" nor "html".
 */
function validateConfig(config) {
  // Coerce first so that NaN / undefined (e.g. parseInt of a non-numeric
  // value) are rejected; a bare range comparison lets NaN through because
  // every comparison with NaN is false.
  const timeoutSeconds = Number(config.timeout);
  if (!Number.isFinite(timeoutSeconds) || timeoutSeconds < 1 || timeoutSeconds > 300) {
    throw new Error("Timeout must be between 1 and 300 seconds");
  }
  if (!["json", "html"].includes(config.outputFormat)) {
    throw new Error("Output format must be json or html");
  }
}
108
+
109
+ // src/errors/network-errors.ts
110
/**
 * Error raised when a request fails before an HTTP status is available.
 * Carries the requested URL and the underlying error.
 */
var NetworkError = class extends Error {
  // Stable discriminator that callers check instead of relying on instanceof.
  code = "NETWORK_ERROR";

  /**
   * @param {string} url - URL whose request failed.
   * @param {Error} originalError - Underlying failure; its message is embedded.
   */
  constructor(url, originalError) {
    const reason = originalError.message;
    super(`Network request failed for ${url}: ${reason}`);
    this.url = url;
    this.originalError = originalError;
    this.name = "NetworkError";
  }
};
119
/**
 * Error raised for an HTTP response whose status is treated as a failure.
 * A 403 gets an extra hint about bot protection appended to the message.
 */
var HttpError = class extends Error {
  // Stable discriminator that callers check instead of relying on instanceof.
  code = "HTTP_ERROR";

  /**
   * @param {string} url - URL that produced the response.
   * @param {number} statusCode - HTTP status code.
   * @param {string} [statusText] - Optional status text.
   */
  constructor(url, statusCode, statusText) {
    const forbiddenHint =
      statusCode === 403
        ? "\n Note: 403 Forbidden often indicates bot protection (Cloudflare, etc.) or access restrictions"
        : "";
    super(`HTTP ${statusCode} error for ${url}${forbiddenHint}`);
    this.url = url;
    this.statusCode = statusCode;
    this.statusText = statusText;
    this.name = "HttpError";
  }
};
133
+
134
+ // src/utils/http-client.ts
135
+ import { chromium } from "playwright";
136
+ import axios from "axios";
137
+ import { Agent as HttpAgent } from "http";
138
+ import { Agent as HttpsAgent } from "https";
139
// Keep-alive agents shared by every axios request so sockets are reused.
var httpAgent = new HttpAgent({
  keepAlive: true,
  // Allow many concurrent connections.
  maxSockets: 200,
  maxFreeSockets: 50,
  timeout: 15_000
});
var httpsAgent = new HttpsAgent({
  keepAlive: true,
  maxSockets: 200,
  maxFreeSockets: 50,
  timeout: 15_000
});

// Shared axios client. Every status code resolves (nothing is thrown for
// 4xx/5xx), so callers inspect `response.status` themselves.
var axiosInstance = axios.create({
  httpAgent,
  httpsAgent,
  maxRedirects: 5,
  validateStatus: (_status) => true
});
159
/**
 * Fetches a URL through headless Chromium (Playwright), sending browser-like
 * headers and masking a few automation markers.
 *
 * @param {string} url - Address to load.
 * @param {number} timeout - Timeout in seconds (converted to milliseconds below).
 * @returns {Promise<{content: string, statusCode: number, url: string}>} The
 *   page markup, the HTTP status and the page URL after navigation, for 2xx responses.
 * @throws {HttpError} When the response status is outside 200-299.
 * @throws {NetworkError} For any other failure (launch, navigation, missing response).
 */
async function fetchUrlWithBrowser(url, timeout) {
  let browser;
  try {
    browser = await chromium.launch({
      headless: true,
      args: [
        // Hide automation flags
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
        "--no-sandbox"
      ]
    });
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      viewport: { width: 1920, height: 1080 },
      locale: "en-US",
      timezoneId: "America/New_York",
      extraHTTPHeaders: {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
      }
    });
    const page = await context.newPage();
    await page.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", {
        get: () => false
      });
      window.chrome = {
        runtime: {}
      };
      // Bind the native method to its owner: calling it detached from
      // `navigator.permissions` throws "Illegal invocation".
      const permissions = window.navigator.permissions;
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters) =>
        parameters.name === "notifications"
          ? Promise.resolve({ state: Notification.permission })
          : originalQuery(parameters);
    });
    page.setDefaultTimeout(timeout * 1e3);
    const response = await page.goto(url, {
      // domcontentloaded rather than networkidle: faster for simple XML
      waitUntil: "domcontentloaded",
      timeout: timeout * 1e3
    });
    if (!response) {
      throw new Error("No response received from page");
    }
    const statusCode = response.status();
    const content = await page.content();
    const finalUrl = page.url();
    if (statusCode >= 200 && statusCode < 300) {
      return {
        content,
        statusCode,
        url: finalUrl
      };
    }
    throw new HttpError(finalUrl, statusCode);
  } catch (error) {
    if (error?.code === "HTTP_ERROR") {
      throw error;
    }
    throw new NetworkError(url, error);
  } finally {
    // Single cleanup point for every exit path. Closing is best-effort: a
    // failure here must not replace the result or the error produced above.
    if (browser) {
      await browser.close().catch(() => {});
    }
  }
}
227
/**
 * Fetches a URL over HTTP with retries, optionally falling back to a headless
 * browser when the server answers 403.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {object} [options]
 * @param {number} [options.timeout=30] - Per-request timeout in seconds.
 * @param {number} [options.maxRetries=3] - Extra attempts after the first one.
 * @param {number} [options.retryDelay=1000] - Base delay in ms, doubled after each attempt.
 * @param {boolean} [options.useBrowser=false] - Always fetch through the browser.
 * @param {boolean} [options.disableBrowserFallback=false] - Never switch to the browser on 403.
 * @returns {Promise<{content: string, statusCode: number, url: string}>} Body,
 *   status and final URL of a 2xx response.
 * @throws {TypeError} When `url` is not a valid absolute URL.
 * @throws {HttpError} For non-retryable statuses, or a retryable status once attempts run out.
 * @throws {NetworkError} When the request itself keeps failing.
 */
async function fetchUrl(url, options = {}) {
  const {
    timeout = 30,
    maxRetries = 3,
    retryDelay = 1e3,
    useBrowser = false,
    disableBrowserFallback = false
  } = options;
  // Validation only: throws before any request is made if the URL is malformed.
  new URL(url);
  const retryableStatuses = [408, 429, 500, 502, 503, 504];
  let lastError = null;
  let attemptedBrowser = false;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      if (useBrowser || attemptedBrowser) {
        return await fetchUrlWithBrowser(url, timeout);
      }
      const response = await axiosInstance.get(url, {
        timeout: timeout * 1e3,
        headers: {
          "User-Agent": "sitemap-qa/1.0.0 (compatible; +https://github.com/Akotliar/sitemap-qa)",
          "Accept": "text/xml,application/xml,text/plain,*/*",
          "Accept-Encoding": "gzip, deflate",
          "Connection": "keep-alive"
        }
      });
      const statusCode = response.status;
      const body = response.data;
      // Final URL after redirects, when the transport exposes it.
      const finalUrl = response.request?.res?.responseUrl || url;
      if (statusCode >= 200 && statusCode < 300) {
        return {
          content: typeof body === "string" ? body : JSON.stringify(body),
          statusCode,
          url: finalUrl
        };
      }
      if (statusCode === 403 && !disableBrowserFallback) {
        // Fall back within the same attempt. Spending a retry on the switch
        // meant that with maxRetries = 0 the loop ended without ever trying
        // the browser and the function threw `null`.
        attemptedBrowser = true;
        return await fetchUrlWithBrowser(url, timeout);
      }
      const httpError = new HttpError(finalUrl, statusCode);
      if (!retryableStatuses.includes(statusCode)) {
        throw httpError;
      }
      lastError = httpError;
    } catch (error) {
      if (error?.code === "HTTP_ERROR") {
        if (!retryableStatuses.includes(error.statusCode)) {
          throw error;
        }
        lastError = error;
      } else if (error instanceof NetworkError) {
        // Already wrapped by the browser path; do not wrap a second time.
        lastError = error;
      } else {
        lastError = new NetworkError(url, error);
      }
    }
    if (attempt < maxRetries) {
      // Exponential backoff: retryDelay, 2x, 4x, ...
      const delay = retryDelay * Math.pow(2, attempt);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
  // lastError is only unset when no attempt ran (negative maxRetries).
  throw lastError ?? new NetworkError(url, new Error("No request attempt was made"));
}
290
+
291
+ // src/core/discovery.ts
292
/**
 * Reduces a URL to its origin (scheme, host and port), dropping path, query and hash.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The URL's origin.
 * @throws {TypeError} When `url` cannot be parsed.
 */
function normalizeBaseUrl(url) {
  const { origin } = new URL(url);
  return origin;
}
296
/**
 * Probes the "www" / non-"www" twin of the base URL's host by requesting its
 * robots.txt, and reports which hostname to treat as canonical.
 *
 * @param {string} baseUrl - Absolute URL of the site.
 * @param {object} config - Configuration; only `timeout` is read.
 * @returns {Promise<string>} The twin hostname when the probe reports status
 *   200 or 404; otherwise (including any fetch failure) the original hostname.
 */
async function detectCanonicalDomain(baseUrl, config) {
  const { protocol, hostname } = new URL(baseUrl);
  const alternateHostname = hostname.startsWith("www.")
    ? hostname.substring(4)
    : `www.${hostname}`;
  const probeUrl = `${protocol}//${alternateHostname}/robots.txt`;
  try {
    const { statusCode } = await fetchUrl(probeUrl, {
      timeout: config.timeout,
      maxRetries: 1
    });
    return statusCode === 200 || statusCode === 404 ? alternateHostname : hostname;
  } catch {
    // Every failure, HTTP or network, keeps the original hostname.
    return hostname;
  }
}
317
/**
 * Probes the conventional sitemap locations on the site's origin in parallel.
 *
 * @param {string} baseUrl - Absolute URL of the site.
 * @param {object} config - Configuration; `timeout` and `verbose` are read.
 * @returns {Promise<{sitemaps: string[], issues: object[]}>} At most one sitemap
 *   URL (the first hit in path order) plus the 401/403 responses encountered.
 */
async function tryStandardPaths(baseUrl, config) {
  const { origin } = new URL(baseUrl);
  const accessIssues = [];
  const candidatePaths = ["/sitemap.xml", "/sitemap_index.xml", "/sitemap-index.xml"];

  // Probes one candidate; never rejects, failures resolve to { found: false }.
  const probe = async (path) => {
    const sitemapUrl = `${origin}${path}`;
    try {
      // No retries on standard paths - fail fast.
      const result = await fetchUrl(sitemapUrl, {
        timeout: config.timeout,
        maxRetries: 0
      });
      if (result.statusCode !== 200) {
        return { found: false };
      }
      if (config.verbose) {
        console.log(`\u2713 Found sitemap at: ${sitemapUrl}`);
      }
      return { found: true, url: sitemapUrl };
    } catch (error) {
      if (!(error instanceof HttpError)) {
        if (config.verbose) {
          console.log(`\u2717 Not found: ${sitemapUrl}`);
        }
        return { found: false };
      }
      const { statusCode } = error;
      if (statusCode === 401 || statusCode === 403) {
        accessIssues.push({
          url: sitemapUrl,
          statusCode,
          error: statusCode === 401 ? "Unauthorized" : "Access Denied"
        });
        if (config.verbose) {
          console.log(`\u26A0 Access denied: ${sitemapUrl} (${statusCode})`);
        }
      } else if (config.verbose) {
        console.log(`\u2717 Not found: ${sitemapUrl} (${statusCode})`);
      }
      return { found: false };
    }
  };

  const outcomes = await Promise.allSettled(candidatePaths.map(probe));
  const hit = outcomes.find(
    (outcome) => outcome.status === "fulfilled" && outcome.value.found
  );
  if (hit) {
    return { sitemaps: [hit.value.url], issues: accessIssues };
  }
  if (config.verbose) {
    console.log("No sitemap found at standard paths");
  }
  return { sitemaps: [], issues: accessIssues };
}
372
/**
 * Downloads robots.txt from the site's origin and returns the absolute URLs
 * listed in its `Sitemap:` directives.
 *
 * @param {string} baseUrl - Absolute URL of the site.
 * @param {object} config - Configuration; `timeout` and `verbose` are read.
 * @returns {Promise<string[]>} Sitemap URLs in file order; empty when robots.txt
 *   cannot be fetched or declares none.
 */
async function parseRobotsTxt(baseUrl, config) {
  const robotsUrl = `${new URL(baseUrl).origin}/robots.txt`;
  try {
    const result = await fetchUrl(robotsUrl, {
      timeout: config.timeout,
      maxRetries: 1
    });
    const sitemaps = [];
    // Accept both LF and CRLF files. With a plain "\n" split the trailing
    // "\r" stays on the line, and since "." does not match "\r" the
    // directive pattern never matched any line of a CRLF file.
    for (const line of result.content.split(/\r?\n/)) {
      // Trimming also tolerates indented directives.
      const match = line.trim().match(/^Sitemap:\s*(.+)$/i);
      if (!match) {
        continue;
      }
      const sitemapUrl = match[1].trim();
      try {
        // Validation only: relative or malformed values are skipped.
        new URL(sitemapUrl);
        sitemaps.push(sitemapUrl);
      } catch {
        if (config.verbose) {
          console.warn(`Invalid sitemap URL in robots.txt: ${sitemapUrl}`);
        }
      }
    }
    if (config.verbose && sitemaps.length > 0) {
      console.log(`Found ${sitemaps.length} sitemap(s) in robots.txt`);
    }
    return sitemaps;
  } catch (error) {
    // Best effort: any fetch failure is treated as "no robots.txt".
    if (config.verbose) {
      console.log(`No robots.txt found at ${robotsUrl}`);
    }
    return [];
  }
}
406
/**
 * Decides whether a sitemap document should be treated as an index of other sitemaps.
 *
 * A document containing `<sitemapindex` always qualifies. A `<urlset>` qualifies
 * when more than half of its first five `<loc>` values look like sitemap
 * references (they contain "sitemap" or end in ".xml").
 *
 * @param {string} xmlContent - Raw sitemap XML.
 * @returns {boolean} True when the document should be expanded as an index.
 */
function isSitemapIndex(xmlContent) {
  if (xmlContent.includes("<sitemapindex")) {
    return true;
  }
  if (!xmlContent.includes("<urlset")) {
    return false;
  }
  const SAMPLE_SIZE = 5;
  const urlBlockRegex = /<url[^>]*>.*?<loc>([^<]+)<\/loc>.*?<\/url>/gs;
  let sampled = 0;
  let sitemapLikeCount = 0;
  // matchAll is lazy: stop after the sample so a very large sitemap is not
  // scanned end-to-end just to inspect its first few entries.
  for (const match of xmlContent.matchAll(urlBlockRegex)) {
    const url = match[1].trim().toLowerCase();
    if (url.includes("sitemap") || url.endsWith(".xml")) {
      sitemapLikeCount++;
    }
    sampled++;
    if (sampled === SAMPLE_SIZE) {
      break;
    }
  }
  return sitemapLikeCount > sampled / 2;
}
425
/**
 * Pulls child sitemap URLs out of an index document.
 *
 * For a `<sitemapindex>` every `<sitemap>` block's `<loc>` is taken. For any
 * other document the `<url>` blocks are scanned and only `<loc>` values that
 * contain "sitemap" or end in ".xml" are kept. Values that are not absolute
 * URLs are dropped without notice.
 *
 * @param {string} xmlContent - Raw sitemap XML.
 * @returns {string[]} Child sitemap URLs in document order.
 */
function extractSitemapIndexUrls(xmlContent) {
  const isIndexDocument = xmlContent.includes("<sitemapindex");
  const blockRegex = isIndexDocument
    ? /<sitemap[^>]*>(.*?)<\/sitemap>/gs
    : /<url[^>]*>(.*?)<\/url>/gs;

  const collected = [];
  for (const [, blockBody] of xmlContent.matchAll(blockRegex)) {
    const locMatch = /<loc>([^<]+)<\/loc>/i.exec(blockBody);
    if (!locMatch) {
      continue;
    }
    const candidate = locMatch[1].trim();
    if (!isIndexDocument) {
      const lowered = candidate.toLowerCase();
      if (!lowered.includes("sitemap") && !lowered.endsWith(".xml")) {
        continue;
      }
    }
    try {
      // Validation only: throws for anything that is not an absolute URL.
      new URL(candidate);
      collected.push(candidate);
    } catch {
    }
  }
  return collected;
}
460
/**
 * Expands a list of sitemap URLs into the full set of leaf sitemaps by
 * following sitemap indexes, processing the queue in parallel batches.
 *
 * @param {string[]} initialSitemaps - Sitemap URLs to start from.
 * @param {object} config - Configuration; `timeout`, `verbose` and
 *   `discoveryConcurrency` (batch size, default 50) are read.
 * @param {string} baseUrl - Site URL, used when probing for the canonical domain.
 * @param {string} [canonicalDomain] - Canonical hostname, when already known.
 * @param {number} [_maxDepth=10] - Accepted but not used.
 * @returns {Promise<{sitemaps: string[], canonicalDomain: (string|undefined)}>}
 *   Leaf sitemap URLs and the canonical hostname (probed only in verbose mode
 *   after a 301, otherwise whatever was passed in).
 */
async function discoverAllSitemaps(initialSitemaps, config, baseUrl, canonicalDomain, _maxDepth = 10) {
  const finalSitemaps = [];
  const queue = [...initialSitemaps];
  const processed = /* @__PURE__ */ new Set();
  const failed = /* @__PURE__ */ new Set();
  const redirected = /* @__PURE__ */ new Set();
  let detectedCanonical = canonicalDomain;
  const batchSize = config.discoveryConcurrency || 50;

  // Prints guidance for a sitemap URL that answered 301 (verbose mode only).
  const explainRedirect = async (sitemapUrl, error) => {
    if (!detectedCanonical) {
      detectedCanonical = await detectCanonicalDomain(baseUrl, config);
      console.log(`Canonical domain detected: ${detectedCanonical}`);
    }
    try {
      const sitemapUrlObj = new URL(sitemapUrl);
      console.warn(`\u26A0\uFE0F Sitemap URL redirects (301): ${sitemapUrl}`);
      if (sitemapUrlObj.hostname !== detectedCanonical) {
        console.warn(` Problem: The sitemap index contains a URL that redirects.`);
        console.warn(` Likely issue: Domain mismatch - expected "${detectedCanonical}" but got "${sitemapUrlObj.hostname}"`);
        console.warn(` Fix: Update sitemap index to use "https://${detectedCanonical}${sitemapUrlObj.pathname}"`);
      } else {
        console.warn(` Fix: Update the sitemap index to reference the final URL after redirect.`);
      }
    } catch {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
    }
  };

  // Fetches one sitemap URL and classifies it as index / sitemap / redirect /
  // failed / skip. Bookkeeping sets are updated as a side effect.
  const inspect = async (sitemapUrl) => {
    if (processed.has(sitemapUrl)) {
      if (config.verbose) {
        console.warn(`Skipping duplicate sitemap: ${sitemapUrl}`);
      }
      return { type: "skip" };
    }
    processed.add(sitemapUrl);
    try {
      const result = await fetchUrl(sitemapUrl, {
        timeout: config.timeout,
        maxRetries: 2
      });
      if (!isSitemapIndex(result.content)) {
        if (config.verbose) {
          console.log(`\u2713 Discovered sitemap: ${sitemapUrl}`);
        }
        return { type: "sitemap", url: sitemapUrl };
      }
      if (config.verbose) {
        console.log(`Found sitemap index: ${sitemapUrl}`);
      }
      const childUrls = extractSitemapIndexUrls(result.content);
      if (config.verbose) {
        console.log(` \u2514\u2500 Contains ${childUrls.length} child sitemap(s)`);
      }
      return { type: "index", childUrls };
    } catch (error) {
      const isRedirect = error instanceof HttpError && error.statusCode === 301;
      if (!isRedirect) {
        failed.add(sitemapUrl);
        if (config.verbose) {
          const message = error instanceof Error ? error.message : String(error);
          console.warn(`Failed to fetch sitemap ${sitemapUrl}: ${message}`);
        }
        return { type: "failed" };
      }
      redirected.add(sitemapUrl);
      if (config.verbose) {
        await explainRedirect(sitemapUrl, error);
      }
      return { type: "redirect" };
    }
  };

  // Prints the diagnostic shown when URLs were processed but no leaf sitemap was reachable.
  const reportNothingFound = () => {
    const totalProcessed = processed.size;
    const totalFailed = failed.size;
    const totalRedirected = redirected.size;
    // Whatever was processed and is neither leaf, failure nor redirect was an index.
    const sitemapIndexCount = totalProcessed - finalSitemaps.length - totalFailed - totalRedirected;

    console.warn(`\n\u26A0\uFE0F SITEMAP DISCOVERY ISSUE`);
    if (sitemapIndexCount > 0 && (totalFailed > 0 || totalRedirected > 0)) {
      console.warn(`Found ${sitemapIndexCount} sitemap index(es) containing ${totalFailed + totalRedirected} child sitemap(s):`);
      if (totalRedirected > 0) {
        console.warn(` - ${totalRedirected} sitemap(s) return 301 redirects (content not accessible without following redirect)`);
      }
      if (totalFailed > 0) {
        console.warn(` - ${totalFailed} sitemap(s) returned errors (404, 403, 500, or network issues)`);
      }
    } else if (totalRedirected > 0) {
      console.warn(`All ${totalRedirected} sitemap(s) return 301 redirects.`);
    } else if (totalFailed > 0) {
      console.warn(`All ${totalFailed} sitemap(s) returned errors.`);
      console.warn(`\nCommon causes:`);
      console.warn(` - 403 Forbidden: Bot protection (Cloudflare, etc.) or IP blocking`);
      console.warn(` - 404 Not Found: Sitemaps don't exist at these URLs`);
      console.warn(` - 500/502/503: Server errors or maintenance`);
      console.warn(`\nIf sitemaps work in your browser but not here, the site likely has bot protection.`);
      console.warn(`Try: Check if sitemaps load without JavaScript, or contact site administrator.`);
    } else {
      console.warn(`Processed ${totalProcessed} URL(s) but found no accessible sitemaps.`);
    }
    console.warn(`\nNote: This tool does not follow redirects for sitemap URLs.`);
    if (totalRedirected > 0) {
      console.warn(`\nPossible causes of redirects:`);
      console.warn(` - Sitemap index uses non-canonical domain (e.g., missing 'www' or vice versa)`);
      console.warn(` - Sitemap URLs redirect from HTTP to HTTPS`);
      console.warn(` - Intentional redirects in your site configuration`);
      console.warn(`\nRecommendation: Update sitemap index URLs to match the final destination (no redirects).`);
    }
    console.warn(``);
  };

  while (queue.length > 0) {
    const batch = queue.splice(0, batchSize);
    const outcomes = await Promise.all(batch.map(inspect));
    for (const outcome of outcomes) {
      if (outcome.type === "index") {
        // Children go to the back of the queue for a later batch.
        queue.push(...outcome.childUrls);
      } else if (outcome.type === "sitemap") {
        finalSitemaps.push(outcome.url);
      }
    }
    if (processed.size > 1e3) {
      console.warn(`\u26A0\uFE0F Processed over 1000 sitemap URLs. Stopping to prevent excessive requests.`);
      break;
    }
  }

  if (finalSitemaps.length === 0 && processed.size > 0) {
    reportNothingFound();
  }
  return { sitemaps: finalSitemaps, canonicalDomain: detectedCanonical };
}
592
/**
 * Locates a site's sitemaps: first via robots.txt, then via the standard
 * sitemap paths, and finally by retrying both against a canonical host when
 * the standard-path probe reports one.
 *
 * @param {string} baseUrl - Absolute URL of the site.
 * @param {object} config - Configuration passed through to the helpers.
 * @returns {Promise<{sitemaps: string[], source: string, accessIssues: object[],
 *   canonicalDomain: (string|undefined)}>} `source` is "robots-txt",
 *   "standard-path" or "none"; access issues are only reported for "none".
 */
async function discoverSitemaps(baseUrl, config) {
  const siteOrigin = normalizeBaseUrl(baseUrl);

  if (config.verbose) {
    console.log("Strategy 1: Checking robots.txt for sitemap directives...");
  }
  const fromRobots = await parseRobotsTxt(siteOrigin, config);
  if (fromRobots.length > 0) {
    const expanded = await discoverAllSitemaps(fromRobots, config, siteOrigin, undefined);
    return {
      sitemaps: expanded.sitemaps,
      source: "robots-txt",
      // Access issues are dropped once working sitemaps were found.
      accessIssues: [],
      canonicalDomain: expanded.canonicalDomain
    };
  }

  if (config.verbose) {
    console.log("Strategy 2: Trying standard sitemap paths...");
  }
  const standard = await tryStandardPaths(siteOrigin, config);
  if (standard.sitemaps.length > 0) {
    const expanded = await discoverAllSitemaps(standard.sitemaps, config, siteOrigin, undefined);
    return {
      sitemaps: expanded.sitemaps,
      source: "standard-path",
      // Access issues are dropped once working sitemaps were found.
      accessIssues: [],
      canonicalDomain: expanded.canonicalDomain
    };
  }

  // NOTE(review): tryStandardPaths, as defined in this file, returns only
  // `sitemaps` and `issues`, so this value is undefined and the retry below
  // does not run unless that helper starts reporting a canonical host.
  const canonicalHost = standard.redirectedToCanonical;
  if (canonicalHost) {
    const canonicalUrl = `https://${canonicalHost}`;
    console.log(`\n\u{1F4A1} All requests redirected. Retrying with canonical domain: ${canonicalHost}\n`);

    const canonicalRobots = await parseRobotsTxt(canonicalUrl, config);
    if (canonicalRobots.length > 0) {
      const expanded = await discoverAllSitemaps(canonicalRobots, config, canonicalUrl, canonicalHost);
      return {
        sitemaps: expanded.sitemaps,
        source: "robots-txt",
        accessIssues: [],
        canonicalDomain: expanded.canonicalDomain || canonicalHost
      };
    }

    const canonicalStandard = await tryStandardPaths(canonicalUrl, config);
    if (canonicalStandard.sitemaps.length > 0) {
      const expanded = await discoverAllSitemaps(canonicalStandard.sitemaps, config, canonicalUrl, canonicalHost);
      return {
        sitemaps: expanded.sitemaps,
        source: "standard-path",
        accessIssues: [],
        canonicalDomain: expanded.canonicalDomain || canonicalHost
      };
    }
  }

  return {
    sitemaps: [],
    source: "none",
    accessIssues: standard.issues,
    canonicalDomain: undefined
  };
}
660
+
661
+ // src/core/parser.ts
662
+ import { XMLParser, XMLValidator } from "fast-xml-parser";
663
// The <changefreq> values accepted by parseSitemap(); membership is checked
// against the lower-cased value.
var VALID_CHANGEFREQ = /* @__PURE__ */ new Set(
  ["always", "hourly", "daily", "weekly", "monthly", "yearly", "never"]
);

// Shared XML parser instance used for every sitemap document.
var parser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  textNodeName: "_text",
  parseAttributeValue: true,
  trimValues: true,
  allowBooleanAttributes: true,
  // Tag values stay strings so they can be validated explicitly later.
  parseTagValue: false
});
682
/**
 * Converts a parsed `<urlset>` document into a flat list of URL entries.
 *
 * @param {object} parsedXml - Parser output for one sitemap document.
 * @param {string} sitemapUrl - URL of the sitemap, recorded as each entry's `source`.
 * @returns {Array<{loc: string, lastmod: *, changefreq: *, priority: (number|undefined),
 *   source: string}>} One entry per `<url>` node that has a `<loc>`; empty
 *   when the document has no `urlset`.
 */
function extractUrls(parsedXml, sitemapUrl) {
  const urlset = parsedXml.urlset;
  if (!urlset) {
    return [];
  }
  // A single <url> child arrives as an object instead of an array.
  const urlNodes = Array.isArray(urlset.url) ? urlset.url : [urlset.url];
  const urls = [];
  for (const node of urlNodes) {
    if (!node || !node.loc) {
      continue;
    }
    const priority = node.priority ? parseFloat(node.priority) : undefined;
    urls.push({
      loc: node.loc,
      lastmod: node.lastmod,
      changefreq: node.changefreq,
      // A non-numeric <priority> parses to NaN, which passes every range
      // comparison downstream; treat it as "no priority given".
      priority: Number.isNaN(priority) ? undefined : priority,
      source: sitemapUrl
    });
  }
  return urls;
}
702
/**
 * Validates and parses one sitemap document, then sanity-checks every entry.
 *
 * Entries whose `loc` is not an absolute URL are dropped with an error.
 * Priorities outside 0-1 are clamped and unknown changefreq values are
 * cleared, each with an error recorded.
 *
 * @param {string} xml - Raw sitemap XML.
 * @param {string} sitemapUrl - URL the XML came from (used in messages and as source).
 * @returns {Promise<{urls: object[], errors: string[], totalCount: number,
 *   sitemapUrl: string}>} Never rejects: parse failures are reported through `errors`.
 */
async function parseSitemap(xml, sitemapUrl) {
  // Result shape shared by both "could not parse" exits.
  const parsingFailed = (reason) => ({
    urls: [],
    errors: [
      `[${sitemapUrl}] XML parsing failed: ${reason}`
    ],
    totalCount: 0,
    sitemapUrl
  });

  try {
    const verdict = XMLValidator.validate(xml);
    if (verdict !== true) {
      return parsingFailed(typeof verdict === "object" ? verdict.err.msg : "Invalid XML");
    }

    const errors = [];
    const validUrls = [];
    for (const entry of extractUrls(parser.parse(xml), sitemapUrl)) {
      try {
        // Validation only: throws when loc is not an absolute URL.
        new URL(entry.loc);

        const hasPriority = entry.priority !== undefined;
        if (hasPriority && (entry.priority < 0 || entry.priority > 1)) {
          errors.push(
            `Invalid priority ${entry.priority} for ${entry.loc} - clamping to 0-1`
          );
          entry.priority = Math.max(0, Math.min(1, entry.priority));
        }

        if (entry.changefreq && !VALID_CHANGEFREQ.has(entry.changefreq.toLowerCase())) {
          errors.push(
            `Invalid changefreq "${entry.changefreq}" for ${entry.loc}`
          );
          entry.changefreq = undefined;
        }

        validUrls.push(entry);
      } catch (urlError) {
        errors.push(`Invalid URL format: ${entry.loc}`);
      }
    }

    return {
      urls: validUrls,
      errors,
      totalCount: validUrls.length,
      sitemapUrl
    };
  } catch (parseError) {
    return parsingFailed(parseError instanceof Error ? parseError.message : String(parseError));
  }
}
762
+
763
+ // src/utils/batch-processor.ts
764
/**
 * Splits an array into consecutive slices of at most `chunkSize` elements.
 *
 * @param {Array} array - Items to split; not modified.
 * @param {number} chunkSize - Maximum slice length; must be a positive integer.
 * @returns {Array<Array>} The slices in order; empty for an empty input.
 * @throws {RangeError} When `chunkSize` is not a positive integer (a zero or
 *   negative step would otherwise never advance and loop forever).
 */
function chunkArray(array, chunkSize) {
  if (!Number.isInteger(chunkSize) || chunkSize < 1) {
    throw new RangeError(`chunkSize must be a positive integer, received ${chunkSize}`);
  }
  const chunks = [];
  for (let start = 0; start < array.length; start += chunkSize) {
    chunks.push(array.slice(start, start + chunkSize));
  }
  return chunks;
}
771
/**
 * Runs `processor` over every item with a bounded number of concurrent
 * workers, preserving input order in the result.
 *
 * @param {Array} items - Items to process.
 * @param {number} concurrency - Maximum simultaneous workers. Values below 1
 *   or non-numeric values fall back to 1 so that no item is ever skipped.
 * @param {(item: *) => Promise<*>} processor - Async handler for one item.
 * @param {(completed: number, total: number) => void} [onProgress] - Called after each item.
 * @returns {Promise<Array>} Results aligned with `items`; an item whose
 *   processor threw yields `null`.
 */
async function processInBatches(items, concurrency, processor, onProgress) {
  const results = new Array(items.length);
  const errors = [];
  let completed = 0;
  let nextIndex = 0;

  // Zero workers would return an array of holes, and a fractional count
  // makes Array() throw, so normalize to a whole number of at least 1.
  const requested = Math.floor(concurrency);
  const workerCount = Math.min(requested >= 1 ? requested : 1, items.length);

  // Each worker keeps pulling the next unclaimed index until none remain.
  const runWorker = async () => {
    while (nextIndex < items.length) {
      const index = nextIndex++;
      try {
        results[index] = await processor(items[index]);
      } catch (error) {
        errors.push({ index, error });
        results[index] = null;
      }
      completed++;
      if (onProgress) {
        onProgress(completed, items.length);
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, runWorker));
  if (errors.length > 0) {
    console.warn(`Processed ${items.length} items with ${errors.length} errors`);
  }
  return results;
}
798
+
799
+ // src/core/extractor.ts
800
/**
 * Downloads and parses every sitemap, collecting all URL entries and errors.
 *
 * @param {string[]} sitemapUrls - Leaf sitemap URLs to read.
 * @param {object} config - Configuration; `verbose`, `silent` and
 *   `parsingConcurrency` (default 50) are read.
 * @param {(completed: number, total: number) => void} [onProgress] - Forwarded
 *   to the batch processor.
 * @returns {Promise<{allUrls: object[], sitemapsProcessed: number,
 *   sitemapsFailed: number, totalUrls: number, errors: string[]}>} Aggregate
 *   result; a sitemap that cannot be fetched counts as failed and contributes
 *   one error message.
 */
async function extractAllUrls(sitemapUrls, config, onProgress) {
  const allUrls = [];
  const allErrors = [];
  let sitemapsProcessed = 0;
  let sitemapsFailed = 0;
  if (config.verbose) {
    console.log(`\nExtracting URLs from ${sitemapUrls.length} sitemap(s)...`);
  }
  const CONCURRENCY = config.parsingConcurrency || 50;
  if (!config.silent && config.verbose) {
    console.log(`Using parsing concurrency: ${CONCURRENCY}`);
  }
  const results = await processInBatches(
    sitemapUrls,
    CONCURRENCY,
    async (sitemapUrl) => {
      try {
        if (config.verbose) {
          console.log(`Extracting URLs from: ${sitemapUrl}`);
        }
        const response = await fetchUrl(sitemapUrl, {
          // Fast timeout for sitemaps
          timeout: 10,
          // No retries - fail fast
          maxRetries: 0,
          // Don't use browser for bulk parsing
          disableBrowserFallback: true
        });
        const parseResult = await parseSitemap(response.content, sitemapUrl);
        // One timestamp per sitemap, stamped on each of its entries.
        const extractedAt = (/* @__PURE__ */ new Date()).toISOString();
        parseResult.urls.forEach((url) => {
          url.extractedAt = extractedAt;
        });
        if (config.verbose) {
          console.log(` \u2713 Extracted ${parseResult.urls.length} URLs from ${sitemapUrl}`);
        }
        return {
          success: true,
          urls: parseResult.urls,
          errors: parseResult.errors
        };
      } catch (error) {
        const errorMsg = `Failed to process ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`;
        if (config.verbose) {
          console.error(` \u2717 ${errorMsg}`);
        }
        return {
          success: false,
          urls: [],
          errors: [errorMsg]
        };
      }
    },
    // Pass progress callback to batch processor
    onProgress
  );
  for (const result of results) {
    if (result.success) {
      sitemapsProcessed++;
      // Append one by one: spreading a very large array into push() passes
      // every element as a call argument and can exceed the engine's limit.
      for (const url of result.urls) {
        allUrls.push(url);
      }
    } else {
      sitemapsFailed++;
    }
    for (const message of result.errors) {
      allErrors.push(message);
    }
  }
  if (config.verbose) {
    console.log(`\nExtraction complete:`);
    console.log(` - Sitemaps processed: ${sitemapsProcessed}`);
    console.log(` - Sitemaps failed: ${sitemapsFailed}`);
    console.log(` - Total URLs: ${allUrls.length}`);
    console.log(` - Errors: ${allErrors.length}`);
  }
  return {
    allUrls,
    sitemapsProcessed,
    sitemapsFailed,
    totalUrls: allUrls.length,
    errors: allErrors
  };
}
882
+
883
+ // src/core/consolidator.ts
884
/**
 * Produces a canonical form of a URL for duplicate detection: the trailing
 * slash is removed (except for the root path) and query parameters are sorted
 * by name. The hash is kept.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} The normalized URL, or the input unchanged when it cannot be parsed.
 */
function normalizeUrl(url) {
  try {
    const { protocol, host, pathname, searchParams, hash } = new URL(url);
    const path = pathname !== "/" && pathname.endsWith("/")
      ? pathname.slice(0, -1)
      : pathname;
    const sortedEntries = [...searchParams.entries()].sort(
      ([nameA], [nameB]) => nameA.localeCompare(nameB)
    );
    const query = new URLSearchParams(sortedEntries).toString();
    const querySuffix = query ? "?" + query : "";
    return `${protocol}//${host}${path}${querySuffix}${hash}`;
  } catch {
    return url;
  }
}
900
/**
 * Collapses several entries for the same URL into one.
 *
 * The result starts as a copy of the first entry, then: `source` lists every
 * source joined by ", "; `lastmod` and `extractedAt` become the most recent
 * parseable date (as an ISO string); `priority` is the highest numeric value;
 * `changefreq` is the most frequent value (first seen wins a tie).
 *
 * @param {object[]} entries - Non-empty list of entries for one URL.
 * @returns {object} The single entry itself when only one was given, otherwise a new merged entry.
 */
function mergeUrlEntries(entries) {
  if (entries.length === 1) return entries[0];

  // Latest parseable date among `values` as an ISO string, or undefined.
  // Unparseable dates become NaN and are skipped: converting NaN back with
  // toISOString() throws RangeError and would abort the whole merge.
  const latestIso = (values) => {
    let latest = -Infinity;
    for (const value of values) {
      if (!value) continue;
      const stamp = new Date(value).getTime();
      if (Number.isFinite(stamp) && stamp > latest) {
        latest = stamp;
      }
    }
    return latest === -Infinity ? undefined : new Date(latest).toISOString();
  };

  const merged = { ...entries[0] };
  merged.source = entries.map((e) => e.source).join(", ");

  const lastmod = latestIso(entries.map((e) => e.lastmod));
  if (lastmod !== undefined) {
    merged.lastmod = lastmod;
  }

  // NaN priorities are ignored; a single NaN would turn Math.max into NaN.
  const priorities = entries
    .map((e) => e.priority)
    .filter((p) => p !== undefined && !Number.isNaN(p));
  if (priorities.length > 0) {
    merged.priority = Math.max(...priorities);
  }

  const changefreqs = entries.map((e) => e.changefreq).filter((cf) => !!cf);
  if (changefreqs.length > 0) {
    const counts = /* @__PURE__ */ new Map();
    for (const cf of changefreqs) {
      counts.set(cf, (counts.get(cf) || 0) + 1);
    }
    // The sort is stable, so equally frequent values keep first-seen order.
    const [[mostCommon]] = Array.from(counts.entries()).sort((a, b) => b[1] - a[1]);
    merged.changefreq = mostCommon;
  }

  const extractedAt = latestIso(entries.map((e) => e.extractedAt));
  if (extractedAt !== undefined) {
    merged.extractedAt = extractedAt;
  }
  return merged;
}
928
/**
 * Deduplicates URL entries by their normalized form and merges duplicates.
 *
 * @param {object[]} urls - Entries collected from all sitemaps.
 * @param {boolean} [verbose=false] - Print a summary, including the five most
 *   duplicated URLs.
 * @returns {{uniqueUrls: object[], totalInputUrls: number, duplicatesRemoved: number,
 *   duplicateGroups: Array<{url: string, count: number, sources: string[]}>}}
 *   Unique entries and duplicate groups, both in first-seen order.
 */
function consolidateUrls(urls, verbose = false) {
  const totalInputUrls = urls.length;
  if (verbose) {
    console.log(`\nConsolidating ${urls.length} URL(s)...`);
  }

  // Group entries by normalized URL; a Map keeps first-seen order.
  const urlMap = /* @__PURE__ */ new Map();
  for (const entry of urls) {
    const normalized = normalizeUrl(entry.loc);
    const group = urlMap.get(normalized);
    if (group) {
      group.push(entry);
    } else {
      urlMap.set(normalized, [entry]);
    }
  }

  const uniqueUrls = [];
  const duplicateGroups = [];
  for (const [normalized, entries] of urlMap.entries()) {
    uniqueUrls.push(mergeUrlEntries(entries));
    if (entries.length > 1) {
      duplicateGroups.push({
        url: normalized,
        count: entries.length,
        sources: entries.map((e) => e.source)
      });
    }
  }

  const duplicatesRemoved = totalInputUrls - uniqueUrls.length;
  if (verbose) {
    console.log(`Consolidation complete:`);
    console.log(` - Input URLs: ${totalInputUrls}`);
    console.log(` - Unique URLs: ${uniqueUrls.length}`);
    console.log(` - Duplicates removed: ${duplicatesRemoved}`);
    if (duplicateGroups.length > 0) {
      console.log(`\nTop duplicates:`);
      // Sort a copy: sorting in place made the order of the returned
      // duplicateGroups depend on whether verbose output was enabled.
      const top5 = [...duplicateGroups].sort((a, b) => b.count - a.count).slice(0, 5);
      for (const group of top5) {
        console.log(` - ${group.url} (${group.count} times)`);
      }
    }
  }
  return {
    uniqueUrls,
    totalInputUrls,
    duplicatesRemoved,
    duplicateGroups
  };
}
976
+
977
+ // src/core/patterns/risk-patterns.ts
978
// URL risk rules. Each rule carries a display name, a category, a severity
// and the regular expression that flags a URL.
// Environment-leakage rules live in domain-patterns.ts and admin-path rules in
// admin-patterns.ts, to avoid duplication.
var RISK_PATTERNS = [
  // --- Sensitive query parameters ---
  {
    name: "Authentication Parameter",
    category: "sensitive_params",
    severity: "high",
    regex: /[?&](token|auth|key|password|secret|apikey|session|credentials)=/i,
    description: "Query parameter may contain sensitive authentication data"
  },
  {
    name: "Debug Parameter",
    category: "sensitive_params",
    severity: "medium",
    regex: /[?&](debug|trace|verbose|test_mode)=/i,
    description: "Query parameter may contain debug or diagnostic flag"
  },

  // --- Protocol inconsistency ---
  {
    name: "HTTP in HTTPS Site",
    category: "protocol_inconsistency",
    severity: "medium",
    regex: /^http:\/\//,
    description: "HTTP URL in HTTPS sitemap (potential mixed content)"
  },

  // --- Test / unfinished content ---
  // Matches only a whole path segment or a "<word>-" segment prefix, to keep
  // legitimate content from being flagged.
  {
    name: "Test Content Path",
    category: "test_content",
    severity: "medium",
    regex: /\/(?:test-|demo-|sample-|temp-|temporary-|placeholder-)|\/(test|demo|sample|temp|temporary|placeholder)(?:\/|$)/i,
    description: "URL path suggests test, demo, or unfinished content that may not be intended for indexing"
  }
];
1015
+
1016
+ // src/core/patterns/domain-patterns.ts
1017
/**
 * Backslash-escapes every regular-expression metacharacter in `str` so the result can be
 * embedded in a pattern and match the text literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeRegex(str) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metacharacters, (character) => `\\${character}`);
}
1020
/**
 * Derives the registrable ("root") domain from a hostname.
 *
 * - IP literals (dotted IPv4, bracketed IPv6) are returned unchanged; they have no root
 *   domain, and slicing labels would turn `127.0.0.1` into `0.1`.
 * - Hostnames under a common two-label country suffix (`co.uk`, `com.au`, ...) keep three
 *   labels. This is a heuristic, not a full Public Suffix List lookup.
 * - Everything else keeps its last two labels; single-label hosts are returned as-is.
 *
 * @param {string} hostname - Hostname without scheme, port or path.
 * @returns {string} The root domain, or `hostname` itself when none can be derived.
 */
function extractRootDomain(hostname) {
  const isIpv4 = /^\d{1,3}(?:\.\d{1,3}){3}$/.test(hostname);
  if (isIpv4 || hostname.startsWith("[")) {
    return hostname;
  }

  const labels = hostname.split(".");
  if (labels.length < 2) {
    return hostname;
  }

  // Second-level labels that registries commonly place under a two-letter country code.
  const secondLevelLabels = new Set(["co", "com", "net", "org", "gov", "edu", "ac"]);
  const topLevel = labels[labels.length - 1];
  const secondLevel = labels[labels.length - 2].toLowerCase();
  const hasTwoLabelSuffix =
    labels.length >= 3 && topLevel.length === 2 && secondLevelLabels.has(secondLevel);

  return labels.slice(hasTwoLabelSuffix ? -3 : -2).join(".");
}
1027
/**
 * Builds the risk pattern that flags URLs whose host is outside the expected domain.
 *
 * The regex matches (i.e. flags) any http/https URL whose host is NOT one of:
 *   - the exact hostname of `baseUrl` (so a base such as `blog.example.com` is never
 *     reported against itself),
 *   - the root domain of that hostname,
 *   - the root domain prefixed by an allowed subdomain: each entry of
 *     `options.allowedSubdomains` when that list is non-empty, otherwise `www`.
 *
 * A host counts as matched only when it is followed by `:`, `/`, `?`, `#` or the end of
 * the string, so ports and query-only URLs are recognized while look-alike hosts such as
 * `example.com.evil.test` are still flagged. Matching is case-insensitive.
 *
 * @param {string} baseUrl - Absolute URL of the site under analysis.
 * @param {{allowedSubdomains?: string[]}} [options] - Optional subdomain allow-list.
 * @returns {{name: string, category: string, severity: string, regex: RegExp, description: string}}
 * @throws {TypeError} When `baseUrl` cannot be parsed by `new URL`.
 */
function createDomainMismatchPattern(baseUrl, options) {
  const baseDomain = new URL(baseUrl).hostname;
  const rootDomain = extractRootDomain(baseDomain);

  const allowList = options?.allowedSubdomains || [];
  const hasAllowList = allowList.length > 0;
  const subdomains = hasAllowList ? allowList : ["www"];
  const subdomainAlternation = subdomains.map((label) => escapeRegex(label)).join("|");

  const acceptedHosts = [
    escapeRegex(baseDomain),
    `(?:(?:${subdomainAlternation})\\.)?${escapeRegex(rootDomain)}`
  ].join("|");
  const pattern = `^https?://(?!(?:${acceptedHosts})(?:[:/?#]|$))`;

  return {
    name: "Domain Mismatch",
    category: "domain_mismatch",
    severity: "high",
    regex: new RegExp(pattern, "i"),
    description: hasAllowList
      ? `URL does not match expected domain or allowed subdomains`
      : `URL does not match expected domain: ${rootDomain} (including www variant)`
  };
}
1052
// Patterns that flag URLs pointing at non-production environments.
// All are high severity and reported under the "environment_leakage" category.
var ENVIRONMENT_PATTERNS = [
  {
    name: "Staging Subdomain",
    category: "environment_leakage",
    severity: "high",
    regex: /^https?:\/\/(staging|stg)\./i,
    description: "URL uses staging subdomain"
  },
  {
    name: "Development Subdomain",
    category: "environment_leakage",
    severity: "high",
    regex: /^https?:\/\/(dev|development)\./i,
    description: "URL uses development subdomain"
  },
  {
    name: "QA/Test Subdomain",
    category: "environment_leakage",
    severity: "high",
    regex: /^https?:\/\/(qa|test|uat|preprod)\./i,
    description: "URL uses test environment subdomain"
  },
  {
    name: "Localhost URL",
    category: "environment_leakage",
    severity: "high",
    // The host must end right after the loopback name/address (optionally followed by a
    // port); otherwise public hosts that merely start with "localhost" or "127.0.0.1"
    // (e.g. localhostel.com, 127.0.0.10) would be reported. Case-insensitive like the
    // other patterns in this list.
    regex: /^https?:\/\/(localhost|127\.0\.0\.1|0\.0\.0\.0)(?::\d+)?(?:[/?#]|$)/i,
    description: "URL points to localhost (development environment)"
  },
  {
    name: "Environment in Path",
    category: "environment_leakage",
    severity: "high",
    regex: /^https?:\/\/[^/]+\/(staging|dev|qa|uat|preprod)\//i,
    description: "URL path contains environment identifier at root level"
  }
];
1089
+
1090
+ // src/core/patterns/admin-patterns.ts
1091
// Patterns that flag administrative path segments. Every entry shares the "admin_paths"
// category and "high" severity, so rows only carry [name, regex, description].
var ADMIN_PATH_PATTERNS = [
  [
    "Admin Path",
    /\/(admin|administrator)(?:\/|$|\?)/i,
    "URL contains /admin or /administrator as a path segment"
  ],
  [
    "Dashboard Path",
    /\/dashboard(?:\/|$|\?)/i,
    "URL contains /dashboard as a path segment"
  ],
  [
    "Config Path",
    /\/(config|configuration)(?:\/|$|\?)/i,
    "URL contains /config or /configuration as a path segment"
  ],
  [
    "Console Path",
    /\/console(?:\/|$|\?)/i,
    "URL contains /console as a path segment"
  ],
  [
    "Control Panel Path",
    /\/(cpanel|control-panel)(?:\/|$|\?)/i,
    "URL contains control panel as a path segment"
  ]
].map(([name, regex, description]) => ({
  name,
  category: "admin_paths",
  severity: "high",
  regex,
  description
}));
1128
// Patterns that flag content under an "/internal" path. Rows are
// [name, regex, description]; category and severity are the same for every entry.
var INTERNAL_CONTENT_PATTERNS = [
  [
    "Internal Content Path",
    /\/internal\b/i,
    "URL contains /internal path segment - may be internal-only content not intended for public indexing"
  ]
].map(([name, regex, description]) => ({
  name,
  category: "internal_content",
  severity: "medium",
  regex,
  description
}));
1137
// Patterns that flag sensitive or diagnostic query parameters. Every entry is reported
// under the "sensitive_params" category; rows are [name, severity, regex, description].
var SENSITIVE_PARAM_PATTERNS = [
  [
    "Authentication Token Parameter",
    "high",
    /[?&](token|auth_token|access_token|api_token)=/i,
    "Query parameter may contain authentication token"
  ],
  [
    "API Key Parameter",
    "high",
    /[?&](apikey|api_key|key)=/i,
    "Query parameter may contain API key"
  ],
  [
    "Password Parameter",
    "high",
    /[?&](password|passwd|pwd)=/i,
    "Query parameter may contain password"
  ],
  [
    "Secret Parameter",
    "high",
    /[?&](secret|client_secret)=/i,
    "Query parameter may contain secret value"
  ],
  [
    "Session Parameter",
    "high",
    /[?&](session|sessionid|sid)=/i,
    "Query parameter may contain session identifier"
  ],
  [
    "Credentials Parameter",
    "high",
    /[?&]credentials=/i,
    "Query parameter may contain credentials"
  ],
  [
    "Debug Parameter",
    "medium",
    /[?&](debug|trace|verbose)=/i,
    "Query parameter contains debug or diagnostic flag"
  ],
  [
    "Test Mode Parameter",
    "medium",
    /[?&](test_mode|test|testing)=/i,
    "Query parameter indicates test mode"
  ]
].map(([name, severity, regex, description]) => ({
  name,
  category: "sensitive_params",
  severity,
  regex,
  description
}));
1195
+
1196
+ // src/utils/sanitizer.ts
1197
/**
 * Replaces the values of sensitive query parameters with "[REDACTED]" so the URL can be
 * shown in reports.
 *
 * Parameter names are compared case-insensitively: the detection patterns that flag
 * these URLs use the `i` flag, so `?TOKEN=...` must be redacted just like `?token=...`.
 * Repeated occurrences of a sensitive parameter collapse into a single redacted one
 * (behavior of `URLSearchParams.set`).
 *
 * @param {string} url - URL to sanitize.
 * @returns {string} The serialized URL with sensitive values redacted, or `url` unchanged
 *   when it cannot be parsed by `new URL`.
 */
function sanitizeUrl(url) {
  const sensitiveParams = new Set([
    "token",
    "auth",
    "auth_token",
    "access_token",
    "api_token",
    "apikey",
    "api_key",
    "key",
    "password",
    "passwd",
    "pwd",
    "secret",
    "client_secret",
    "session",
    "sessionid",
    "sid",
    "credentials"
  ]);

  try {
    const parsed = new URL(url);
    // Snapshot the distinct names first; the parameter list is modified while redacting.
    const presentNames = new Set(parsed.searchParams.keys());
    for (const name of presentNames) {
      if (sensitiveParams.has(name.toLowerCase())) {
        parsed.searchParams.set(name, "[REDACTED]");
      }
    }
    return parsed.toString();
  } catch {
    // Unparseable input is returned as-is rather than failing the whole report.
    return url;
  }
}
1229
+
1230
+ // src/core/risk-grouper.ts
1231
/**
 * Produces the explanatory text attached to a risk category.
 *
 * @param {string} category - Risk category identifier (e.g. "admin_paths").
 * @param {string} _severity - Accepted for signature compatibility; not used.
 * @param {number} count - Number of URLs in the category, interpolated into the rationale.
 * @returns {{rationale: string, recommendedAction: string}} Category-specific text, or a
 *   generic message naming the category when it is not one of the known ones.
 */
function generateRecommendation(category, _severity, count) {
  // category -> [rationale, recommended action]
  const knownCategories = new Map([
    [
      "environment_leakage",
      [
        `Production sitemap contains ${count} URL(s) from non-production environments (staging, dev, QA, test). This indicates configuration errors or environment leakage.`,
        "Verify sitemap generation excludes non-production environments. Review deployment configuration and environment filtering rules."
      ]
    ],
    [
      "admin_paths",
      [
        `${count} administrative path(s) detected in public sitemap (admin, dashboard, config). These paths may expose privileged access points.`,
        "Confirm if admin paths should be publicly indexed. Consider excluding via robots.txt or removing from sitemap. Verify access controls."
      ]
    ],
    [
      "internal_content",
      [
        `${count} URL(s) contain "internal" in the path. These may be internal-facing content not intended for public indexing.`,
        "Review URLs to determine if they should be publicly accessible. Consider excluding internal content from sitemap or adding noindex meta tags."
      ]
    ],
    [
      "test_content",
      [
        `${count} URL(s) contain test/demo/sample identifiers. These may be placeholder or unfinished content not intended for indexing.`,
        "Review and remove test content from production sitemaps. Verify content is production-ready before including in sitemap."
      ]
    ],
    [
      "sensitive_params",
      [
        `${count} URL(s) contain sensitive query parameters (token, auth, key, password, session). This may expose authentication credentials or debugging flags.`,
        "Review why sensitive parameters are in sitemap URLs. Remove authentication tokens from URLs. Consider POST requests for sensitive data."
      ]
    ],
    [
      "protocol_inconsistency",
      [
        `${count} URL(s) use HTTP protocol in HTTPS sitemap. This creates mixed content warnings and potential security issues.`,
        "Update URLs to use HTTPS consistently. Verify SSL certificate coverage. Check for hardcoded HTTP URLs in content."
      ]
    ],
    [
      "domain_mismatch",
      [
        `${count} URL(s) do not match expected base domain. This may indicate external links, CDN URLs, or configuration errors.`,
        "Verify if external domains are intentional. Review sitemap generation logic. Confirm CDN or subdomain configuration is correct."
      ]
    ]
  ]);

  const known = knownCategories.get(category);
  if (known === undefined) {
    return {
      rationale: `${count} URL(s) flagged in category: ${category}`,
      recommendedAction: "Review flagged URLs and determine appropriate action."
    };
  }

  const [rationale, recommendedAction] = known;
  return { rationale, recommendedAction };
}
1275
/**
 * Groups individual findings by category and derives per-group and overall statistics.
 *
 * Each group lists the distinct URLs of its findings, takes the highest severity seen
 * among them (starting from "low"), and carries the text from `generateRecommendation`.
 * Groups are ordered high, medium, low; groups of equal severity keep first-seen order.
 *
 * @param {Array<{url: string, category: string, severity: string}>} findings
 * @param {number} [maxSampleUrls=5] - Maximum number of URLs copied into `sampleUrls`.
 * @returns {{groups: Array<object>, totalRiskUrls: number, highSeverityCount: number, mediumSeverityCount: number, lowSeverityCount: number}}
 *   `totalRiskUrls` counts distinct `url` values across all findings; the per-severity
 *   counts sum `count` over the groups of that severity.
 */
function groupRiskFindings(findings, maxSampleUrls = 5) {
  const ascendingSeverity = ["low", "medium", "high"];
  const displayOrder = ["high", "medium", "low"];

  const findingsByCategory = new Map();
  for (const finding of findings) {
    const bucket = findingsByCategory.get(finding.category);
    if (bucket === undefined) {
      findingsByCategory.set(finding.category, [finding]);
    } else {
      bucket.push(finding);
    }
  }

  const groups = Array.from(findingsByCategory, ([category, bucket]) => {
    const allUrls = [...new Set(bucket.map((item) => item.url))];

    let severity = "low";
    for (const { severity: candidate } of bucket) {
      if (ascendingSeverity.indexOf(candidate) > ascendingSeverity.indexOf(severity)) {
        severity = candidate;
      }
    }

    const { rationale, recommendedAction } = generateRecommendation(category, severity, allUrls.length);
    return {
      category,
      severity,
      count: allUrls.length,
      rationale,
      sampleUrls: allUrls.slice(0, maxSampleUrls),
      recommendedAction,
      allUrls
    };
  });

  groups.sort((left, right) => displayOrder.indexOf(left.severity) - displayOrder.indexOf(right.severity));

  // Sum of `count` over the groups that carry the given severity.
  const urlCountFor = (level) => {
    let total = 0;
    for (const group of groups) {
      if (group.severity === level) {
        total += group.count;
      }
    }
    return total;
  };

  return {
    groups,
    totalRiskUrls: new Set(findings.map((item) => item.url)).size,
    highSeverityCount: urlCountFor("high"),
    mediumSeverityCount: urlCountFor("medium"),
    lowSeverityCount: urlCountFor("low")
  };
}
1318
+
1319
+ // src/core/risk-detector.ts
1320
+ import os from "os";
1321
/**
 * Compiles the configured "accepted" glob patterns into case-insensitive regular
 * expressions; URLs matching any of them are excluded from risk detection.
 *
 * Pattern syntax:
 *   - `*` matches any run of characters other than `/`;
 *   - a trailing `$` anchors the pattern to the end of the URL;
 *   - every other character is matched literally;
 *   - without a trailing `$`, the match must be followed by `/`, `?`, `#` or the end of
 *     the URL, so `/blog` does not accept `/blogger`.
 *
 * Entries that are not strings, or that are empty once trimmed (including a bare `$`),
 * are skipped: compiled as-is they would accept every URL and silently disable
 * detection. Skipped entries are reported when `config.verbose` is set.
 *
 * @param {{acceptedPatterns?: string[], verbose?: boolean}} config
 * @returns {RegExp[]} One regex per usable pattern, in configuration order.
 */
function compileAcceptedPatterns(config) {
  const compiled = [];
  const configured = Array.isArray(config.acceptedPatterns) ? config.acceptedPatterns : [];

  for (const pattern of configured) {
    const glob = typeof pattern === "string" ? pattern.trim() : "";
    const anchored = glob.endsWith("$");
    const body = anchored ? glob.slice(0, -1) : glob;

    if (body === "") {
      if (config.verbose) {
        console.warn(`Invalid accepted pattern: ${pattern}`);
      }
      continue;
    }

    // Everything except `*` is escaped, so constructing the RegExp below cannot throw.
    const escaped = body.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, "[^/]*");
    const terminator = anchored ? "$" : "(?:/|$|\\?|#)";
    compiled.push(new RegExp(escaped + terminator, "i"));
  }

  return compiled;
}
1340
/**
 * Checks one batch of URL entries against every risk pattern.
 *
 * A URL matching any accepted pattern is skipped entirely. For the remaining URLs:
 *   - patterns in the "protocol_inconsistency" category are not regex-tested; a finding
 *     is recorded when `expectedProtocol` is "https:" and the URL parses as "http:";
 *   - every other pattern is applied with `String.prototype.match`, and the matched text
 *     becomes `matchedValue`. URLs reported under "sensitive_params" are passed through
 *     `sanitizeUrl` first.
 * Failures while evaluating a single pattern are contained to that pattern.
 *
 * @param {Array<{loc: string}>} urls - Entries of the batch.
 * @param {Array<{name: string, category: string, severity: string, regex: RegExp, description: string}>} allPatterns
 * @param {RegExp[]} acceptedPatterns - URLs matching any of these are not analyzed.
 * @param {string} expectedProtocol - Protocol of the base URL, e.g. "https:".
 * @param {boolean} verbose - When true, pattern-matching failures are logged to stderr.
 * @returns {Promise<{findings: Array<object>, urlsProcessed: number}>}
 */
async function detectRisksInBatch(urls, allPatterns, acceptedPatterns, expectedProtocol, verbose) {
  const findings = [];
  const record = (reportedUrl, pattern, matchedValue) => {
    findings.push({
      url: reportedUrl,
      category: pattern.category,
      severity: pattern.severity,
      pattern: pattern.name,
      rationale: pattern.description,
      matchedValue
    });
  };

  for (const urlEntry of urls) {
    const url = urlEntry.loc;
    if (acceptedPatterns.some((accepted) => accepted.test(url))) {
      continue;
    }

    for (const pattern of allPatterns) {
      if (pattern.category !== "protocol_inconsistency") {
        try {
          const match = url.match(pattern.regex);
          if (match) {
            const reportedUrl = pattern.category === "sensitive_params" ? sanitizeUrl(url) : url;
            record(reportedUrl, pattern, match[0]);
          }
        } catch (error) {
          if (verbose) {
            console.error(`Pattern matching failed for ${pattern.name}: ${error instanceof Error ? error.message : String(error)}`);
          }
        }
        continue;
      }

      try {
        const urlProtocol = new URL(url).protocol;
        if (expectedProtocol === "https:" && urlProtocol === "http:") {
          record(url, pattern, "http://");
        }
      } catch (error) {
        // An unparseable URL cannot be protocol-checked; move on to the next pattern.
      }
    }
  }

  return { findings, urlsProcessed: urls.length };
}
1393
/**
 * Runs every risk pattern over the sitemap URLs in batches and aggregates the results.
 *
 * The pattern set is the static pattern lists plus a domain-mismatch pattern derived
 * from `baseUrl`. When `baseUrl` cannot be parsed, the domain-mismatch pattern is left
 * out and the expected protocol defaults to "https:" instead of aborting the analysis.
 * Progress is written to stdout after each completed batch and cleared at the end.
 *
 * @param {Array<{loc: string}>} urls - URL entries to analyze.
 * @param {string} baseUrl - Base URL of the site; supplies expected protocol and domain.
 * @param {object} config - Reads `acceptedPatterns` (via `compileAcceptedPatterns`),
 *   `verbose`, `riskDetectionBatchSize` (default 10000) and `riskDetectionConcurrency`
 *   (default: CPU count minus one, at least 2).
 * @returns {Promise<{findings: Array<object>, groups: Array<object>, totalUrlsAnalyzed: number, riskUrlCount: number, cleanUrlCount: number, highSeverityCount: number, mediumSeverityCount: number, lowSeverityCount: number, processingTimeMs: number}>}
 */
async function detectRisks(urls, baseUrl, config) {
  const startTime = Date.now();

  // Parse the base URL once. Building the domain pattern inside the same guard keeps an
  // invalid base URL from throwing before the protocol fallback can take effect.
  let expectedProtocol = "https:";
  let baseHostname = null;
  let domainPattern = null;
  try {
    const parsedBase = new URL(baseUrl);
    expectedProtocol = parsedBase.protocol;
    baseHostname = parsedBase.hostname;
    domainPattern = createDomainMismatchPattern(baseUrl);
  } catch (error) {
    if (config.verbose) {
      console.warn(`Invalid base URL: ${baseUrl}, defaulting to https:`);
    }
  }

  const allPatterns = [
    ...RISK_PATTERNS,
    ...ENVIRONMENT_PATTERNS,
    ...ADMIN_PATH_PATTERNS,
    ...SENSITIVE_PARAM_PATTERNS,
    ...INTERNAL_CONTENT_PATTERNS,
    ...(domainPattern ? [domainPattern] : [])
  ];
  const acceptedPatterns = compileAcceptedPatterns(config);

  const BATCH_SIZE = config.riskDetectionBatchSize || 1e4;
  const CONCURRENCY = config.riskDetectionConcurrency || Math.max(2, os.cpus().length - 1);
  const batches = chunkArray(urls, BATCH_SIZE);

  if (config.verbose) {
    console.log(`\nRisk Detection Configuration:`);
    console.log(` - Total URLs: ${urls.length.toLocaleString()}`);
    console.log(` - Batch size: ${BATCH_SIZE.toLocaleString()}`);
    console.log(` - Concurrency: ${CONCURRENCY}`);
    console.log(` - Total batches: ${batches.length}`);
    if (baseHostname !== null) {
      console.log(` - Base domain: ${baseHostname}`);
    } else {
      console.log(` - Base URL: ${baseUrl}`);
    }
    if (acceptedPatterns.length > 0) {
      console.log(` - Accepted patterns: ${acceptedPatterns.length}`);
    }
  }

  const totalBatches = batches.length;
  const batchStartTime = Date.now();
  const batchResults = await processInBatches(
    batches,
    CONCURRENCY,
    (batch) => detectRisksInBatch(batch, allPatterns, acceptedPatterns, expectedProtocol, config.verbose),
    (completed) => {
      const pct = (completed / totalBatches * 100).toFixed(1);
      const elapsed = (Date.now() - batchStartTime) / 1e3;
      // The final batch may be short, so cap the estimate at the real URL count.
      const urlsProcessed = Math.min(completed * BATCH_SIZE, urls.length);
      // Guard both divisions: a batch can finish within the same millisecond.
      const speed = elapsed > 0 ? Math.round(urlsProcessed / elapsed) : 0;
      const eta = speed > 0 ? Math.round((urls.length - urlsProcessed) / speed) : 0;
      process.stdout.write(
        `\r\x1B[K Analyzing batch ${completed}/${totalBatches} (${pct}%) | ETA: ~${eta}s | ${speed.toLocaleString()} URLs/sec`
      );
    }
  );
  process.stdout.write("\r\x1B[K");

  const allFindings = batchResults.flatMap((r) => r.findings);
  const groupingResult = groupRiskFindings(allFindings);
  const processingTimeMs = Date.now() - startTime;

  if (config.verbose) {
    console.log(`\nRisk Detection Summary:`);
    console.log(` - Total URLs analyzed: ${urls.length.toLocaleString()}`);
    console.log(` - Risk URLs found: ${groupingResult.totalRiskUrls.toLocaleString()}`);
    console.log(` - HIGH severity: ${groupingResult.highSeverityCount}`);
    console.log(` - MEDIUM severity: ${groupingResult.mediumSeverityCount}`);
    console.log(` - LOW severity: ${groupingResult.lowSeverityCount}`);
    console.log(` - Processing time: ${(processingTimeMs / 1e3).toFixed(1)}s`);
    if (groupingResult.groups.length > 0) {
      console.log(`\nRisk Categories Found:`);
      for (const group of groupingResult.groups) {
        console.log(` - ${group.category}: ${group.count} URLs (${group.severity.toUpperCase()})`);
      }
    }
  }

  return {
    findings: allFindings,
    groups: groupingResult.groups,
    totalUrlsAnalyzed: urls.length,
    riskUrlCount: groupingResult.totalRiskUrls,
    cleanUrlCount: urls.length - groupingResult.totalRiskUrls,
    highSeverityCount: groupingResult.highSeverityCount,
    mediumSeverityCount: groupingResult.mediumSeverityCount,
    lowSeverityCount: groupingResult.lowSeverityCount,
    processingTimeMs
  };
}
1486
+
1487
+ // src/summarizer.ts
1488
/**
 * Builds the rule-based summary of grouped risks that the reporters consume.
 *
 * @param {{riskGroups: Array<object>, totalUrls: number, processingTime?: number}} request
 *   Each risk group is read for `category`, `count`, `severity`, `rationale`, and
 *   `allUrls` (falling back to `sampleUrls`).
 * @returns {object} Overview sentence, key findings (one per severity with a non-zero
 *   URL count), per-category insights with up to three example URLs, the per-severity
 *   URL totals, an empty `recommendations` list, and metadata.
 */
function summarizeRisks(request) {
  const { riskGroups } = request;
  const severityBreakdown = { high: 0, medium: 0, low: 0 };
  const categoryInsights = [];
  let totalRisks = 0;

  for (const group of riskGroups) {
    severityBreakdown[group.severity] += group.count;
    totalRisks += group.count;

    const urls = group.allUrls || group.sampleUrls;
    categoryInsights.push({
      category: group.category,
      count: group.count,
      severity: group.severity,
      summary: group.rationale,
      examples: urls.slice(0, 3),
      // Include all URLs for download functionality
      allUrls: urls
    });
  }

  let overview;
  if (totalRisks > 0) {
    overview = `Found ${totalRisks} potentially risky URLs across ${riskGroups.length} categories in ${request.totalUrls} total URLs.`;
  } else {
    overview = `Analyzed ${request.totalUrls} URLs. No suspicious patterns detected.`;
  }

  const findingTemplates = [
    ["high", "high-severity issues require immediate attention"],
    ["medium", "medium-severity issues should be reviewed"],
    ["low", "low-severity items flagged for awareness"]
  ];
  const keyFindings = findingTemplates
    .filter(([level]) => severityBreakdown[level] > 0)
    .map(([level, text]) => `${severityBreakdown[level]} ${text}`);

  return {
    overview,
    keyFindings,
    categoryInsights,
    severityBreakdown,
    recommendations: [],
    generatedBy: "rule-based analysis",
    metadata: {
      tokensUsed: 0,
      processingTime: request.processingTime || 0,
      model: "pattern-matching"
    }
  };
}
1533
+
1534
+ // src/reporters/json-reporter.ts
1535
// Version string embedded in report metadata.
var TOOL_VERSION = "1.0.0-alpha.1";

/**
 * Serializes an analysis into the JSON report format.
 *
 * The inputs are assembled with `buildAnalysisResult`, converted to the output shape by
 * `transformToJsonOutput`, and stringified.
 *
 * @param {object} summary - Risk summary.
 * @param {object} discoveryResult - Sitemap discovery result.
 * @param {object} parseResult - Sitemap parse result.
 * @param {Array<object>} riskGroups - Grouped risk findings.
 * @param {object} config - Tool configuration.
 * @param {number} startTime - Analysis start time in epoch milliseconds.
 * @param {{pretty?: boolean, indent?: number, performanceMetrics?: object}} [options]
 *   `pretty` (default true) selects indented output, `indent` (default 2) its width;
 *   `performanceMetrics` is forwarded to `transformToJsonOutput`.
 * @returns {string} The JSON document.
 */
function generateJsonReport(summary, discoveryResult, parseResult, riskGroups, config, startTime, options = {}) {
  const prettyPrint = options.pretty === undefined ? true : options.pretty;
  const indentWidth = options.indent === undefined ? 2 : options.indent;

  const analysis = buildAnalysisResult(summary, discoveryResult, parseResult, riskGroups, config, startTime);
  const payload = transformToJsonOutput(analysis, options.performanceMetrics);

  return prettyPrint ? JSON.stringify(payload, null, indentWidth) : JSON.stringify(payload);
}
1557
/**
 * Assembles the intermediate (camelCase) analysis result from the pipeline outputs.
 *
 * @param {object} summary - Read for `severityBreakdown`, `overview`, `keyFindings`,
 *   `recommendations`; also passed to `buildAnalysisMetadata`.
 * @param {{sitemaps: Array}} discoveryResult
 * @param {{errors: Array, totalCount: number}} parseResult
 * @param {Array<object>} riskGroups - Grouped findings; at most five sample URLs are kept
 *   per group and the category doubles as the pattern identifier.
 * @param {{baseUrl?: string}} config - `baseUrl` falls back to "unknown" when falsy.
 * @param {number} startTime - Analysis start time in epoch milliseconds.
 * @returns {object} Result with metadata, discovered sitemaps, URL counts, suspicious
 *   groups, risk summary, summary statistics and transformed errors.
 */
function buildAnalysisResult(summary, discoveryResult, parseResult, riskGroups, config, startTime) {
  const analysisMetadata = buildAnalysisMetadata(config.baseUrl || "unknown", startTime, summary);

  const suspiciousGroups = riskGroups.map(
    ({ category, severity, count, rationale, sampleUrls, recommendedAction }) => ({
      category,
      severity,
      count,
      // Use category as pattern identifier
      pattern: category,
      rationale,
      // Limit to 5 samples
      sampleUrls: sampleUrls.slice(0, 5),
      recommendedAction
    })
  );

  let totalRiskyUrls = 0;
  for (const group of riskGroups) {
    totalRiskyUrls += group.count;
  }

  const { severityBreakdown } = summary;
  const summaryStats = {
    highSeverityCount: severityBreakdown.high,
    mediumSeverityCount: severityBreakdown.medium,
    lowSeverityCount: severityBreakdown.low,
    totalRiskyUrls,
    overallStatus: determineOverallStatus(severityBreakdown, parseResult.errors)
  };

  const riskSummary = {
    overview: summary.overview,
    keyFindings: summary.keyFindings,
    recommendations: summary.recommendations
  };

  return {
    analysisMetadata,
    sitemapsDiscovered: discoveryResult.sitemaps,
    totalUrlCount: parseResult.totalCount,
    urlsAnalyzed: parseResult.totalCount,
    suspiciousGroups,
    riskSummary,
    summary: summaryStats,
    errors: parseResult.errors.map(transformError)
  };
}
1601
/**
 * Creates the metadata record describing one analysis run.
 *
 * @param {string} baseUrl - Base URL that was analyzed.
 * @param {number} startTime - Analysis start time in epoch milliseconds.
 * @param {{generatedBy: string}} summary - Supplies the analysis type.
 * @returns {{baseUrl: string, analysisTimestamp: string, toolVersion: string, executionTimeMs: number, analysisType: string}}
 *   `analysisTimestamp` is the current time in ISO-8601 form; `executionTimeMs` is the
 *   time elapsed since `startTime`.
 */
function buildAnalysisMetadata(baseUrl, startTime, summary) {
  const analysisTimestamp = new Date().toISOString();
  const executionTimeMs = Date.now() - startTime;
  return {
    baseUrl,
    analysisTimestamp,
    toolVersion: TOOL_VERSION,
    executionTimeMs,
    analysisType: summary.generatedBy
  };
}
1610
/**
 * Reduces an analysis to a single status label.
 *
 * @param {{high: number, medium: number, low: number}} severityBreakdown
 * @param {Array} errors - Errors collected while parsing.
 * @returns {"errors"|"issues_found"|"clean"} "errors" whenever any error exists,
 *   otherwise "issues_found" when the severity counts sum to more than zero, else "clean".
 */
function determineOverallStatus(severityBreakdown, errors) {
  if (errors.length > 0) {
    return "errors";
  }
  const { high, medium, low } = severityBreakdown;
  if (high + medium + low > 0) {
    return "issues_found";
  }
  return "clean";
}
1617
/**
 * Converts the camelCase analysis result into the snake_case JSON report document.
 *
 * @param {object} result - Analysis result with metadata, counts, groups, summaries and
 *   errors.
 * @param {object} [performanceMetrics] - When truthy, a `performance_metrics` section is
 *   appended; otherwise that key is absent from the output.
 * @returns {object} The report document.
 */
function transformToJsonOutput(result, performanceMetrics) {
  const {
    analysisMetadata,
    sitemapsDiscovered,
    totalUrlCount,
    urlsAnalyzed,
    suspiciousGroups,
    riskSummary,
    summary,
    errors
  } = result;

  const output = {
    analysis_metadata: transformMetadata(analysisMetadata),
    sitemaps_discovered: sitemapsDiscovered,
    total_url_count: totalUrlCount,
    urls_analyzed: urlsAnalyzed,
    suspicious_groups: suspiciousGroups.map(transformGroup),
    risk_summary: transformRiskSummary(riskSummary),
    summary: transformSummary(summary),
    errors
  };

  if (!performanceMetrics) {
    return output;
  }

  const { totalExecutionTimeMs, phaseTimings, throughput, resourceUsage } = performanceMetrics;
  output.performance_metrics = {
    total_execution_time_ms: totalExecutionTimeMs,
    phase_timings: phaseTimings,
    throughput,
    resource_usage: resourceUsage
  };
  return output;
}
1638
/**
 * Renames analysis metadata fields from camelCase to the report's snake_case keys.
 *
 * @param {{baseUrl: *, analysisTimestamp: *, toolVersion: *, executionTimeMs: *, analysisType: *}} meta
 * @returns {{base_url: *, analysis_timestamp: *, tool_version: *, execution_time_ms: *, analysis_type: *}}
 */
function transformMetadata(meta) {
  const { baseUrl, analysisTimestamp, toolVersion, executionTimeMs, analysisType } = meta;
  return {
    base_url: baseUrl,
    analysis_timestamp: analysisTimestamp,
    tool_version: toolVersion,
    execution_time_ms: executionTimeMs,
    analysis_type: analysisType
  };
}
1647
/**
 * Converts one suspicious-URL group to the report's snake_case shape.
 *
 * @param {{category: *, severity: *, count: *, pattern: *, rationale: *, sampleUrls: *, recommendedAction: *}} group
 * @returns {{category: *, severity: *, count: *, pattern: *, rationale: *, sample_urls: *, recommended_action: *}}
 */
function transformGroup(group) {
  const { category, severity, count, pattern, rationale, sampleUrls, recommendedAction } = group;
  return {
    category,
    severity,
    count,
    pattern,
    rationale,
    sample_urls: sampleUrls,
    recommended_action: recommendedAction
  };
}
1658
/**
 * Converts the narrative risk summary to the report's snake_case shape.
 *
 * @param {{overview: *, keyFindings: *, recommendations: *}} summary
 * @returns {{overview: *, key_findings: *, recommendations: *}}
 */
function transformRiskSummary(summary) {
  const { overview, keyFindings, recommendations } = summary;
  return {
    overview,
    key_findings: keyFindings,
    recommendations
  };
}
1665
/**
 * Converts the summary statistics to the report's snake_case shape.
 *
 * @param {{highSeverityCount: *, mediumSeverityCount: *, lowSeverityCount: *, totalRiskyUrls: *, overallStatus: *}} summary
 * @returns {{high_severity_count: *, medium_severity_count: *, low_severity_count: *, total_risky_urls: *, overall_status: *}}
 */
function transformSummary(summary) {
  const { highSeverityCount, mediumSeverityCount, lowSeverityCount, totalRiskyUrls, overallStatus } = summary;
  return {
    high_severity_count: highSeverityCount,
    medium_severity_count: mediumSeverityCount,
    low_severity_count: lowSeverityCount,
    total_risky_urls: totalRiskyUrls,
    overall_status: overallStatus
  };
}
1674
/**
 * Converts an error into the report's error record.
 *
 * Errors without a `code` property become `UNKNOWN_ERROR` records carrying only the
 * message. Errors with a `code` keep it (falling back to `UNKNOWN_ERROR` when it is
 * falsy) and gain a `context` taken from the first of these that applies:
 * `attemptedPaths`; `sitemapUrl` together with `lineNumber`; `url`.
 *
 * @param {object} error - Error-like object; must support the `in` operator.
 * @returns {{code: string, message: string, context?: object}}
 */
function transformError(error) {
  if (!("code" in error)) {
    return {
      code: "UNKNOWN_ERROR",
      message: error.message
    };
  }

  const detail = {
    code: error.code || "UNKNOWN_ERROR",
    message: error.message
  };

  const hasAttemptedPaths = "attemptedPaths" in error;
  const hasParseLocation = "sitemapUrl" in error && "lineNumber" in error;
  if (hasAttemptedPaths) {
    detail.context = { attempted_paths: error.attemptedPaths };
  } else if (hasParseLocation) {
    detail.context = { sitemap_url: error.sitemapUrl, line_number: error.lineNumber };
  } else if ("url" in error) {
    detail.context = { url: error.url };
  }

  return detail;
}
1702
+
1703
+ // src/reporters/html-reporter.ts
1704
+ import { promises as fs } from "fs";
1705
+ var TOOL_VERSION2 = "1.0.0-alpha.1";
1706
+ function generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options = {}) {
1707
+ const maxUrls = options.maxUrlsPerGroup ?? 10;
1708
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString();
1709
+ const riskyUrlCount = summary.categoryInsights.reduce((sum, g) => sum + g.count, 0);
1710
+ const highSeverity = summary.categoryInsights.filter((g) => g.severity === "high");
1711
+ const mediumSeverity = summary.categoryInsights.filter((g) => g.severity === "medium");
1712
+ const lowSeverity = summary.categoryInsights.filter((g) => g.severity === "low");
1713
+ const html = `<!DOCTYPE html>
20
1714
  <html lang="en">
21
1715
  <head>
22
1716
  <meta charset="UTF-8">
23
1717
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
24
- <title>Sitemap QA Report - ${s.baseUrl}</title>
1718
+ <title>Sitemap QA Report - ${config.baseUrl}</title>
25
1719
  <style>
26
1720
  * { margin: 0; padding: 0; box-sizing: border-box; }
27
1721
  body {
@@ -339,93 +2033,88 @@ Risk Categories Found:`);for(let p of l.groups)console.log(` - ${p.category}: $
339
2033
  <div class="header">
340
2034
  <h1>Sitemap Analysis</h1>
341
2035
  <div class="meta">
342
- <div>${s.baseUrl}</div>
343
- <div>${new Date(o).toLocaleString()}</div>
2036
+ <div>${config.baseUrl}</div>
2037
+ <div>${new Date(timestamp).toLocaleString()}</div>
344
2038
  </div>
345
2039
  </div>
346
2040
 
347
2041
  <div class="summary">
348
2042
  <div class="summary-card">
349
2043
  <div class="label">Sitemaps</div>
350
- <div class="value">${t.sitemaps.length}</div>
2044
+ <div class="value">${discoveryResult.sitemaps.length}</div>
351
2045
  </div>
352
2046
  <div class="summary-card">
353
2047
  <div class="label">URLs Analyzed</div>
354
- <div class="value">${r.toLocaleString()}</div>
2048
+ <div class="value">${totalUrls.toLocaleString()}</div>
355
2049
  </div>
356
2050
  <div class="summary-card">
357
2051
  <div class="label">Issues Found</div>
358
- <div class="value" style="color: ${i>0?"#dc2626":"#059669"}">${i}</div>
2052
+ <div class="value" style="color: ${riskyUrlCount > 0 ? "#dc2626" : "#059669"}">${riskyUrlCount}</div>
359
2053
  </div>
360
2054
  <div class="summary-card">
361
2055
  <div class="label">Scan Time</div>
362
- <div class="value">${(e.metadata.processingTime/1e3).toFixed(1)}s</div>
2056
+ <div class="value">${(summary.metadata.processingTime / 1e3).toFixed(1)}s</div>
363
2057
  </div>
364
2058
  </div>
365
2059
 
366
2060
  <div class="content">
367
- ${n.length>0?`
2061
+ ${errors.length > 0 ? `
368
2062
  <div class="errors-section">
369
- <h3>Parsing Errors & Warnings (${n.length})</h3>
2063
+ <h3>Parsing Errors & Warnings (${errors.length})</h3>
370
2064
  <ul>
371
- ${n.map(u=>`<li>${u.message}</li>`).join(`
372
- `)}
2065
+ ${errors.map((err) => `<li>${err.message}</li>`).join("\n ")}
373
2066
  </ul>
374
2067
  </div>
375
- `:""}
2068
+ ` : ""}
376
2069
 
377
- ${t.sitemaps.length>0?`
2070
+ ${discoveryResult.sitemaps.length > 0 ? `
378
2071
  <div class="sitemaps">
379
- <h3 class="collapsed" onclick="toggleSection(this)">Sitemaps Discovered (${t.sitemaps.length})</h3>
2072
+ <h3 class="collapsed" onclick="toggleSection(this)">Sitemaps Discovered (${discoveryResult.sitemaps.length})</h3>
380
2073
  <div class="sitemaps-content collapsed">
381
2074
  <ul>
382
- ${t.sitemaps.map(u=>`<li>\u2022 ${u}</li>`).join(`
383
- `)}
2075
+ ${discoveryResult.sitemaps.map((s) => `<li>\u2022 ${s}</li>`).join("\n ")}
384
2076
  </ul>
385
2077
  </div>
386
2078
  </div>
387
- `:""}
2079
+ ` : ""}
388
2080
 
389
- ${i===0?`
2081
+ ${riskyUrlCount === 0 ? `
390
2082
  <div class="status-clean">
391
2083
  <h2>No Issues Found</h2>
392
2084
  <p>All URLs in the sitemap passed validation checks.</p>
393
2085
  </div>
394
- `:""}
2086
+ ` : ""}
395
2087
 
396
- ${m.length>0?`
2088
+ ${highSeverity.length > 0 ? `
397
2089
  <div class="severity-section">
398
- <h2 class="severity-high" onclick="toggleSection(this)">High Severity (${m.reduce((u,g)=>u+g.count,0)} URLs)</h2>
2090
+ <h2 class="severity-high" onclick="toggleSection(this)">High Severity (${highSeverity.reduce((sum, g) => sum + g.count, 0)} URLs)</h2>
399
2091
  <div class="severity-content">
400
- ${m.map(u=>_(u,c)).join(`
401
- `)}
2092
+ ${highSeverity.map((group) => renderRiskGroup(group, maxUrls)).join("\n ")}
402
2093
  </div>
403
2094
  </div>
404
- `:""}
2095
+ ` : ""}
405
2096
 
406
- ${l.length>0?`
2097
+ ${mediumSeverity.length > 0 ? `
407
2098
  <div class="severity-section">
408
- <h2 class="severity-medium" onclick="toggleSection(this)">Medium Severity (${l.reduce((u,g)=>u+g.count,0)} URLs)</h2>
2099
+ <h2 class="severity-medium" onclick="toggleSection(this)">Medium Severity (${mediumSeverity.reduce((sum, g) => sum + g.count, 0)} URLs)</h2>
409
2100
  <div class="severity-content">
410
- ${l.map(u=>_(u,c)).join(`
411
- `)}
2101
+ ${mediumSeverity.map((group) => renderRiskGroup(group, maxUrls)).join("\n ")}
412
2102
  </div>
413
2103
  </div>
414
- `:""}
2104
+ ` : ""}
415
2105
 
416
- ${d.length>0?`
2106
+ ${lowSeverity.length > 0 ? `
417
2107
  <div class="severity-section">
418
- <h2 class="severity-low" onclick="toggleSection(this)">Low Severity (${d.reduce((u,g)=>u+g.count,0)} URLs)</h2>
2108
+ <h2 class="severity-low" onclick="toggleSection(this)">Low Severity (${lowSeverity.reduce((sum, g) => sum + g.count, 0)} URLs)</h2>
419
2109
  <div class="severity-content">
420
- ${d.map(u=>_(u,c)).join(`
421
- `)}
2110
+ ${lowSeverity.map((group) => renderRiskGroup(group, maxUrls)).join("\n ")}
422
2111
  </div>
423
2112
  </div>
424
- `:""}
2113
+ ` : ""}
425
2114
  </div>
426
2115
 
427
2116
  <div class="footer">
428
- Generated by <strong>sitemap-qa</strong> v${Ie}
2117
+ Generated by <strong>sitemap-qa</strong> v${TOOL_VERSION2}
429
2118
  </div>
430
2119
  </div>
431
2120
 
@@ -458,27 +2147,354 @@ Risk Categories Found:`);for(let p of l.groups)console.log(` - ${p.category}: $
458
2147
  }
459
2148
  </script>
460
2149
  </body>
461
- </html>`}function _(e,t){let r=e.category.split("_").map(i=>i.charAt(0).toUpperCase()+i.slice(1)).join(" "),s=e.examples.slice(0,t),n=e.count-s.length,a=e.category.toLowerCase(),c=JSON.stringify(e.allUrls),o=ee(c);return`<div class="risk-group">
462
- <h3>${r} <span class="count">${e.count} URLs</span></h3>
463
- <div class="impact">${e.summary}</div>
2150
+ </html>`;
2151
+ return html;
2152
+ }
2153
/**
 * Render one risk group as an HTML fragment for the report.
 *
 * @param {object} group - Risk group. This function reads `category`,
 *   `examples`, `allUrls`, `count` and `summary`.
 * @param {number} maxUrls - Maximum number of example URLs to list.
 * @returns {string} Markup for a `<div class="risk-group">` element.
 */
function renderRiskGroup(group, maxUrls) {
  const categoryTitle = group.category
    .split("_")
    .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
    .join(" ");
  const urlsToShow = group.examples.slice(0, maxUrls);
  const remaining = group.count - urlsToShow.length;
  const categorySlug = group.category.toLowerCase();
  const allUrlsJson = JSON.stringify(group.allUrls ?? []);

  // The download arguments sit in inline JavaScript inside an HTML attribute,
  // so they pass through two parsers: the browser first decodes character
  // references in the attribute, then the result is parsed as JavaScript.
  // HTML-escaping alone is therefore not enough: `&#039;` decodes back to `'`
  // and would terminate a single-quoted JS string. Encode each value as a
  // JSON string literal first (a valid JS string literal), then HTML-escape
  // that literal for the attribute.
  const toInlineJsString = (value) =>
    escapeHtml(
      JSON.stringify(value)
        // Line/paragraph separators are only legal inside JS string literals
        // since ES2019; escape them so older engines still parse the handler.
        .replace(/\u2028/g, "\\u2028")
        .replace(/\u2029/g, "\\u2029")
    );
  const slugArg = toInlineJsString(categorySlug);
  const urlsArg = toInlineJsString(allUrlsJson);

  const sampleItems = urlsToShow.map((url) => `<li>${escapeHtml(url)}</li>`).join("\n");
  const moreNotice = remaining > 0 ? `<div class="more">... and ${remaining} more</div>` : "";

  // NOTE(review): `group.summary` is inserted unescaped, as before. Confirm it
  // can never contain text taken from the analyzed site.
  return `<div class="risk-group">
<h3>${categoryTitle} <span class="count">${group.count} URLs</span></h3>
<div class="impact">${group.summary}</div>
<div class="urls">
<h4>Sample URLs</h4>
<ul>
${sampleItems}
</ul>
${moreNotice}
<button class="download-btn" onclick="downloadUrls(${slugArg}, ${urlsArg})">\u{1F4E5} Download All ${group.count} URLs</button>
</div>
</div>`;
}
2173
/**
 * Escape the characters `&`, `<`, `>`, `"` and `'` so the value can be placed
 * in HTML text or in a quoted attribute value.
 *
 * @param {*} text - Value to escape. `null`/`undefined` yield an empty string;
 *   any other non-string value is converted with `String()` first instead of
 *   throwing a TypeError.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
  if (text == null) {
    return "";
  }
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#039;",
  };
  // Single pass over the input, so an `&` produced by one replacement can
  // never be escaped a second time.
  return String(text).replace(/[&<>"']/g, (ch) => entities[ch]);
}
2176
/**
 * Build the HTML report with `generateHtmlReport` and write it to disk as
 * UTF-8.
 *
 * @param summary - Forwarded to `generateHtmlReport`.
 * @param discoveryResult - Forwarded to `generateHtmlReport`.
 * @param totalUrls - Forwarded to `generateHtmlReport`.
 * @param config - Forwarded to `generateHtmlReport`.
 * @param {string} outputPath - Destination file path.
 * @param errors - Forwarded to `generateHtmlReport`.
 * @param {object} [options={}] - Forwarded to `generateHtmlReport`.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function writeHtmlReport(summary, discoveryResult, totalUrls, config, outputPath, errors, options = {}) {
  await fs.writeFile(
    outputPath,
    generateHtmlReport(summary, discoveryResult, totalUrls, config, errors, options),
    "utf-8"
  );
}
2180
+
2181
+ // src/commands/analyze.ts
2182
/**
 * `analyze <url>` sub-command.
 *
 * Validates the CLI options, loads the merged configuration, runs the analysis
 * pipeline and emits either a JSON report (printed to stdout, optionally also
 * saved when `--output-file` is given) or an HTML report file. The process
 * exits with the code from `determineExitCode`, or with 2 when any step throws.
 */
var analyzeCommand = new Command("analyze")
  .description("Analyze sitemap for QA issues")
  .argument("<url>", "Base URL to analyze")
  .option("--timeout <seconds>", "HTTP timeout in seconds", "30")
  .option("--no-progress", "Disable progress bar")
  .option("--output <format>", "Output format: html or json", "html")
  .option("--output-dir <path>", "Output directory for reports")
  .option("--output-file <path>", "Custom output filename")
  .option("--accepted-patterns <patterns>", "Comma-separated regex patterns to exclude from risk detection")
  .option("--concurrency <number>", "Number of concurrent workers for risk detection")
  .option("--batch-size <number>", "URLs per batch for risk detection", "10000")
  .option("--parsing-concurrency <number>", "Number of concurrent sitemap parsers", "50")
  .option("--discovery-concurrency <number>", "Number of concurrent sitemap index fetches", "50")
  .option("--silent", "Disable all progress output")
  .option("--benchmark", "Save performance profile")
  .option("--no-color", "Disable ANSI color codes in CLI output")
  .option("--verbose", "Enable verbose logging", false)
  .action(async (url, options) => {
    // Declared outside the try so the error handler can read `verbose` when
    // the failure happens after the configuration was loaded.
    let config;
    // Optional numeric flags arrive as strings. Always parse base 10 so a
    // value such as "0x10" is not silently read as hexadecimal.
    const toOptionalInt = (value) => (value ? parseInt(value, 10) : void 0);
    try {
      validateAnalyzeOptions(options);
      config = await loadConfig({
        ...options,
        baseUrl: url,
        outputFormat: options.output,
        riskDetectionConcurrency: toOptionalInt(options.concurrency),
        riskDetectionBatchSize: toOptionalInt(options.batchSize),
        parsingConcurrency: toOptionalInt(options.parsingConcurrency),
        discoveryConcurrency: toOptionalInt(options.discoveryConcurrency),
        silent: options.silent,
        benchmark: options.benchmark,
        progressBar: options.progress
      });
      // The banner is progress output, so it honours --silent like the
      // pipeline's own status messages do.
      if (!config.silent) {
        console.log(`\n\u{1F50D} Analyzing ${url}...\n`);
      }
      const result = await runAnalysisPipeline(url, config);
      await fs2.mkdir(config.outputDir, { recursive: true });
      if (options.output === "json") {
        const jsonReport = generateJsonReport(
          result.summary,
          result.discoveryResult,
          { totalCount: result.totalUrls, uniqueUrls: [], errors: [] },
          result.riskGroups,
          config,
          result.executionTime,
          { pretty: true, indent: 2 }
        );
        console.log("\n" + jsonReport);
        // JSON is only persisted when a file name was requested explicitly.
        if (options.outputFile) {
          const jsonFilePath = `${config.outputDir}/${options.outputFile}`;
          await fs2.writeFile(jsonFilePath, jsonReport, "utf-8");
          console.log(`\n\u{1F4C4} JSON report saved to: ${chalk.cyan(jsonFilePath)}`);
        }
      } else {
        showCliSummary(result);
        const htmlFileName = options.outputFile || `sitemap-qa-report-${Date.now()}.html`;
        const htmlFilePath = `${config.outputDir}/${htmlFileName}`;
        await writeHtmlReport(
          result.summary,
          result.discoveryResult,
          result.totalUrls,
          config,
          htmlFilePath,
          result.errors,
          { maxUrlsPerGroup: 10 }
        );
        console.log(`\n\u{1F4C4} Full report saved to: ${chalk.cyan(htmlFilePath)}`);
      }
      const exitCode = determineExitCode(result);
      process.exit(exitCode);
    } catch (error) {
      handleAnalysisError(error, config);
      process.exit(2);
    }
  });
2244
/**
 * Validate raw CLI options before any configuration is loaded.
 *
 * @param {object} options - Parsed CLI options. `output` and `timeout` are
 *   always checked. `concurrency`, `batchSize`, `parsingConcurrency` and
 *   `discoveryConcurrency` are checked only when a value is present.
 * @throws {Error} If the output format is not `json` or `html`, or if the
 *   timeout or any provided numeric option is not a positive number.
 */
function validateAnalyzeOptions(options) {
  const validFormats = ["json", "html"];
  if (!validFormats.includes(options.output)) {
    throw new Error(
      `Invalid output format: ${options.output}. Must be one of: ${validFormats.join(", ")}`
    );
  }

  // Base 10 is explicit so this check agrees with how the value is parsed
  // later; without a radix "0x1E" would pass here as 30.
  const timeout = Number.parseInt(options.timeout, 10);
  if (Number.isNaN(timeout) || timeout <= 0) {
    throw new Error(`Invalid timeout: ${options.timeout}. Must be a positive number.`);
  }

  // These flags are turned into numbers by the command action; reject
  // garbage here so NaN or non-positive values never reach the config.
  const positiveIntFlags = [
    ["concurrency", "--concurrency"],
    ["batchSize", "--batch-size"],
    ["parsingConcurrency", "--parsing-concurrency"],
    ["discoveryConcurrency", "--discovery-concurrency"]
  ];
  for (const [key, flag] of positiveIntFlags) {
    const raw = options[key];
    if (raw == null || raw === "") {
      continue;
    }
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed) || parsed <= 0) {
      throw new Error(`Invalid value for ${flag}: ${raw}. Must be a positive integer.`);
    }
  }
}
2256
/**
 * Print a short result summary to stdout: a divider, then either an
 * "all clean" line or the risky-URL total with its non-zero severity counts,
 * followed by a blank line.
 *
 * @param {object} result - Pipeline result; reads `summary.categoryInsights`
 *   and, when risky URLs exist, `summary.severityBreakdown`.
 */
function showCliSummary(result) {
  let riskyUrlCount = 0;
  for (const insight of result.summary.categoryInsights) {
    riskyUrlCount = riskyUrlCount + insight.count;
  }

  console.log(chalk.dim("\u2500".repeat(50)));

  if (riskyUrlCount === 0) {
    console.log(chalk.green("\u2705 No issues found - sitemap looks clean!"));
    console.log("");
    return;
  }

  const { high, medium, low } = result.summary.severityBreakdown;
  const severityParts = [
    { count: high, paint: (label) => chalk.red(label), label: "High" },
    { count: medium, paint: (label) => chalk.yellow(label), label: "Medium" },
    { count: low, paint: (label) => chalk.blue(label), label: "Low" }
  ]
    .filter((entry) => entry.count > 0)
    .map((entry) => entry.paint(`${entry.label}: ${entry.count}`));

  let severitySummary = "";
  if (severityParts.length > 0) {
    severitySummary = ` (${severityParts.join(", ")})`;
  }
  console.log(chalk.yellow(`\u26A0\uFE0F ${riskyUrlCount} risky URLs found${severitySummary}`));
  console.log("");
}
2272
/**
 * Run the full analysis: sitemap discovery, URL extraction, de-duplication,
 * risk detection and summarization, timing each phase along the way.
 *
 * @param {string} url - Base URL passed to discovery and risk detection.
 * @param {object} config - Loaded configuration; this function reads `silent`,
 *   `progressBar`, `verbose` and `benchmark` and forwards the object to the
 *   phase functions.
 * @returns {Promise<object>} `{ discoveryResult, totalUrls, riskGroups,
 *   summary, errors, executionTime, phaseTimings }`.
 * @throws {Error} When no sitemaps are found or no URLs are extracted.
 */
async function runAnalysisPipeline(url, config) {
  const overallStartTime = Date.now();
  const phaseTimings = [];
  const errors = [];
  const showProgress = !config.silent && config.progressBar !== false && process.stdout.isTTY;

  // Record a finished phase. The clock is read once so `endTime` and
  // `duration` always agree with each other.
  const recordPhase = (name, startTime) => {
    const endTime = Date.now();
    phaseTimings.push({ name, startTime, endTime, duration: endTime - startTime });
  };

  // Phase: discovery
  let phaseStart = Date.now();
  const discoverySpinner = showProgress ? ora({ text: "Discovering sitemaps...", color: "cyan" }).start() : null;
  let discoveryResult;
  try {
    discoveryResult = await discoverSitemaps(url, config);
  } finally {
    // Stop the spinner even when discovery throws, so it does not keep
    // redrawing over the error output.
    if (discoverySpinner) {
      discoverySpinner.stop();
    }
  }
  recordPhase("Discovery", phaseStart);

  if (discoveryResult.accessIssues.length > 0) {
    if (!config.silent) {
      console.warn(chalk.yellow(`\u26A0\uFE0F Warning: ${discoveryResult.accessIssues.length} sitemap(s) are access-blocked`));
    }
    for (const issue of discoveryResult.accessIssues) {
      errors.push(new Error(`Access blocked: ${issue.url} (${issue.statusCode})`));
    }
  }
  if (discoveryResult.sitemaps.length === 0) {
    throw new Error(`No sitemaps found at ${url}. Tried: /sitemap.xml, /sitemap_index.xml, /robots.txt`);
  }

  // Phase: parsing
  phaseStart = Date.now();
  let extractionResult;
  if (showProgress && discoveryResult.sitemaps.length > 10) {
    const parseBar = new cliProgress.SingleBar({
      format: "{bar} {percentage}% | {value}/{total} | ETA: {eta}s | {speed} sitemaps/sec",
      barCompleteChar: "\u2588",
      barIncompleteChar: "\u2591",
      hideCursor: true
    });
    parseBar.start(discoveryResult.sitemaps.length, 0, { speed: "0" });
    try {
      extractionResult = await extractAllUrls(
        discoveryResult.sitemaps,
        config,
        (completed) => {
          const elapsed = (Date.now() - phaseStart) / 1e3;
          const speed = elapsed > 0 ? (completed / elapsed).toFixed(1) : "0";
          parseBar.update(completed, { speed });
        }
      );
    } finally {
      // The bar was created with hideCursor, so it must be stopped on
      // failure too or the terminal cursor stays hidden.
      parseBar.stop();
    }
  } else {
    extractionResult = await extractAllUrls(discoveryResult.sitemaps, config);
  }
  recordPhase("Parsing", phaseStart);

  // Extraction errors may be plain strings; normalise them to Error objects.
  for (const err of extractionResult.errors) {
    errors.push(typeof err === "string" ? new Error(err) : err);
  }
  if (extractionResult.allUrls.length === 0) {
    throw new Error("No URLs extracted from sitemaps");
  }

  // Phase: de-duplication
  phaseStart = Date.now();
  const consolidatedResult = consolidateUrls(extractionResult.allUrls);
  recordPhase("Deduplication", phaseStart);

  const duplicatesRemoved = extractionResult.allUrls.length - consolidatedResult.uniqueUrls.length;
  const duplicatePercentage = duplicatesRemoved / extractionResult.allUrls.length * 100;
  if (!config.silent) {
    // Mention the unique count only when de-duplication made a visible difference.
    if (duplicatesRemoved > 100 || duplicatePercentage > 1) {
      console.log(chalk.green(`\u2713 Analyzed ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs (${consolidatedResult.uniqueUrls.length.toLocaleString()} unique)`));
    } else {
      console.log(chalk.green(`\u2713 Analyzed ${discoveryResult.sitemaps.length} sitemap(s) \u2192 ${extractionResult.allUrls.length.toLocaleString()} URLs`));
    }
  }

  // Phase: risk detection
  phaseStart = Date.now();
  const riskResult = await detectRisks(consolidatedResult.uniqueUrls, url, config);
  const riskGroups = groupRiskFindings(riskResult.findings);
  recordPhase("Risk Detection", phaseStart);

  // Phase: summarization
  phaseStart = Date.now();
  const executionTime = Date.now() - overallStartTime;
  const summary = summarizeRisks({
    riskGroups: riskGroups.groups,
    totalUrls: consolidatedResult.uniqueUrls.length,
    sitemapUrl: url,
    processingTime: executionTime
  });
  recordPhase("Summarization", phaseStart);

  if (!config.silent && config.verbose) {
    displayPhaseSummary(phaseTimings, executionTime);
  } else if (!config.silent) {
    const parsingPhase = phaseTimings.find((p) => p.name === "Parsing");
    // Guard the zero-duration case so the line never reads "Infinity sitemaps/sec".
    const sitemapsPerSec = parsingPhase && parsingPhase.duration > 0
      ? (discoveryResult.sitemaps.length / (parsingPhase.duration / 1e3)).toFixed(1)
      : "0";
    console.log(chalk.green(`\u2705 Analysis complete (${(executionTime / 1e3).toFixed(1)}s \xB7 ${sitemapsPerSec} sitemaps/sec)\n`));
  }
  if (config.benchmark) {
    await saveBenchmark(phaseTimings, url, executionTime, discoveryResult.sitemaps.length, consolidatedResult.uniqueUrls.length, config);
  }

  return {
    discoveryResult,
    totalUrls: consolidatedResult.uniqueUrls.length,
    riskGroups: riskGroups.groups,
    summary,
    errors,
    executionTime,
    phaseTimings
  };
}
2402
/**
 * Map an analysis result to a process exit code.
 *
 * @param {object} result - Pipeline result; reads `summary.severityBreakdown.high`.
 * @returns {number} 1 when at least one high-severity URL was counted, else 0.
 */
function determineExitCode(result) {
  return result.summary.severityBreakdown.high > 0 ? 1 : 0;
}
2409
/**
 * Report a failed analysis on stderr.
 *
 * Prints the error message, the stack trace when `config.verbose` is set, and
 * a list of suggestions when the message mentions missing sitemaps, "Network"
 * or "timeout". Values that are not `Error` instances are printed via `String()`.
 *
 * @param {*} error - The caught value.
 * @param {object} [config] - Loaded configuration; may be undefined when the
 *   failure happened before it was loaded.
 */
function handleAnalysisError(error, config) {
  console.error("\n\u274C Analysis failed\n");

  if (!(error instanceof Error)) {
    console.error("Unknown error occurred");
    console.error(String(error));
    return;
  }

  console.error(`Error: ${error.message}`);
  if (config?.verbose && error.stack) {
    console.error("\nStack trace:");
    console.error(error.stack);
  }

  // Choose the hint list that matches the failure, if any.
  let hints = [];
  if (error.message.includes("No sitemaps found")) {
    hints = [
      " \u2022 Verify the base URL is correct",
      " \u2022 Check if the site has a sitemap",
      " \u2022 Ensure the sitemap is publicly accessible"
    ];
  } else if (error.message.includes("Network") || error.message.includes("timeout")) {
    hints = [
      " \u2022 Check your internet connection",
      " \u2022 Verify the URL is accessible",
      " \u2022 Try increasing the timeout with --timeout option"
    ];
  }
  if (hints.length === 0) {
    return;
  }
  console.error("\nSuggestions:");
  for (const hint of hints) {
    console.error(hint);
  }
}
2433
/**
 * Print the verbose completion banner followed by one line per phase showing
 * its duration in seconds and its share of the total time.
 *
 * @param {Array<{name: string, duration: number}>} timings - Recorded phases.
 * @param {number} totalTime - Total run time in milliseconds.
 */
function displayPhaseSummary(timings, totalTime) {
  const totalSeconds = (totalTime / 1e3).toFixed(1);
  console.log(chalk.green(`\n\u2705 Analysis Complete (Total: ${totalSeconds}s)\n`));
  console.log(chalk.cyan("Phase Breakdown:"));

  const bullet = "\u2022";
  timings.forEach(({ name, duration }) => {
    const seconds = (duration / 1e3).toFixed(1).padStart(5);
    const share = (duration / totalTime * 100).toFixed(1).padStart(5);
    console.log(` ${bullet} ${name.padEnd(15)}: ${seconds}s (${share}%)`);
  });

  console.log("");
}
2446
/**
 * Write a JSON performance profile (`performance-profile-<timestamp>.json`,
 * path relative to the current working directory) and log where it was saved.
 *
 * @param {Array<object>} timings - Phase records with `name`, `startTime`,
 *   `endTime` and `duration`.
 * @param {string} url - Analyzed URL, stored verbatim in the profile.
 * @param {number} totalTime - Total run time in milliseconds.
 * @param {number} sitemapCount - Number of sitemaps processed.
 * @param {number} urlCount - Number of URLs analyzed.
 * @param {object} config - Configuration; its concurrency and batch-size
 *   settings are copied into the profile.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function saveBenchmark(timings, url, totalTime, sitemapCount, urlCount, config) {
  const timestamp = new Date().toISOString();

  const phases = timings.map(({ name, startTime, endTime, duration }) => ({
    name: name.toLowerCase(),
    start_ms: startTime,
    end_ms: endTime,
    duration_ms: duration
  }));

  // Rates are derived from the millisecond total, hence the factor of 1000.
  const throughput = {
    urls_per_second: Math.round(urlCount / totalTime * 1e3),
    sitemaps_per_second: (sitemapCount / totalTime * 1e3).toFixed(2)
  };

  const systemInfo = {
    cpu_count: os2.cpus().length,
    node_version: process.version,
    platform: process.platform,
    memory_total_mb: Math.round(os2.totalmem() / 1024 / 1024)
  };

  const configSnapshot = {
    discovery_concurrency: config.discoveryConcurrency,
    parsing_concurrency: config.parsingConcurrency,
    risk_detection_concurrency: config.riskDetectionConcurrency,
    risk_detection_batch_size: config.riskDetectionBatchSize
  };

  const benchmark = {
    timestamp,
    url,
    total_duration_ms: totalTime,
    phases,
    metrics: {
      sitemaps_processed: sitemapCount,
      urls_analyzed: urlCount,
      throughput
    },
    system_info: systemInfo,
    config: configSnapshot
  };

  const filename = `performance-profile-${Date.now()}.json`;
  await fs2.writeFile(filename, JSON.stringify(benchmark, null, 2));
  console.log(chalk.blue(`\u{1F4CA} Benchmark saved to: ${filename}`));
}
2482
+
2483
+ // src/index.ts
2484
// CLI entry point: build the root command, register the `analyze`
// sub-command, install process-level handlers, then parse argv.
var program = new Command2()
  .name("sitemap-qa")
  .version("1.0.0")
  .description("sitemap analysis for QA teams")
  .addCommand(analyzeCommand);

// Any promise rejection nobody handled is fatal: log it and exit with 1.
process.on("unhandledRejection", (reason, promise) => {
  console.error("Unhandled Rejection at:", promise, "reason:", reason);
  process.exit(1);
});

// Both termination signals share the same behaviour: announce and exit 0.
for (const signal of ["SIGINT", "SIGTERM"]) {
  process.on(signal, () => {
    console.log("\nGracefully shutting down...");
    process.exit(0);
  });
}

program.parse();
484
2500
  //# sourceMappingURL=index.js.map