aeorank 1.6.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -151,7 +151,7 @@ function extractInternalLinks(html, domain) {
151
151
  return Array.from(urls);
152
152
  }
153
153
  var CATEGORY_PATTERNS = [
154
- [/\/(blog|articles?|posts?|news|insights|guides)\b/i, "blog"],
154
+ [/\/([^/]*-?)?(blog|articles?|posts?|news|insights|guides)\b/i, "blog"],
155
155
  [/\/(about|about-us|company|who-we-are)\b/i, "about"],
156
156
  [/\/(pricing|plans|packages)\b/i, "pricing"],
157
157
  [/\/(services?|features?|solutions?|products?|what-we-do|offerings?)\b/i, "services"],
@@ -289,4 +289,4 @@ export {
289
289
  isDisallowedByRobots,
290
290
  parseRobotsTxt
291
291
  };
292
- //# sourceMappingURL=full-site-crawler-F7J2HRL4.js.map
292
+ //# sourceMappingURL=full-site-crawler-FQYO46YV.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/full-site-crawler.ts"],"sourcesContent":["/**\n * Full-site crawler for deep AEO audits.\n * BFS crawl that discovers all internal pages up to a configurable limit.\n */\n\nimport type { FetchResult, SiteData, PageCategory } from './site-crawler.js';\n\n// ─── Types ──────────────────────────────────────────────────────────────────\n\nexport interface CrawlOptions {\n /** Maximum pages to fetch (default 200) */\n maxPages?: number;\n /** Per-page fetch timeout in ms (default 10000) */\n timeoutMs?: number;\n /** Parallel fetches (default 5) */\n concurrency?: number;\n /** Honor robots.txt Disallow rules (default true) */\n respectRobots?: boolean;\n /** Include asset files — skipped by default */\n includeAssets?: boolean;\n}\n\nexport interface CrawlResult {\n pages: FetchResult[];\n discoveredUrls: string[];\n fetchedUrls: string[];\n skippedUrls: string[];\n elapsed: number;\n}\n\n// ─── Resource file extensions to skip ────────────────────────────────────────\n\nconst RESOURCE_EXTENSIONS = /\\.(js|css|png|jpg|jpeg|gif|svg|ico|pdf|xml|txt|woff|woff2|ttf|eot|mp4|mp3|webp|avif|zip|gz|tar|json)$/i;\n\nconst SKIP_PATH_PATTERNS = /^\\/(api|wp-admin|wp-json|static|assets|_next|auth|login|signup|cart|checkout|admin|feed|xmlrpc)\\b/i;\n\n// ─── Robots.txt parsing ─────────────────────────────────────────────────────\n\ninterface RobotsRules {\n disallow: string[];\n allow: string[];\n}\n\nexport function parseRobotsTxt(robotsText: string): RobotsRules {\n const lines = robotsText.split('\\n');\n const rules: RobotsRules = { disallow: [], allow: [] };\n\n // Collect rules for User-agent: * and User-agent: AEO-Visibility-Bot\n let inRelevantSection = false;\n\n for (const rawLine of lines) {\n const line = rawLine.trim();\n if (!line || line.startsWith('#')) continue;\n\n const uaMatch = line.match(/^user-agent:\\s*(.+)/i);\n if (uaMatch) {\n const agent = uaMatch[1].trim().toLowerCase();\n inRelevantSection = agent === '*' || agent === 'aeo-visibility-bot';\n continue;\n }\n\n if (!inRelevantSection) continue;\n\n const disallowMatch = line.match(/^disallow:\\s*(.*)/i);\n if (disallowMatch) {\n const path = disallowMatch[1].trim();\n if (path) rules.disallow.push(path);\n continue;\n }\n\n const allowMatch = line.match(/^allow:\\s*(.*)/i);\n if (allowMatch) {\n const path = allowMatch[1].trim();\n if (path) rules.allow.push(path);\n }\n }\n\n return rules;\n}\n\nexport function isDisallowedByRobots(urlPath: string, rules: RobotsRules): boolean {\n // Check allow rules first — more specific (longer) rules take precedence\n let longestAllow = 0;\n let longestDisallow = 0;\n\n for (const pattern of rules.allow) {\n if (urlPath.startsWith(pattern) && pattern.length > longestAllow) {\n longestAllow = pattern.length;\n }\n }\n\n for (const pattern of rules.disallow) {\n if (urlPath.startsWith(pattern) && pattern.length > longestDisallow) {\n longestDisallow = pattern.length;\n }\n }\n\n // More specific (longer) rule wins; if equal length, allow wins\n if (longestAllow === 0 && longestDisallow === 0) return false;\n return longestDisallow > longestAllow;\n}\n\n// ─── Fetch helper (matches multi-page-fetcher.ts fetchPage) ──────────────────\n\nasync function fetchPage(url: string, timeoutMs = 10000): Promise<FetchResult | null> {\n try {\n const res = await fetch(url, {\n signal: AbortSignal.timeout(timeoutMs),\n headers: { 'User-Agent': 'AEO-Visibility-Bot/1.0' },\n redirect: 'follow',\n });\n if (res.status !== 200) return null;\n const text = await res.text();\n if (text.length < 200) return null;\n return { text: text.slice(0, 500_000), status: res.status, finalUrl: res.url };\n } catch {\n return null;\n }\n}\n\nasync function fetchSitemapXml(url: string, timeoutMs = 10000): Promise<string | null> {\n try {\n const res = await fetch(url, {\n signal: AbortSignal.timeout(timeoutMs),\n headers: { 'User-Agent': 'AEO-Visibility-Bot/1.0' },\n redirect: 'follow',\n });\n if (res.status !== 200) return null;\n return await res.text();\n } catch {\n return null;\n }\n}\n\n// ─── Sitemap parsing ────────────────────────────────────────────────────────\n\n/**\n * Extract all page URLs from sitemap XML (handles sitemapindex with sub-sitemaps).\n * Filters to same domain only, skips resource files.\n */\nexport async function extractAllUrlsFromSitemap(\n sitemapText: string,\n domain: string,\n timeoutMs = 10000,\n): Promise<string[]> {\n const cleanDomain = domain.replace(/^www\\./, '').toLowerCase();\n const urls = new Set<string>();\n\n // Check for sitemapindex — fetch sub-sitemaps\n const subSitemapLocs = sitemapText.match(/<sitemap>[\\s\\S]*?<loc>([^<]+)<\\/loc>[\\s\\S]*?<\\/sitemap>/gi) || [];\n if (subSitemapLocs.length > 0) {\n const subUrls: string[] = [];\n for (const block of subSitemapLocs) {\n const locMatch = block.match(/<loc>([^<]+)<\\/loc>/i);\n if (locMatch) subUrls.push(locMatch[1].trim());\n }\n\n // Fetch sub-sitemaps in parallel (limit to 10)\n const fetches = subUrls.slice(0, 10).map(u => fetchSitemapXml(u, timeoutMs));\n const results = await Promise.all(fetches);\n for (const text of results) {\n if (text) {\n extractLocsFromXml(text, cleanDomain, urls);\n }\n }\n }\n\n // Also extract <url><loc> from the main sitemap text (could be a regular sitemap)\n extractLocsFromXml(sitemapText, cleanDomain, urls);\n\n return Array.from(urls);\n}\n\nfunction extractLocsFromXml(xml: string, cleanDomain: string, urls: Set<string>): void {\n const locMatches = xml.match(/<url>[\\s\\S]*?<loc>([^<]+)<\\/loc>[\\s\\S]*?<\\/url>/gi) || [];\n for (const block of locMatches) {\n const locMatch = block.match(/<loc>([^<]+)<\\/loc>/i);\n if (!locMatch) continue;\n const url = locMatch[1].trim();\n\n try {\n const parsed = new URL(url);\n const urlDomain = parsed.hostname.replace(/^www\\./, '').toLowerCase();\n if (urlDomain !== cleanDomain) continue;\n if (RESOURCE_EXTENSIONS.test(parsed.pathname)) continue;\n urls.add(url);\n } catch {\n continue;\n }\n }\n}\n\n// ─── Internal link extraction ───────────────────────────────────────────────\n\n/**\n * Extract ALL internal links from HTML (not just nav).\n * Returns deduplicated full URLs for the same domain.\n */\nexport function extractInternalLinks(html: string, domain: string): string[] {\n const cleanDomain = domain.replace(/^www\\./, '').toLowerCase();\n const hrefMatches = html.match(/href=\"([^\"]*)\"/gi) || [];\n const urls = new Set<string>();\n\n for (const match of hrefMatches) {\n const href = match.match(/href=\"([^\"]*)\"/i)?.[1];\n if (!href || !href.trim()) continue;\n\n let fullUrl: string;\n\n if (href.startsWith('//')) {\n fullUrl = `https:${href}`;\n } else if (href.startsWith('/')) {\n // Skip fragment-only, query-only, and anchor links\n if (href === '/' || href.startsWith('/#')) continue;\n fullUrl = `https://${domain}${href}`;\n } else if (href.startsWith('http')) {\n fullUrl = href;\n } else if (href.startsWith('#') || href.startsWith('?') || href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('javascript:')) {\n continue;\n } else {\n // Relative path\n fullUrl = `https://${domain}/${href}`;\n }\n\n try {\n const parsed = new URL(fullUrl);\n const linkDomain = parsed.hostname.replace(/^www\\./, '').toLowerCase();\n if (linkDomain !== cleanDomain) continue;\n\n // Strip hash and normalize\n parsed.hash = '';\n const path = parsed.pathname;\n\n if (path === '/' || path === '') continue;\n if (RESOURCE_EXTENSIONS.test(path)) continue;\n if (SKIP_PATH_PATTERNS.test(path)) continue;\n\n // Normalize: strip trailing slash\n const normalized = parsed.origin + path.replace(/\\/+$/, '') + parsed.search;\n urls.add(normalized);\n } catch {\n continue;\n }\n }\n\n return Array.from(urls);\n}\n\n// ─── Category inference ─────────────────────────────────────────────────────\n\nconst CATEGORY_PATTERNS: Array<[RegExp, PageCategory]> = [\n [/\\/([^/]*-?)?(blog|articles?|posts?|news|insights|guides)\\b/i, 'blog'],\n [/\\/(about|about-us|company|who-we-are)\\b/i, 'about'],\n [/\\/(pricing|plans|packages)\\b/i, 'pricing'],\n [/\\/(services?|features?|solutions?|products?|what-we-do|offerings?)\\b/i, 'services'],\n [/\\/(contact|contact-us|get-in-touch)\\b/i, 'contact'],\n [/\\/(team|our-team|authors?|people|leadership|staff)\\b/i, 'team'],\n [/\\/(resources?|resource-center|library|downloads?)\\b/i, 'resources'],\n [/\\/(docs?|documentation|help|help-center|support|knowledge-base)\\b/i, 'docs'],\n [/\\/(case-stud\\w*|cases|customers?|success-stor\\w*|testimonials?)\\b/i, 'cases'],\n [/\\/(faq|frequently-asked|questions)\\b/i, 'faq'],\n];\n\n/**\n * Infer PageCategory from URL path patterns.\n */\nexport function inferCategory(url: string): PageCategory {\n try {\n const path = new URL(url).pathname;\n for (const [pattern, category] of CATEGORY_PATTERNS) {\n if (pattern.test(path)) return category;\n }\n } catch {\n // Fall through to default\n }\n return 'content';\n}\n\n// ─── Main crawler ───────────────────────────────────────────────────────────\n\n/**\n * BFS crawl of a site, discovering all internal pages up to maxPages.\n * Seeds from sitemap URLs + homepage internal links.\n * Skips URLs already in siteData.blogSample and homepage.\n */\nexport async function crawlFullSite(\n siteData: SiteData,\n options?: CrawlOptions,\n): Promise<CrawlResult> {\n const startTime = Date.now();\n const maxPages = options?.maxPages ?? 200;\n const timeoutMs = options?.timeoutMs ?? 10000;\n const concurrency = options?.concurrency ?? 5;\n const respectRobots = options?.respectRobots ?? true;\n\n const pages: FetchResult[] = [];\n const discoveredUrls = new Set<string>();\n const fetchedUrls = new Set<string>();\n const skippedUrls = new Set<string>();\n const visited = new Set<string>();\n\n // Parse robots.txt rules\n let robotsRules: RobotsRules = { disallow: [], allow: [] };\n if (respectRobots && siteData.robotsTxt?.text) {\n robotsRules = parseRobotsTxt(siteData.robotsTxt.text);\n }\n\n const baseUrl = `${siteData.protocol}://${siteData.domain}`;\n\n // Mark already-fetched URLs as visited\n visited.add(normalizeUrl(baseUrl));\n visited.add(normalizeUrl(baseUrl + '/'));\n if (siteData.blogSample) {\n for (const page of siteData.blogSample) {\n if (page.finalUrl) visited.add(normalizeUrl(page.finalUrl));\n }\n }\n\n // Seed the queue from sitemap\n const queue: string[] = [];\n if (siteData.sitemapXml?.text) {\n const sitemapUrls = await extractAllUrlsFromSitemap(\n siteData.sitemapXml.text,\n siteData.domain,\n timeoutMs,\n );\n for (const url of sitemapUrls) {\n const norm = normalizeUrl(url);\n if (!visited.has(norm)) {\n discoveredUrls.add(url);\n if (!queue.includes(url)) queue.push(url);\n }\n }\n }\n\n // Seed from homepage internal links\n if (siteData.homepage?.text) {\n const homeLinks = extractInternalLinks(siteData.homepage.text, siteData.domain);\n for (const url of homeLinks) {\n const norm = normalizeUrl(url);\n if (!visited.has(norm) && !discoveredUrls.has(url)) {\n discoveredUrls.add(url);\n if (!queue.includes(url)) queue.push(url);\n }\n }\n }\n\n // BFS loop\n while (queue.length > 0 && fetchedUrls.size < maxPages) {\n // Take a batch\n const batchSize = Math.min(concurrency, maxPages - fetchedUrls.size, queue.length);\n const batch: string[] = [];\n\n while (batch.length < batchSize && queue.length > 0) {\n const url = queue.shift()!;\n const norm = normalizeUrl(url);\n\n if (visited.has(norm)) continue;\n visited.add(norm);\n\n // Check robots.txt\n if (respectRobots) {\n try {\n const path = new URL(url).pathname;\n if (isDisallowedByRobots(path, robotsRules)) {\n skippedUrls.add(url);\n continue;\n }\n } catch {\n // Skip malformed URLs\n continue;\n }\n }\n\n batch.push(url);\n }\n\n if (batch.length === 0) continue;\n\n // Fetch batch in parallel\n const results = await Promise.all(batch.map(url => fetchPage(url, timeoutMs)));\n\n for (let i = 0; i < results.length; i++) {\n const result = results[i];\n const url = batch[i];\n fetchedUrls.add(url);\n\n if (!result) continue;\n\n result.category = inferCategory(url);\n pages.push(result);\n\n // Extract new internal links from fetched page\n const newLinks = extractInternalLinks(result.text, siteData.domain);\n for (const link of newLinks) {\n const norm = normalizeUrl(link);\n if (!visited.has(norm) && !discoveredUrls.has(link)) {\n discoveredUrls.add(link);\n queue.push(link);\n }\n }\n }\n }\n\n // Any remaining queued URLs count as discovered but skipped\n for (const url of queue) {\n if (!fetchedUrls.has(url)) {\n skippedUrls.add(url);\n }\n }\n\n return {\n pages,\n discoveredUrls: Array.from(discoveredUrls),\n fetchedUrls: Array.from(fetchedUrls),\n skippedUrls: Array.from(skippedUrls),\n elapsed: Math.round((Date.now() - startTime) / 100) / 10,\n };\n}\n\nfunction normalizeUrl(url: string): string {\n try {\n const parsed = new URL(url);\n // Normalize: lowercase host, strip trailing slash, strip hash\n return (parsed.origin + parsed.pathname.replace(/\\/+$/, '') + parsed.search).toLowerCase();\n } catch {\n return url.toLowerCase();\n }\n}\n"],"mappings":";;;AAgCA,IAAM,sBAAsB;AAE5B,IAAM,qBAAqB;AASpB,SAAS,eAAe,YAAiC;AAC9D,QAAM,QAAQ,WAAW,MAAM,IAAI;AACnC,QAAM,QAAqB,EAAE,UAAU,CAAC,GAAG,OAAO,CAAC,EAAE;AAGrD,MAAI,oBAAoB;AAExB,aAAW,WAAW,OAAO;AAC3B,UAAM,OAAO,QAAQ,KAAK;AAC1B,QAAI,CAAC,QAAQ,KAAK,WAAW,GAAG,EAAG;AAEnC,UAAM,UAAU,KAAK,MAAM,sBAAsB;AACjD,QAAI,SAAS;AACX,YAAM,QAAQ,QAAQ,CAAC,EAAE,KAAK,EAAE,YAAY;AAC5C,0BAAoB,UAAU,OAAO,UAAU;AAC/C;AAAA,IACF;AAEA,QAAI,CAAC,kBAAmB;AAExB,UAAM,gBAAgB,KAAK,MAAM,oBAAoB;AACrD,QAAI,eAAe;AACjB,YAAM,OAAO,cAAc,CAAC,EAAE,KAAK;AACnC,UAAI,KAAM,OAAM,SAAS,KAAK,IAAI;AAClC;AAAA,IACF;AAEA,UAAM,aAAa,KAAK,MAAM,iBAAiB;AAC/C,QAAI,YAAY;AACd,YAAM,OAAO,WAAW,CAAC,EAAE,KAAK;AAChC,UAAI,KAAM,OAAM,MAAM,KAAK,IAAI;AAAA,IACjC;AAAA,EACF;AAEA,SAAO;AACT;AAEO,SAAS,qBAAqB,SAAiB,OAA6B;AAEjF,MAAI,eAAe;AACnB,MAAI,kBAAkB;AAEtB,aAAW,WAAW,MAAM,OAAO;AACjC,QAAI,QAAQ,WAAW,OAAO,KAAK,QAAQ,SAAS,cAAc;AAChE,qBAAe,QAAQ;AAAA,IACzB;AAAA,EACF;AAEA,aAAW,WAAW,MAAM,UAAU;AACpC,QAAI,QAAQ,WAAW,OAAO,KAAK,QAAQ,SAAS,iBAAiB;AACnE,wBAAkB,QAAQ;AAAA,IAC5B;AAAA,EACF;AAGA,MAAI,iBAAiB,KAAK,oBAAoB,EAAG,QAAO;AACxD,SAAO,kBAAkB;AAC3B;AAIA,eAAe,UAAU,KAAa,YAAY,KAAoC;AACpF,MAAI;AACF,UAAM,MAAM,MAAM,MAAM,KAAK;AAAA,MAC3B,QAAQ,YAAY,QAAQ,SAAS;AAAA,MACrC,SAAS,EAAE,cAAc,yBAAyB;AAAA,MAClD,UAAU;AAAA,IACZ,CAAC;AACD,QAAI,IAAI,WAAW,IAAK,QAAO;AAC/B,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,QAAI,KAAK,SAAS,IAAK,QAAO;AAC9B,WAAO,EAAE,MAAM,KAAK,MAAM,GAAG,GAAO,GAAG,QAAQ,IAAI,QAAQ,UAAU,IAAI,IAAI;AAAA,EAC/E,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEA,eAAe,gBAAgB,KAAa,YAAY,KAA+B;AACrF,MAAI;AACF,UAAM,MAAM,MAAM,MAAM,KAAK;AAAA,MAC3B,QAAQ,YAAY,QAAQ,SAAS;AAAA,MACrC,SAAS,EAAE,cAAc,yBAAyB;AAAA,MAClD,UAAU;AAAA,IACZ,CAAC;AACD,QAAI,IAAI,WAAW,IAAK,QAAO;AAC/B,WAAO,MAAM,IAAI,KAAK;AAAA,EACxB,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAQA,eAAsB,0BACpB,aACA,QACA,YAAY,KACO;AACnB,QAAM,cAAc,OAAO,QAAQ,UAAU,EAAE,EAAE,YAAY;AAC7D,QAAM,OAAO,oBAAI,IAAY;AAG7B,QAAM,iBAAiB,YAAY,MAAM,2DAA2D,KAAK,CAAC;AAC1G,MAAI,eAAe,SAAS,GAAG;AAC7B,UAAM,UAAoB,CAAC;AAC3B,eAAW,SAAS,gBAAgB;AAClC,YAAM,WAAW,MAAM,MAAM,sBAAsB;AACnD,UAAI,SAAU,SAAQ,KAAK,SAAS,CAAC,EAAE,KAAK,CAAC;AAAA,IAC/C;AAGA,UAAM,UAAU,QAAQ,MAAM,GAAG,EAAE,EAAE,IAAI,OAAK,gBAAgB,GAAG,SAAS,CAAC;AAC3E,UAAM,UAAU,MAAM,QAAQ,IAAI,OAAO;AACzC,eAAW,QAAQ,SAAS;AAC1B,UAAI,MAAM;AACR,2BAAmB,MAAM,aAAa,IAAI;AAAA,MAC5C;AAAA,IACF;AAAA,EACF;AAGA,qBAAmB,aAAa,aAAa,IAAI;AAEjD,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,mBAAmB,KAAa,aAAqB,MAAyB;AACrF,QAAM,aAAa,IAAI,MAAM,mDAAmD,KAAK,CAAC;AACtF,aAAW,SAAS,YAAY;AAC9B,UAAM,WAAW,MAAM,MAAM,sBAAsB;AACnD,QAAI,CAAC,SAAU;AACf,UAAM,MAAM,SAAS,CAAC,EAAE,KAAK;AAE7B,QAAI;AACF,YAAM,SAAS,IAAI,IAAI,GAAG;AAC1B,YAAM,YAAY,OAAO,SAAS,QAAQ,UAAU,EAAE,EAAE,YAAY;AACpE,UAAI,cAAc,YAAa;AAC/B,UAAI,oBAAoB,KAAK,OAAO,QAAQ,EAAG;AAC/C,WAAK,IAAI,GAAG;AAAA,IACd,QAAQ;AACN;AAAA,IACF;AAAA,EACF;AACF;AAQO,SAAS,qBAAqB,MAAc,QAA0B;AAC3E,QAAM,cAAc,OAAO,QAAQ,UAAU,EAAE,EAAE,YAAY;AAC7D,QAAM,cAAc,KAAK,MAAM,kBAAkB,KAAK,CAAC;AACvD,QAAM,OAAO,oBAAI,IAAY;AAE7B,aAAW,SAAS,aAAa;AAC/B,UAAM,OAAO,MAAM,MAAM,iBAAiB,IAAI,CAAC;AAC/C,QAAI,CAAC,QAAQ,CAAC,KAAK,KAAK,EAAG;AAE3B,QAAI;AAEJ,QAAI,KAAK,WAAW,IAAI,GAAG;AACzB,gBAAU,SAAS,IAAI;AAAA,IACzB,WAAW,KAAK,WAAW,GAAG,GAAG;AAE/B,UAAI,SAAS,OAAO,KAAK,WAAW,IAAI,EAAG;AAC3C,gBAAU,WAAW,MAAM,GAAG,IAAI;AAAA,IACpC,WAAW,KAAK,WAAW,MAAM,GAAG;AAClC,gBAAU;AAAA,IACZ,WAAW,KAAK,WAAW,GAAG,KAAK,KAAK,WAAW,GAAG,KAAK,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,MAAM,KAAK,KAAK,WAAW,aAAa,GAAG;AAClJ;AAAA,IACF,OAAO;AAEL,gBAAU,WAAW,MAAM,IAAI,IAAI;AAAA,IACrC;AAEA,QAAI;AACF,YAAM,SAAS,IAAI,IAAI,OAAO;AAC9B,YAAM,aAAa,OAAO,SAAS,QAAQ,UAAU,EAAE,EAAE,YAAY;AACrE,UAAI,eAAe,YAAa;AAGhC,aAAO,OAAO;AACd,YAAM,OAAO,OAAO;AAEpB,UAAI,SAAS,OAAO,SAAS,GAAI;AACjC,UAAI,oBAAoB,KAAK,IAAI,EAAG;AACpC,UAAI,mBAAmB,KAAK,IAAI,EAAG;AAGnC,YAAM,aAAa,OAAO,SAAS,KAAK,QAAQ,QAAQ,EAAE,IAAI,OAAO;AACrE,WAAK,IAAI,UAAU;AAAA,IACrB,QAAQ;AACN;AAAA,IACF;AAAA,EACF;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;AAIA,IAAM,oBAAmD;AAAA,EACvD,CAAC,+DAA+D,MAAM;AAAA,EACtE,CAAC,4CAA4C,OAAO;AAAA,EACpD,CAAC,iCAAiC,SAAS;AAAA,EAC3C,CAAC,yEAAyE,UAAU;AAAA,EACpF,CAAC,0CAA0C,SAAS;AAAA,EACpD,CAAC,yDAAyD,MAAM;AAAA,EAChE,CAAC,wDAAwD,WAAW;AAAA,EACpE,CAAC,sEAAsE,MAAM;AAAA,EAC7E,CAAC,sEAAsE,OAAO;AAAA,EAC9E,CAAC,yCAAyC,KAAK;AACjD;AAKO,SAAS,cAAc,KAA2B;AACvD,MAAI;AACF,UAAM,OAAO,IAAI,IAAI,GAAG,EAAE;AAC1B,eAAW,CAAC,SAAS,QAAQ,KAAK,mBAAmB;AACnD,UAAI,QAAQ,KAAK,IAAI,EAAG,QAAO;AAAA,IACjC;AAAA,EACF,QAAQ;AAAA,EAER;AACA,SAAO;AACT;AASA,eAAsB,cACpB,UACA,SACsB;AACtB,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,WAAW,SAAS,YAAY;AACtC,QAAM,YAAY,SAAS,aAAa;AACxC,QAAM,cAAc,SAAS,eAAe;AAC5C,QAAM,gBAAgB,SAAS,iBAAiB;AAEhD,QAAM,QAAuB,CAAC;AAC9B,QAAM,iBAAiB,oBAAI,IAAY;AACvC,QAAM,cAAc,oBAAI,IAAY;AACpC,QAAM,cAAc,oBAAI,IAAY;AACpC,QAAM,UAAU,oBAAI,IAAY;AAGhC,MAAI,cAA2B,EAAE,UAAU,CAAC,GAAG,OAAO,CAAC,EAAE;AACzD,MAAI,iBAAiB,SAAS,WAAW,MAAM;AAC7C,kBAAc,eAAe,SAAS,UAAU,IAAI;AAAA,EACtD;AAEA,QAAM,UAAU,GAAG,SAAS,QAAQ,MAAM,SAAS,MAAM;AAGzD,UAAQ,IAAI,aAAa,OAAO,CAAC;AACjC,UAAQ,IAAI,aAAa,UAAU,GAAG,CAAC;AACvC,MAAI,SAAS,YAAY;AACvB,eAAW,QAAQ,SAAS,YAAY;AACtC,UAAI,KAAK,SAAU,SAAQ,IAAI,aAAa,KAAK,QAAQ,CAAC;AAAA,IAC5D;AAAA,EACF;AAGA,QAAM,QAAkB,CAAC;AACzB,MAAI,SAAS,YAAY,MAAM;AAC7B,UAAM,cAAc,MAAM;AAAA,MACxB,SAAS,WAAW;AAAA,MACpB,SAAS;AAAA,MACT;AAAA,IACF;AACA,eAAW,OAAO,aAAa;AAC7B,YAAM,OAAO,aAAa,GAAG;AAC7B,UAAI,CAAC,QAAQ,IAAI,IAAI,GAAG;AACtB,uBAAe,IAAI,GAAG;AACtB,YAAI,CAAC,MAAM,SAAS,GAAG,EAAG,OAAM,KAAK,GAAG;AAAA,MAC1C;AAAA,IACF;AAAA,EACF;AAGA,MAAI,SAAS,UAAU,MAAM;AAC3B,UAAM,YAAY,qBAAqB,SAAS,SAAS,MAAM,SAAS,MAAM;AAC9E,eAAW,OAAO,WAAW;AAC3B,YAAM,OAAO,aAAa,GAAG;AAC7B,UAAI,CAAC,QAAQ,IAAI,IAAI,KAAK,CAAC,eAAe,IAAI,GAAG,GAAG;AAClD,uBAAe,IAAI,GAAG;AACtB,YAAI,CAAC,MAAM,SAAS,GAAG,EAAG,OAAM,KAAK,GAAG;AAAA,MAC1C;AAAA,IACF;AAAA,EACF;AAGA,SAAO,MAAM,SAAS,KAAK,YAAY,OAAO,UAAU;AAEtD,UAAM,YAAY,KAAK,IAAI,aAAa,WAAW,YAAY,MAAM,MAAM,MAAM;AACjF,UAAM,QAAkB,CAAC;AAEzB,WAAO,MAAM,SAAS,aAAa,MAAM,SAAS,GAAG;AACnD,YAAM,MAAM,MAAM,MAAM;AACxB,YAAM,OAAO,aAAa,GAAG;AAE7B,UAAI,QAAQ,IAAI,IAAI,EAAG;AACvB,cAAQ,IAAI,IAAI;AAGhB,UAAI,eAAe;AACjB,YAAI;AACF,gBAAM,OAAO,IAAI,IAAI,GAAG,EAAE;AAC1B,cAAI,qBAAqB,MAAM,WAAW,GAAG;AAC3C,wBAAY,IAAI,GAAG;AACnB;AAAA,UACF;AAAA,QACF,QAAQ;AAEN;AAAA,QACF;AAAA,MACF;AAEA,YAAM,KAAK,GAAG;AAAA,IAChB;AAEA,QAAI,MAAM,WAAW,EAAG;AAGxB,UAAM,UAAU,MAAM,QAAQ,IAAI,MAAM,IAAI,SAAO,UAAU,KAAK,SAAS,CAAC,CAAC;AAE7E,aAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,YAAM,SAAS,QAAQ,CAAC;AACxB,YAAM,MAAM,MAAM,CAAC;AACnB,kBAAY,IAAI,GAAG;AAEnB,UAAI,CAAC,OAAQ;AAEb,aAAO,WAAW,cAAc,GAAG;AACnC,YAAM,KAAK,MAAM;AAGjB,YAAM,WAAW,qBAAqB,OAAO,MAAM,SAAS,MAAM;AAClE,iBAAW,QAAQ,UAAU;AAC3B,cAAM,OAAO,aAAa,IAAI;AAC9B,YAAI,CAAC,QAAQ,IAAI,IAAI,KAAK,CAAC,eAAe,IAAI,IAAI,GAAG;AACnD,yBAAe,IAAI,IAAI;AACvB,gBAAM,KAAK,IAAI;AAAA,QACjB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,aAAW,OAAO,OAAO;AACvB,QAAI,CAAC,YAAY,IAAI,GAAG,GAAG;AACzB,kBAAY,IAAI,GAAG;AAAA,IACrB;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,gBAAgB,MAAM,KAAK,cAAc;AAAA,IACzC,aAAa,MAAM,KAAK,WAAW;AAAA,IACnC,aAAa,MAAM,KAAK,WAAW;AAAA,IACnC,SAAS,KAAK,OAAO,KAAK,IAAI,IAAI,aAAa,GAAG,IAAI;AAAA,EACxD;AACF;AAEA,SAAS,aAAa,KAAqB;AACzC,MAAI;AACF,UAAM,SAAS,IAAI,IAAI,GAAG;AAE1B,YAAQ,OAAO,SAAS,OAAO,SAAS,QAAQ,QAAQ,EAAE,IAAI,OAAO,QAAQ,YAAY;AAAA,EAC3F,QAAQ;AACN,WAAO,IAAI,YAAY;AAAA,EACzB;AACF;","names":[]}
@@ -5,7 +5,7 @@ import {
5
5
  inferCategory,
6
6
  isDisallowedByRobots,
7
7
  parseRobotsTxt
8
- } from "./chunk-3IJISYWT.js";
8
+ } from "./chunk-PKJIKMLV.js";
9
9
  export {
10
10
  crawlFullSite,
11
11
  extractAllUrlsFromSitemap,
@@ -14,4 +14,4 @@ export {
14
14
  isDisallowedByRobots,
15
15
  parseRobotsTxt
16
16
  };
17
- //# sourceMappingURL=full-site-crawler-VFARFR2C.js.map
17
+ //# sourceMappingURL=full-site-crawler-UIOMKOZA.js.map
package/dist/index.cjs CHANGED
@@ -303,7 +303,7 @@ var init_full_site_crawler = __esm({
303
303
  RESOURCE_EXTENSIONS = /\.(js|css|png|jpg|jpeg|gif|svg|ico|pdf|xml|txt|woff|woff2|ttf|eot|mp4|mp3|webp|avif|zip|gz|tar|json)$/i;
304
304
  SKIP_PATH_PATTERNS = /^\/(api|wp-admin|wp-json|static|assets|_next|auth|login|signup|cart|checkout|admin|feed|xmlrpc)\b/i;
305
305
  CATEGORY_PATTERNS = [
306
- [/\/(blog|articles?|posts?|news|insights|guides)\b/i, "blog"],
306
+ [/\/([^/]*-?)?(blog|articles?|posts?|news|insights|guides)\b/i, "blog"],
307
307
  [/\/(about|about-us|company|who-we-are)\b/i, "about"],
308
308
  [/\/(pricing|plans|packages)\b/i, "pricing"],
309
309
  [/\/(services?|features?|solutions?|products?|what-we-do|offerings?)\b/i, "services"],
@@ -552,7 +552,7 @@ async function prefetchSiteData(domain) {
552
552
  sitemapForBlog = subSitemap.text;
553
553
  }
554
554
  }
555
- const blogUrls = extractBlogUrlsFromSitemap(sitemapForBlog, domain, 10);
555
+ const blogUrls = extractBlogUrlsFromSitemap(sitemapForBlog, domain, 50);
556
556
  if (blogUrls.length > 0) {
557
557
  const fetched = await Promise.all(blogUrls.map((url) => fetchText(url)));
558
558
  blogSample = fetched.filter(
@@ -909,15 +909,17 @@ function checkOriginalData(data) {
909
909
  findings.push({ severity: "critical", detail: "Could not fetch homepage" });
910
910
  return { criterion: "original_data", criterion_label: "Original Data & Expert Content", score: 0, status: "not_found", findings, fix_priority: "P2" };
911
911
  }
912
+ const allPages = [data.homepage, ...data.blogSample || []].filter(Boolean);
912
913
  const html = data.homepage.text;
913
- const text = html.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ");
914
+ const allText = allPages.map((p) => p.text.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ")).join(" ");
915
+ const text = data.homepage.text.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ");
914
916
  let score = 0;
915
917
  const statPatterns = /\d+%|\d+\s*(patients|clients|customers|cases|years|professionals|specialists|companies|users|businesses|domains|audits)/i;
916
- if (statPatterns.test(text)) {
918
+ if (statPatterns.test(allText)) {
917
919
  const researchContext = /\b(our\s+(?:study|analysis|research|data|survey|findings|report)|we\s+(?:surveyed|analyzed|studied|measured|tracked)|proprietary|methodology|original\s+research)\b/i;
918
- if (researchContext.test(text)) {
920
+ if (researchContext.test(allText)) {
919
921
  score += 3;
920
- findings.push({ severity: "info", detail: "Proprietary statistics with research context found on homepage" });
922
+ findings.push({ severity: "info", detail: "Proprietary statistics with research context found" });
921
923
  } else {
922
924
  score += 1;
923
925
  findings.push({ severity: "low", detail: 'Statistics found but without research context (e.g., "500+ clients")', fix: 'Add context about your methodology: "Our analysis of X found..." or "We surveyed Y..."' });
@@ -1432,20 +1434,24 @@ function checkFactDensity(data) {
1432
1434
  findings.push({ severity: "critical", detail: "Could not fetch homepage" });
1433
1435
  return { criterion: "fact_density", criterion_label: "Fact & Data Density", score: 0, status: "not_found", findings, fix_priority: "P2" };
1434
1436
  }
1435
- const text = data.homepage.text.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ");
1437
+ const allPages = [data.homepage, ...data.blogSample || []].filter(Boolean);
1438
+ const allText = allPages.map((p) => p.text.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ")).join(" ");
1439
+ const text = allText;
1440
+ const pageCount = allPages.length;
1436
1441
  let score = 0;
1437
1442
  const dataPoints = text.match(/\d+(?:\.\d+)?(?:\s*%|\s*\$|\s*USD|\s*EUR)/g) || [];
1438
1443
  const countPhrases = text.match(/\d+(?:,\d{3})*\+?\s+(?:users?|clients?|customers?|companies|businesses|patients?|members?|employees?|projects?|downloads?)/gi) || [];
1439
1444
  const totalDataPoints = dataPoints.length + countPhrases.length;
1440
- if (totalDataPoints >= 6) {
1445
+ const avgPerPage = pageCount > 0 ? totalDataPoints / pageCount : 0;
1446
+ if (avgPerPage >= 4) {
1441
1447
  score += 5;
1442
- findings.push({ severity: "info", detail: `${totalDataPoints} quantitative data points found on homepage` });
1443
- } else if (totalDataPoints >= 3) {
1448
+ findings.push({ severity: "info", detail: `${totalDataPoints} quantitative data points found across ${pageCount} pages (avg ${avgPerPage.toFixed(1)}/page)` });
1449
+ } else if (avgPerPage >= 2) {
1444
1450
  score += 3;
1445
- findings.push({ severity: "info", detail: `${totalDataPoints} quantitative data points found` });
1451
+ findings.push({ severity: "info", detail: `${totalDataPoints} quantitative data points found across ${pageCount} pages` });
1446
1452
  } else if (totalDataPoints >= 1) {
1447
1453
  score += 1;
1448
- findings.push({ severity: "low", detail: `Only ${totalDataPoints} quantitative data point(s) found`, fix: "Add more specific numbers, percentages, and metrics to strengthen credibility" });
1454
+ findings.push({ severity: "low", detail: `Only ${totalDataPoints} quantitative data point(s) found across ${pageCount} pages`, fix: "Add more specific numbers, percentages, and metrics to strengthen credibility" });
1449
1455
  } else {
1450
1456
  findings.push({ severity: "high", detail: "No quantitative data points found", fix: "Add specific statistics (percentages, counts, comparisons) that AI engines can cite" });
1451
1457
  }
@@ -1551,9 +1557,9 @@ function countRecentSitemapDates(sitemapText) {
1551
1557
  distinctRecentDays: recentDays.size
1552
1558
  };
1553
1559
  }
1554
- var BLOG_PATH_PATTERNS = /\/(?:blog|articles?|insights?|guides?|resources?|news|posts?|learn|help|how-?to|tutorials?|case-stud|whitepapers?)\b/i;
1560
+ var BLOG_PATH_PATTERNS = /\/(?:[^/]*-?)?(?:blog|articles?|insights?|guides?|resources?|news|posts?|learn|help|how-?to|tutorials?|case-stud|whitepapers?)\b/i;
1555
1561
  var EXCLUDE_PATH_PATTERNS = /\/(?:tag|category|author|page|feed|wp-content|wp-admin|wp-json|cart|checkout|login|search|api|static|assets|_next)\b/i;
1556
- function extractBlogUrlsFromSitemap(sitemapText, domain, limit = 5) {
1562
+ function extractBlogUrlsFromSitemap(sitemapText, domain, limit = 50) {
1557
1563
  const urlBlocks = sitemapText.match(/<url>([\s\S]*?)<\/url>/gi) || [];
1558
1564
  const candidates = [];
1559
1565
  const cleanDomain = domain.replace(/^www\./, "").toLowerCase();
@@ -1849,7 +1855,7 @@ function jaccardSimilarity(a, b) {
1849
1855
  const union = a.size + b.size - intersection;
1850
1856
  return union === 0 ? 0 : intersection / union;
1851
1857
  }
1852
- function checkContentCannibalization(data) {
1858
+ function checkContentCannibalization(data, topicCoherenceScore) {
1853
1859
  const findings = [];
1854
1860
  if (!data.homepage) {
1855
1861
  findings.push({ severity: "critical", detail: "No homepage available for cannibalization analysis" });
@@ -1859,7 +1865,7 @@ function checkContentCannibalization(data) {
1859
1865
  { html: data.homepage.text, url: data.homepage.finalUrl || `https://${data.domain}/` }
1860
1866
  ];
1861
1867
  if (data.blogSample) {
1862
- for (const page of data.blogSample.slice(0, 5)) {
1868
+ for (const page of data.blogSample) {
1863
1869
  pages.push({ html: page.text, url: page.finalUrl || "" });
1864
1870
  }
1865
1871
  }
@@ -1869,10 +1875,29 @@ function checkContentCannibalization(data) {
1869
1875
  }
1870
1876
  const pageTitles = pages.map((p) => ({ title: extractPageTitle(p.html), url: p.url }));
1871
1877
  const wordSets = pageTitles.map((p) => titleToWordSet(p.title));
1878
+ const termPageCount = /* @__PURE__ */ new Map();
1879
+ for (const ws of wordSets) {
1880
+ for (const w of ws) {
1881
+ termPageCount.set(w, (termPageCount.get(w) || 0) + 1);
1882
+ }
1883
+ }
1884
+ const commonTermThreshold = Math.max(3, pages.length * 0.4);
1885
+ const siteThemeTerms = /* @__PURE__ */ new Set();
1886
+ for (const [term, count] of termPageCount) {
1887
+ if (count >= commonTermThreshold) siteThemeTerms.add(term);
1888
+ }
1889
+ const filteredSets = wordSets.map((ws) => {
1890
+ const filtered = /* @__PURE__ */ new Set();
1891
+ for (const w of ws) {
1892
+ if (!siteThemeTerms.has(w)) filtered.add(w);
1893
+ }
1894
+ return filtered;
1895
+ });
1872
1896
  const cannibalPairs = [];
1873
1897
  for (let i = 0; i < pages.length; i++) {
1874
1898
  for (let j = i + 1; j < pages.length; j++) {
1875
- const sim = jaccardSimilarity(wordSets[i], wordSets[j]);
1899
+ if (filteredSets[i].size === 0 && filteredSets[j].size === 0) continue;
1900
+ const sim = jaccardSimilarity(filteredSets[i], filteredSets[j]);
1876
1901
  if (sim > 0.6) {
1877
1902
  cannibalPairs.push({
1878
1903
  urlA: pageTitles[i].url.slice(0, 60),
@@ -1882,23 +1907,39 @@ function checkContentCannibalization(data) {
1882
1907
  }
1883
1908
  }
1884
1909
  }
1910
+ const cannibalUrls = /* @__PURE__ */ new Set();
1911
+ for (const pair of cannibalPairs) {
1912
+ cannibalUrls.add(pair.urlA);
1913
+ cannibalUrls.add(pair.urlB);
1914
+ }
1915
+ const cannibalRatio = pages.length > 0 ? cannibalUrls.size / pages.length : 0;
1885
1916
  let score;
1886
1917
  if (cannibalPairs.length === 0) {
1887
1918
  score = 10;
1888
1919
  findings.push({ severity: "info", detail: `${pages.length} pages analyzed - no content cannibalization detected` });
1889
- } else if (cannibalPairs.length === 1) {
1890
- score = 8;
1891
- findings.push({ severity: "low", detail: `1 pair of pages with overlapping topics (${cannibalPairs[0].similarity}% similarity)`, fix: "Differentiate titles and H1 headings to reduce topic overlap" });
1892
- } else if (cannibalPairs.length === 2) {
1920
+ } else if (cannibalRatio <= 0.05) {
1921
+ score = 9;
1922
+ findings.push({ severity: "info", detail: `${cannibalPairs.length} pair(s) of pages with minor topic overlap (${cannibalUrls.size}/${pages.length} pages affected)` });
1923
+ } else if (cannibalRatio <= 0.1) {
1924
+ score = 7;
1925
+ findings.push({ severity: "low", detail: `${cannibalUrls.size} pages (${Math.round(cannibalRatio * 100)}%) have overlapping topics`, fix: "Differentiate titles and H1 headings to reduce topic overlap" });
1926
+ } else if (cannibalRatio <= 0.2) {
1893
1927
  score = 5;
1894
- findings.push({ severity: "medium", detail: `${cannibalPairs.length} pairs of pages with overlapping topics`, fix: "Consolidate overlapping pages or differentiate their titles and content focus" });
1928
+ findings.push({ severity: "medium", detail: `${cannibalUrls.size} pages (${Math.round(cannibalRatio * 100)}%) competing for overlapping topics`, fix: "Consolidate overlapping pages or differentiate their titles and content focus" });
1929
+ } else if (cannibalRatio <= 0.4) {
1930
+ score = 3;
1931
+ findings.push({ severity: "medium", detail: `${cannibalUrls.size} pages (${Math.round(cannibalRatio * 100)}%) have significant content overlap`, fix: "Many pages compete for the same topics - consolidate or clearly differentiate them" });
1895
1932
  } else {
1896
1933
  score = 0;
1897
- findings.push({ severity: "high", detail: `${cannibalPairs.length} pairs of pages competing for the same topics`, fix: "Significant content overlap detected - consolidate or clearly differentiate competing pages" });
1934
+ findings.push({ severity: "high", detail: `${cannibalUrls.size} pages (${Math.round(cannibalRatio * 100)}%) competing for the same topics`, fix: "Severe content cannibalization - consolidate overlapping pages or create clear topic differentiation" });
1898
1935
  }
1899
1936
  for (const pair of cannibalPairs.slice(0, 3)) {
1900
1937
  findings.push({ severity: "low", detail: `Overlap (${pair.similarity}%): ${pair.urlA} vs ${pair.urlB}` });
1901
1938
  }
1939
+ if (topicCoherenceScore !== void 0 && topicCoherenceScore <= 4 && score >= 8) {
1940
+ score = 6;
1941
+ findings.push({ severity: "low", detail: "Low topic overlap but content lacks coherent focus - not a strong signal for AI authority", fix: "Focus content on fewer core topics to build topical authority that AI engines can identify" });
1942
+ }
1902
1943
  return { criterion: "content_cannibalization", criterion_label: "Content Cannibalization", score, status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P1" };
1903
1944
  }
1904
1945
  function checkVisibleDateSignal(data) {
@@ -2124,7 +2165,233 @@ function extractRawDataSummary(data) {
2124
2165
  crawl_skipped: data.crawlStats?.skipped ?? 0
2125
2166
  };
2126
2167
  }
2168
+ function getPageTopicText(html) {
2169
+ const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
2170
+ const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
2171
+ return [
2172
+ titleMatch?.[1] || "",
2173
+ h1Match?.[1]?.replace(/<[^>]*>/g, "") || ""
2174
+ ].join(" ").toLowerCase().trim();
2175
+ }
2176
+ function extractBigrams(text) {
2177
+ const words = text.split(/[\s,.!?;:()\[\]{}"'\/&]+/).filter((w) => w.length > 2 && !STOP_WORDS.has(w) && !/^\d+$/.test(w));
2178
+ const bigrams = [];
2179
+ for (let i = 0; i < words.length - 1; i++) {
2180
+ bigrams.push(words[i] + " " + words[i + 1]);
2181
+ }
2182
+ return bigrams;
2183
+ }
2184
+ function checkTopicCoherence(data) {
2185
+ const findings = [];
2186
+ if (!data.homepage) {
2187
+ findings.push({ severity: "critical", detail: "Could not fetch homepage" });
2188
+ return { criterion: "topic_coherence", criterion_label: "Topic Coherence", score: 0, status: "not_found", findings, fix_priority: "P0" };
2189
+ }
2190
+ if (!data.blogSample || data.blogSample.length < 3) {
2191
+ findings.push({ severity: "info", detail: `Only ${data.blogSample?.length || 0} blog pages found - insufficient for topic coherence analysis` });
2192
+ return { criterion: "topic_coherence", criterion_label: "Topic Coherence", score: 5, status: "partial", findings, fix_priority: "P2" };
2193
+ }
2194
+ const blogPages = data.blogSample;
2195
+ const domainBase = data.domain.replace(/^www\./, "").replace(/\.(com|org|net|io|co|ai)$/i, "").toLowerCase();
2196
+ const brandWords = /* @__PURE__ */ new Set();
2197
+ brandWords.add(domainBase);
2198
+ for (const part of domainBase.split(/[-_]/)) {
2199
+ if (part.length > 2) brandWords.add(part);
2200
+ }
2201
+ const rawTermFreq = /* @__PURE__ */ new Map();
2202
+ const pageTitleTexts = [];
2203
+ for (const page of blogPages) {
2204
+ const topicText = getPageTopicText(page.text);
2205
+ pageTitleTexts.push(topicText);
2206
+ const words = topicText.split(/[\s,.!?;:()\[\]{}"'\/&]+/).filter((w) => w.length > 2 && !STOP_WORDS.has(w) && !/^\d+$/.test(w));
2207
+ const uniqueWords = new Set(words);
2208
+ for (const w of uniqueWords) {
2209
+ rawTermFreq.set(w, (rawTermFreq.get(w) || 0) + 1);
2210
+ }
2211
+ }
2212
+ for (const [term, count] of rawTermFreq) {
2213
+ if (count / blogPages.length >= 0.8 && domainBase.includes(term)) {
2214
+ brandWords.add(term);
2215
+ }
2216
+ }
2217
+ const termFreq = /* @__PURE__ */ new Map();
2218
+ for (const page of blogPages) {
2219
+ const topicText = getPageTopicText(page.text);
2220
+ const words = topicText.split(/[\s,.!?;:()\[\]{}"'\/&]+/).filter((w) => w.length > 2 && !STOP_WORDS.has(w) && !/^\d+$/.test(w) && !brandWords.has(w));
2221
+ const uniqueWords = new Set(words);
2222
+ for (const w of uniqueWords) {
2223
+ termFreq.set(w, (termFreq.get(w) || 0) + 1);
2224
+ }
2225
+ }
2226
+ const sortedTerms = [...termFreq.entries()].sort((a, b) => b[1] - a[1]);
2227
+ const topTerm = sortedTerms[0];
2228
+ const bigramFreq = /* @__PURE__ */ new Map();
2229
+ const pageBigrams = [];
2230
+ for (const topicText of pageTitleTexts) {
2231
+ const bigrams = extractBigrams(topicText).filter((bg) => !bg.split(" ").some((w) => brandWords.has(w)));
2232
+ pageBigrams.push(bigrams);
2233
+ const uniqueBigrams = new Set(bigrams);
2234
+ for (const bg of uniqueBigrams) {
2235
+ bigramFreq.set(bg, (bigramFreq.get(bg) || 0) + 1);
2236
+ }
2237
+ }
2238
+ const sortedBigrams = [...bigramFreq.entries()].sort((a, b) => b[1] - a[1]);
2239
+ const topBigram = sortedBigrams[0];
2240
+ const significantBigrams = sortedBigrams.filter(([, count]) => count >= 2);
2241
+ const clusterRoots = [];
2242
+ const assigned = /* @__PURE__ */ new Set();
2243
+ for (const [bg] of significantBigrams) {
2244
+ if (assigned.has(bg)) continue;
2245
+ clusterRoots.push(bg);
2246
+ assigned.add(bg);
2247
+ const [w1, w2] = bg.split(" ");
2248
+ for (const [otherBg] of significantBigrams) {
2249
+ if (assigned.has(otherBg)) continue;
2250
+ if (otherBg.includes(w1) || otherBg.includes(w2)) {
2251
+ assigned.add(otherBg);
2252
+ }
2253
+ }
2254
+ }
2255
+ const topicClusterCount = clusterRoots.length;
2256
+ const dominantTerm = topTerm?.[0] || "";
2257
+ const dominantTermCount = topTerm?.[1] || 0;
2258
+ const focusRatio = blogPages.length > 0 ? dominantTermCount / blogPages.length : 0;
2259
+ const dominantBigram = topBigram?.[0] || "";
2260
+ const dominantBigramCount = topBigram?.[1] || 0;
2261
+ const bigramFocusRatio = blogPages.length > 0 ? dominantBigramCount / blogPages.length : 0;
2262
+ let score = 0;
2263
+ const bestFocusRatio = Math.max(focusRatio, bigramFocusRatio);
2264
+ if (bestFocusRatio >= 0.8) {
2265
+ score += 7;
2266
+ } else if (bestFocusRatio >= 0.6) {
2267
+ score += 6;
2268
+ } else if (bestFocusRatio >= 0.45) {
2269
+ score += 5;
2270
+ } else if (bestFocusRatio >= 0.3) {
2271
+ score += 3;
2272
+ } else if (bestFocusRatio >= 0.15) {
2273
+ score += 2;
2274
+ } else {
2275
+ score += 1;
2276
+ }
2277
+ const clusterPenaltyReduced = focusRatio >= 0.7;
2278
+ if (topicClusterCount <= 3) {
2279
+ score += 3;
2280
+ findings.push({ severity: "info", detail: `${topicClusterCount} topic cluster(s) - tightly focused content` });
2281
+ } else if (topicClusterCount <= 6) {
2282
+ score += clusterPenaltyReduced ? 2 : 1;
2283
+ findings.push({ severity: "info", detail: `${topicClusterCount} topic clusters${clusterPenaltyReduced ? " within a focused niche" : " - moderately focused"}` });
2284
+ } else if (topicClusterCount <= 10) {
2285
+ score += clusterPenaltyReduced ? 1 : 0;
2286
+ if (!clusterPenaltyReduced) {
2287
+ findings.push({ severity: "low", detail: `${topicClusterCount} topic clusters - scattered content`, fix: "Reduce the number of distinct topics. Focus blog content on 2-3 core expertise areas." });
2288
+ } else {
2289
+ findings.push({ severity: "info", detail: `${topicClusterCount} topic clusters but strong core topic focus (${Math.round(focusRatio * 100)}%)` });
2290
+ }
2291
+ } else {
2292
+ score += clusterPenaltyReduced ? 0 : -2;
2293
+ if (!clusterPenaltyReduced) {
2294
+ findings.push({ severity: "medium", detail: `${topicClusterCount} topic clusters - highly scattered content`, fix: "Content covers too many unrelated topics. AI engines cannot identify your expertise. Focus on your core niche." });
2295
+ } else {
2296
+ findings.push({ severity: "low", detail: `${topicClusterCount} topic clusters despite strong core topic focus`, fix: "Consider narrowing subtopics within your niche for even stronger AI visibility." });
2297
+ }
2298
+ }
2299
+ score = Math.max(0, Math.min(10, score));
2300
+ if (dominantTerm) {
2301
+ const focusPct = Math.round(focusRatio * 100);
2302
+ findings.push({ severity: "info", detail: `Dominant topic term: "${dominantTerm}" (${focusPct}% of ${blogPages.length} pages)` });
2303
+ }
2304
+ if (dominantBigram && dominantBigramCount >= 2) {
2305
+ findings.push({ severity: "info", detail: `Dominant topic phrase: "${dominantBigram}" (${dominantBigramCount}/${blogPages.length} pages)` });
2306
+ }
2307
+ const offTopicExamples = [];
2308
+ for (let i = 0; i < pageTitleTexts.length && offTopicExamples.length < 3; i++) {
2309
+ if (dominantTerm && !pageTitleTexts[i].includes(dominantTerm)) {
2310
+ const title = blogPages[i].text.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim();
2311
+ if (title && title.length > 3) offTopicExamples.push(title.slice(0, 60));
2312
+ }
2313
+ }
2314
+ if (offTopicExamples.length > 0 && score < 8) {
2315
+ findings.push({ severity: "low", detail: `Off-topic examples: ${offTopicExamples.join("; ")}` });
2316
+ }
2317
+ return { criterion: "topic_coherence", criterion_label: "Topic Coherence", score, status: score >= 7 ? "pass" : score >= 4 ? "partial" : "fail", findings, fix_priority: score >= 7 ? "P3" : "P0" };
2318
+ }
2319
+ function countWords(html) {
2320
+ const text = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim();
2321
+ return text.split(/\s+/).filter((w) => w.length > 0).length;
2322
+ }
2323
+ function countHeadings(html) {
2324
+ const headings = html.match(/<h[2-6][^>]*>/gi) || [];
2325
+ return headings.length;
2326
+ }
2327
+ function checkContentDepth(data, topicCoherenceScore) {
2328
+ const findings = [];
2329
+ if (!data.blogSample || data.blogSample.length < 2) {
2330
+ findings.push({ severity: "info", detail: `Only ${data.blogSample?.length || 0} blog pages found - insufficient for depth analysis` });
2331
+ return { criterion: "content_depth", criterion_label: "Content Depth", score: 3, status: "partial", findings, fix_priority: "P2" };
2332
+ }
2333
+ const blogPages = data.blogSample;
2334
+ const wordCounts = blogPages.map((p) => countWords(p.text));
2335
+ const headingCounts = blogPages.map((p) => countHeadings(p.text));
2336
+ const avgWords = wordCounts.reduce((a, b) => a + b, 0) / wordCounts.length;
2337
+ const avgHeadings = headingCounts.reduce((a, b) => a + b, 0) / headingCounts.length;
2338
+ const deepPages = wordCounts.filter((w) => w >= 1e3).length;
2339
+ const thinPages = wordCounts.filter((w) => w < 300).length;
2340
+ const deepRatio = deepPages / blogPages.length;
2341
+ const thinRatio = thinPages / blogPages.length;
2342
+ let score = 0;
2343
+ if (avgWords >= 2e3) {
2344
+ score += 5;
2345
+ findings.push({ severity: "info", detail: `Average ${Math.round(avgWords)} words per page across ${blogPages.length} pages - excellent depth` });
2346
+ } else if (avgWords >= 1200) {
2347
+ score += 4;
2348
+ findings.push({ severity: "info", detail: `Average ${Math.round(avgWords)} words per page across ${blogPages.length} pages - good depth` });
2349
+ } else if (avgWords >= 800) {
2350
+ score += 3;
2351
+ findings.push({ severity: "info", detail: `Average ${Math.round(avgWords)} words per page - moderate depth` });
2352
+ } else if (avgWords >= 400) {
2353
+ score += 2;
2354
+ findings.push({ severity: "low", detail: `Average ${Math.round(avgWords)} words per page - shallow content`, fix: "Expand articles with more detail, examples, and expert analysis to build AI citation authority" });
2355
+ } else {
2356
+ score += 1;
2357
+ findings.push({ severity: "medium", detail: `Average ${Math.round(avgWords)} words per page - very thin content`, fix: "Content is too thin for AI engines to cite. Aim for 1000+ words per article with structured sections." });
2358
+ }
2359
+ if (avgHeadings >= 8) {
2360
+ score += 3;
2361
+ findings.push({ severity: "info", detail: `Average ${avgHeadings.toFixed(1)} subheadings per page - well-structured` });
2362
+ } else if (avgHeadings >= 5) {
2363
+ score += 2;
2364
+ findings.push({ severity: "info", detail: `Average ${avgHeadings.toFixed(1)} subheadings per page - decent structure` });
2365
+ } else if (avgHeadings >= 2) {
2366
+ score += 1;
2367
+ findings.push({ severity: "low", detail: `Average ${avgHeadings.toFixed(1)} subheadings per page`, fix: "Add more H2/H3 headings to break content into extractable sections" });
2368
+ } else {
2369
+ findings.push({ severity: "medium", detail: `Average ${avgHeadings.toFixed(1)} subheadings per page - minimal structure`, fix: "Add question-format H2/H3 headings so AI engines can extract specific answers" });
2370
+ }
2371
+ if (deepRatio >= 0.5) {
2372
+ score += 2;
2373
+ findings.push({ severity: "info", detail: `${deepPages}/${blogPages.length} pages (${Math.round(deepRatio * 100)}%) have 1000+ words` });
2374
+ } else if (deepRatio >= 0.25) {
2375
+ score += 1;
2376
+ findings.push({ severity: "info", detail: `${deepPages}/${blogPages.length} pages have 1000+ words` });
2377
+ }
2378
+ if (thinRatio >= 0.5) {
2379
+ score = Math.max(0, score - 2);
2380
+ findings.push({ severity: "medium", detail: `${thinPages}/${blogPages.length} pages (${Math.round(thinRatio * 100)}%) have under 300 words - high thin content ratio`, fix: "Remove or expand thin pages. Thin content dilutes site quality for AI engines." });
2381
+ } else if (thinRatio >= 0.25) {
2382
+ score = Math.max(0, score - 1);
2383
+ findings.push({ severity: "low", detail: `${thinPages}/${blogPages.length} pages have under 300 words` });
2384
+ }
2385
+ let finalScore = Math.min(10, score);
2386
+ if (topicCoherenceScore !== void 0 && topicCoherenceScore <= 4 && finalScore >= 8) {
2387
+ finalScore = 7;
2388
+ findings.push({ severity: "low", detail: "Deep content but low topic coherence - depth on scattered topics has reduced AI citation value", fix: "Focus content depth on your core expertise area for maximum AI visibility" });
2389
+ }
2390
+ return { criterion: "content_depth", criterion_label: "Content Depth", score: finalScore, status: finalScore >= 7 ? "pass" : finalScore >= 4 ? "partial" : "fail", findings, fix_priority: finalScore >= 7 ? "P3" : "P1" };
2391
+ }
2127
2392
  function auditSiteFromData(data) {
2393
+ const topicCoherence = checkTopicCoherence(data);
2394
+ const cannibalization = checkContentCannibalization(data, topicCoherence.score);
2128
2395
  return [
2129
2396
  checkLlmsTxt(data),
2130
2397
  checkSchemaMarkup(data),
@@ -2150,47 +2417,55 @@ function auditSiteFromData(data) {
2150
2417
  checkSchemaCoverage(data),
2151
2418
  checkSpeakableSchema(data),
2152
2419
  checkQueryAnswerAlignment(data),
2153
- checkContentCannibalization(data),
2154
- checkVisibleDateSignal(data)
2420
+ cannibalization,
2421
+ checkVisibleDateSignal(data),
2422
+ topicCoherence,
2423
+ checkContentDepth(data, topicCoherence.score)
2155
2424
  ];
2156
2425
  }
2157
2426
 
2158
2427
  // src/scoring.ts
2159
2428
  var WEIGHTS = {
2160
- // Original 10
2161
- llms_txt: 0.1,
2162
- schema_markup: 0.15,
2163
- qa_content_format: 0.15,
2164
- clean_html: 0.1,
2165
- entity_consistency: 0.1,
2166
- robots_txt: 0.05,
2167
- faq_section: 0.1,
2168
- original_data: 0.1,
2169
- internal_linking: 0.1,
2170
- semantic_html: 0.05,
2171
- // New 12
2172
- content_freshness: 0.07,
2173
- sitemap_completeness: 0.05,
2174
- rss_feed: 0.03,
2175
- table_list_extractability: 0.07,
2176
- definition_patterns: 0.04,
2429
+ // ─── Core Content (high weight - these determine real AI citation quality) ──
2430
+ qa_content_format: 0.12,
2431
+ original_data: 0.12,
2432
+ topic_coherence: 0.14,
2433
+ // NEW v2.0: biggest predictor of AI citation quality
2434
+ fact_density: 0.08,
2177
2435
  direct_answer_density: 0.07,
2178
- content_licensing: 0.04,
2436
+ content_depth: 0.06,
2437
+ // NEW v2.0: substantive content vs thin pages
2438
+ // ─── Structure & Discovery (medium weight - technical readiness) ────────────
2439
+ schema_markup: 0.08,
2440
+ llms_txt: 0.08,
2441
+ clean_html: 0.08,
2442
+ entity_consistency: 0.08,
2443
+ faq_section: 0.08,
2444
+ internal_linking: 0.08,
2445
+ // ─── Content Signals (moderate weight) ──────────────────────────────────────
2446
+ content_freshness: 0.06,
2447
+ table_list_extractability: 0.05,
2448
+ query_answer_alignment: 0.06,
2449
+ definition_patterns: 0.04,
2179
2450
  author_schema_depth: 0.04,
2180
- fact_density: 0.05,
2181
- canonical_url: 0.04,
2182
- content_velocity: 0.03,
2183
- schema_coverage: 0.03,
2184
- speakable_schema: 0.03,
2185
- query_answer_alignment: 0.08,
2186
2451
  content_cannibalization: 0.05,
2187
- visible_date_signal: 0.04
2452
+ visible_date_signal: 0.04,
2453
+ semantic_html: 0.04,
2454
+ // ─── Plumbing (low weight - nice to have but not what drives citations) ─────
2455
+ robots_txt: 0.03,
2456
+ sitemap_completeness: 0.03,
2457
+ content_velocity: 0.03,
2458
+ rss_feed: 0.02,
2459
+ content_licensing: 0.03,
2460
+ canonical_url: 0.02,
2461
+ schema_coverage: 0.02,
2462
+ speakable_schema: 0.02
2188
2463
  };
2189
2464
  function calculateOverallScore(criteria) {
2190
2465
  let totalWeight = 0;
2191
2466
  let weightedSum = 0;
2192
2467
  for (const c of criteria) {
2193
- const weight = WEIGHTS[c.criterion] ?? 0.1;
2468
+ const weight = WEIGHTS[c.criterion] ?? 0.05;
2194
2469
  weightedSum += c.score / 10 * weight * 100;
2195
2470
  totalWeight += weight;
2196
2471
  }
@@ -2326,7 +2601,9 @@ var CRITERION_LABELS = {
2326
2601
  "Speakable Schema": "Speakable Schema",
2327
2602
  "Query-Answer Alignment": "Query-Answer Alignment",
2328
2603
  "Content Cannibalization": "Content Cannibalization",
2329
- "Visible Date Signal": "Visible Date Signal"
2604
+ "Visible Date Signal": "Visible Date Signal",
2605
+ "Topic Coherence": "Topic Coherence",
2606
+ "Content Depth": "Content Depth"
2330
2607
  };
2331
2608
  function scoreToStatus(score) {
2332
2609
  if (score === 0) return "MISSING";
@@ -3235,7 +3512,7 @@ function extractTitle(html) {
3235
3512
  function getTextContent2(html) {
3236
3513
  return html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim();
3237
3514
  }
3238
- function countWords(text) {
3515
+ function countWords2(text) {
3239
3516
  if (!text) return 0;
3240
3517
  return text.split(/\s+/).filter((w) => w.length > 0).length;
3241
3518
  }
@@ -3386,7 +3663,7 @@ function checkHasQuestionHeadings(html) {
3386
3663
  function analyzePage(html, url, category) {
3387
3664
  const title = extractTitle(html);
3388
3665
  const textContent = getTextContent2(html);
3389
- const wordCount = countWords(textContent);
3666
+ const wordCount = countWords2(textContent);
3390
3667
  const issues = [];
3391
3668
  const strengths = [];
3392
3669
  const issueChecks = [
@@ -3539,7 +3816,7 @@ function extractTitle2(html) {
3539
3816
  function getTextContent3(html) {
3540
3817
  return html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim();
3541
3818
  }
3542
- function countWords2(text) {
3819
+ function countWords3(text) {
3543
3820
  if (!text) return 0;
3544
3821
  return text.split(/\s+/).filter((w) => w.length > 0).length;
3545
3822
  }
@@ -3687,7 +3964,7 @@ function buildLinkGraph(pages, domain, homepageUrl) {
3687
3964
  if (nodes.has(norm)) continue;
3688
3965
  const title = extractTitle2(page.text);
3689
3966
  const text = getTextContent3(page.text);
3690
- const wordCount = countWords2(text);
3967
+ const wordCount = countWords3(text);
3691
3968
  nodes.set(norm, {
3692
3969
  url: norm,
3693
3970
  title,