aeorank 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3,6 +3,9 @@ var __defProp = Object.defineProperty;
3
3
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
4
  var __getOwnPropNames = Object.getOwnPropertyNames;
5
5
  var __hasOwnProp = Object.prototype.hasOwnProperty;
6
// Lazy, run-once initializer wrapper (appears to be a bundler-emitted helper).
// `fn` is an object whose first own property name maps to the init function.
// The returned `__init` calls that function only while `fn` is still truthy,
// passes `fn = 0` as the argument (which also clears `fn` so later calls skip
// the work), caches the call's result in `res`, and always returns `res`.
+ var __esm = (fn, res) => function __init() {
7
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
8
+ };
6
9
  var __export = (target, all) => {
7
10
  for (var name in all)
8
11
  __defProp(target, name, { get: all[name], enumerable: true });
@@ -17,6 +20,303 @@ var __copyProps = (to, from, except, desc) => {
17
20
  };
18
21
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
22
 
23
+ // src/full-site-crawler.ts
24
// Export namespace object for the full-site crawler module. Each entry is a
// thunk returning the same-named function; `__export` registers every key on
// the target object as an enumerable getter.
+ var full_site_crawler_exports = {};
25
+ __export(full_site_crawler_exports, {
26
+ crawlFullSite: () => crawlFullSite,
27
+ extractAllUrlsFromSitemap: () => extractAllUrlsFromSitemap,
28
+ extractInternalLinks: () => extractInternalLinks,
29
+ inferCategory: () => inferCategory,
30
+ isDisallowedByRobots: () => isDisallowedByRobots,
31
+ parseRobotsTxt: () => parseRobotsTxt
32
+ });
33
/**
 * Parse robots.txt text into the allow/disallow path rules that apply to this crawler.
 *
 * Only groups addressed to `*` or `aeo-visibility-bot` (case-insensitive) are read;
 * the rules of every matching group are merged into a single result.
 *
 * @param {string} robotsText - Raw robots.txt contents.
 * @returns {{disallow: string[], allow: string[]}} Non-empty rule paths, in file order.
 */
function parseRobotsTxt(robotsText) {
  const rules = { disallow: [], allow: [] };
  let inRelevantSection = false;
  // True while we are inside a run of consecutive User-agent lines, i.e. the
  // header of a single group that names several agents.
  let previousWasUserAgent = false;

  for (const rawLine of robotsText.split("\n")) {
    // A "#" starts a comment anywhere on the line, not only at column 0.
    const hashIndex = rawLine.indexOf("#");
    const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
    if (!line) continue;

    const uaMatch = line.match(/^user-agent:\s*(.+)/i);
    if (uaMatch) {
      const agent = uaMatch[1].trim().toLowerCase();
      const isRelevant = agent === "*" || agent === "aeo-visibility-bot";
      // Stacked User-agent lines share the rules that follow them, so the group
      // is relevant if ANY of its agents is; a User-agent line that follows
      // rules starts a fresh group instead.
      inRelevantSection = previousWasUserAgent ? inRelevantSection || isRelevant : isRelevant;
      previousWasUserAgent = true;
      continue;
    }
    previousWasUserAgent = false;
    if (!inRelevantSection) continue;

    const disallowMatch = line.match(/^disallow:\s*(.*)/i);
    if (disallowMatch) {
      const path = disallowMatch[1].trim();
      // An empty Disallow means "nothing is disallowed", so it adds no rule.
      if (path) rules.disallow.push(path);
      continue;
    }

    const allowMatch = line.match(/^allow:\s*(.*)/i);
    if (allowMatch) {
      const path = allowMatch[1].trim();
      if (path) rules.allow.push(path);
    }
  }
  return rules;
}
61
/**
 * Test a URL path against one robots.txt rule pattern.
 *
 * Supports the two special characters used in robots.txt rules: `*` matches any
 * run of characters and a trailing `$` anchors the match to the end of the path.
 * A pattern without either behaves as a plain prefix match. Matching is done
 * with string searches rather than a generated RegExp so that a hostile
 * robots.txt cannot trigger catastrophic backtracking.
 *
 * @param {string} urlPath - Path to test.
 * @param {string} pattern - Rule path from an Allow/Disallow line.
 * @returns {boolean} Whether the pattern matches the path.
 */
function robotsPatternMatches(urlPath, pattern) {
  const anchored = pattern.endsWith("$");
  const segments = (anchored ? pattern.slice(0, -1) : pattern).split("*");

  // The text before the first "*" must be a literal prefix of the path.
  if (!urlPath.startsWith(segments[0])) return false;
  let position = segments[0].length;
  if (segments.length === 1) {
    return anchored ? urlPath.length === position : true;
  }

  for (let i = 1; i < segments.length; i++) {
    const segment = segments[i];
    if (anchored && i === segments.length - 1) {
      // The final segment has to sit at the very end, after everything matched so far.
      return urlPath.length - segment.length >= position && urlPath.endsWith(segment);
    }
    // Leftmost match leaves the most room for the remaining segments.
    const foundAt = urlPath.indexOf(segment, position);
    if (foundAt === -1) return false;
    position = foundAt + segment.length;
  }
  return true;
}

/**
 * Decide whether a URL path is blocked by parsed robots.txt rules.
 *
 * The longest matching pattern wins; when the longest Allow and Disallow
 * matches have equal length (including no match at all) the path is allowed.
 *
 * @param {string} urlPath - URL pathname to check.
 * @param {{disallow: string[], allow: string[]}} rules - Rules as returned by parseRobotsTxt.
 * @returns {boolean} True when the path must not be fetched.
 */
function isDisallowedByRobots(urlPath, rules) {
  const longestMatch = (patterns) => {
    let longest = 0;
    for (const pattern of patterns) {
      if (pattern.length > longest && robotsPatternMatches(urlPath, pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };
  return longestMatch(rules.disallow) > longestMatch(rules.allow);
}
77
/**
 * Fetch one page for the crawler, following redirects.
 *
 * @param {string} url - Absolute URL to request.
 * @param {number} [timeoutMs=10000] - Abort the request after this many milliseconds.
 * @returns {Promise<{text: string, status: number, finalUrl: string} | null>}
 *   The body (capped at 500,000 characters), status and post-redirect URL, or
 *   `null` when the status is not 200, the body is shorter than 200 characters,
 *   or the request fails or times out.
 */
async function fetchPage2(url, timeoutMs = 1e4) {
  try {
    const res = await fetch(url, {
      signal: AbortSignal.timeout(timeoutMs),
      headers: { "User-Agent": "AEO-Visibility-Bot/1.0" },
      redirect: "follow"
    });
    if (res.status !== 200) {
      // Release the connection right away; an unread body otherwise keeps it
      // checked out until the response object is garbage-collected.
      await res.body?.cancel();
      return null;
    }
    const text = await res.text();
    // Very small bodies are treated as empty or placeholder pages.
    if (text.length < 200) return null;
    return { text: text.slice(0, 5e5), status: res.status, finalUrl: res.url };
  } catch {
    // Best-effort crawl: network errors, timeouts and decode failures all mean "no page".
    return null;
  }
}
92
/**
 * Download a sitemap document, following redirects.
 *
 * @param {string} url - Absolute sitemap URL.
 * @param {number} [timeoutMs=10000] - Abort the request after this many milliseconds.
 * @returns {Promise<string | null>} The response text, or `null` when the status
 *   is not 200 or the request fails or times out.
 */
async function fetchSitemapXml(url, timeoutMs = 1e4) {
  try {
    const res = await fetch(url, {
      signal: AbortSignal.timeout(timeoutMs),
      headers: { "User-Agent": "AEO-Visibility-Bot/1.0" },
      redirect: "follow"
    });
    if (res.status !== 200) {
      // Release the connection right away; an unread body otherwise keeps it
      // checked out until the response object is garbage-collected.
      await res.body?.cancel();
      return null;
    }
    return await res.text();
  } catch {
    // Best-effort: a missing or unreachable sitemap is simply skipped by callers.
    return null;
  }
}
105
/**
 * Decode the five predefined XML entities in a `<loc>` value.
 *
 * Sitemap URLs are entity-escaped (a query string `a=1&b=2` is written as
 * `a=1&amp;b=2`), so the raw text is not a usable URL until decoded.
 * `&amp;` is replaced last so that `&amp;lt;` decodes to `&lt;`, not `<`.
 *
 * @param {string} value - Raw text taken from between `<loc>` tags.
 * @returns {string} The decoded text.
 */
function decodeSitemapXmlEntities(value) {
  return value
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, "&");
}

/**
 * Collect same-domain page URLs from a sitemap or sitemap index.
 *
 * When the text contains `<sitemap>` entries, up to the first 10 referenced
 * sitemaps are downloaded concurrently and their `<url>` entries are included
 * along with any `<url>` entries in the text itself.
 *
 * @param {string} sitemapText - XML text of the root sitemap.
 * @param {string} domain - Site domain; a leading `www.` and letter case are ignored.
 * @param {number} [timeoutMs=10000] - Per-request timeout for child sitemaps.
 * @returns {Promise<string[]>} De-duplicated URLs in discovery order.
 */
async function extractAllUrlsFromSitemap(sitemapText, domain, timeoutMs = 1e4) {
  // Case-insensitive strip so that "WWW.Example.com" also reduces to "example.com".
  const cleanDomain = domain.replace(/^www\./i, "").toLowerCase();
  const urls = new Set();

  const subSitemapBlocks = sitemapText.match(/<sitemap>[\s\S]*?<loc>([^<]+)<\/loc>[\s\S]*?<\/sitemap>/gi) || [];
  if (subSitemapBlocks.length > 0) {
    const subUrls = [];
    for (const block of subSitemapBlocks) {
      const locMatch = block.match(/<loc>([^<]+)<\/loc>/i);
      if (locMatch) subUrls.push(decodeSitemapXmlEntities(locMatch[1].trim()));
    }
    // fetchSitemapXml resolves to null on failure, so Promise.all cannot reject here.
    const results = await Promise.all(subUrls.slice(0, 10).map((u) => fetchSitemapXml(u, timeoutMs)));
    for (const text of results) {
      if (text) extractLocsFromXml(text, cleanDomain, urls);
    }
  }

  extractLocsFromXml(sitemapText, cleanDomain, urls);
  return Array.from(urls);
}

/**
 * Add every same-domain, non-resource `<url><loc>` entry found in `xml` to `urls`.
 *
 * @param {string} xml - Sitemap XML text.
 * @param {string} cleanDomain - Lower-cased domain without a leading `www.`.
 * @param {Set<string>} urls - Output set, mutated in place.
 * @returns {void}
 */
function extractLocsFromXml(xml, cleanDomain, urls) {
  const urlBlocks = xml.match(/<url>[\s\S]*?<loc>([^<]+)<\/loc>[\s\S]*?<\/url>/gi) || [];
  for (const block of urlBlocks) {
    const locMatch = block.match(/<loc>([^<]+)<\/loc>/i);
    if (!locMatch) continue;
    const url = decodeSitemapXmlEntities(locMatch[1].trim());
    try {
      const parsed = new URL(url);
      const urlDomain = parsed.hostname.replace(/^www\./, "").toLowerCase();
      if (urlDomain !== cleanDomain) continue;
      if (RESOURCE_EXTENSIONS.test(parsed.pathname)) continue;
      urls.add(url);
    } catch {
      // Entries that do not parse as URLs are ignored.
      continue;
    }
  }
}
143
/**
 * Extract crawlable same-domain links from an HTML document.
 *
 * Links are resolved against `https://<domain>/`, restricted to http(s) URLs on
 * the given domain (a leading `www.` is ignored), and filtered to drop the site
 * root, static resources (RESOURCE_EXTENSIONS) and non-content sections
 * (SKIP_PATH_PATTERNS). Fragments and trailing slashes are removed from the result.
 *
 * @param {string} html - HTML source to scan.
 * @param {string} domain - Site domain the links must belong to.
 * @returns {string[]} De-duplicated normalized URLs in document order.
 */
function extractInternalLinks(html, domain) {
  // Case-insensitive strip so that "WWW.Example.com" also reduces to "example.com".
  const cleanDomain = domain.replace(/^www\./i, "").toLowerCase();
  const base = `https://${domain}/`;
  const urls = new Set();

  // Accept both quote styles; the lookbehind keeps attributes such as
  // `data-href` from being mistaken for a real link.
  const hrefPattern = /(?<![\w-])href\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
  for (const match of html.matchAll(hrefPattern)) {
    // Attribute values are HTML-escaped, so "&amp;" in a query string means "&".
    const href = (match[1] ?? match[2] ?? "").replace(/&amp;/gi, "&").trim();
    if (!href) continue;

    let parsed;
    try {
      // Resolves absolute, protocol-relative, root-relative and relative forms alike.
      parsed = new URL(href, base);
    } catch {
      continue;
    }
    // Drops mailto:, tel:, javascript:, ftp: and any other non-web scheme.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") continue;

    const linkDomain = parsed.hostname.replace(/^www\./, "").toLowerCase();
    if (linkDomain !== cleanDomain) continue;

    const path = parsed.pathname;
    // Fragment-only and query-only links resolve to the root and are skipped here too.
    if (path === "/" || path === "") continue;
    if (RESOURCE_EXTENSIONS.test(path)) continue;
    if (SKIP_PATH_PATTERNS.test(path)) continue;

    urls.add(parsed.origin + path.replace(/\/+$/, "") + parsed.search);
  }
  return Array.from(urls);
}
180
/**
 * Map a URL to a page category using the first CATEGORY_PATTERNS entry whose
 * pattern matches the URL's pathname.
 *
 * @param {string} url - Absolute URL to classify.
 * @returns {string} The matching category, or "content" when nothing matches
 *   or the lookup throws (for example, the URL cannot be parsed).
 */
function inferCategory(url) {
  const fallback = "content";
  try {
    const { pathname } = new URL(url);
    const hit = CATEGORY_PATTERNS.find(([matcher]) => matcher.test(pathname));
    return hit === undefined ? fallback : hit[1];
  } catch {
    return fallback;
  }
}
190
/**
 * Crawl a site breadth-first, starting from its sitemap and homepage links.
 *
 * URLs are seeded from `siteData.sitemapXml.text` and `siteData.homepage.text`,
 * then fetched in concurrent batches; links found on each fetched page are
 * appended to the queue. The site root and any `siteData.blogSample` pages are
 * treated as already visited and are never re-fetched.
 *
 * @param {object} siteData - Prefetched site data. Reads `protocol`, `domain`,
 *   and optionally `robotsTxt.text`, `sitemapXml.text`, `homepage.text` and
 *   `blogSample[].finalUrl`.
 * @param {object} [options]
 * @param {number} [options.maxPages=200] - Maximum number of fetch attempts.
 * @param {number} [options.timeoutMs=10000] - Per-request timeout in milliseconds.
 * @param {number} [options.concurrency=5] - Requests per batch; values below 1
 *   (or non-numeric values) are treated as 1.
 * @param {boolean} [options.respectRobots=true] - Skip paths disallowed by robots.txt.
 * @returns {Promise<{pages: object[], discoveredUrls: string[], fetchedUrls: string[], skippedUrls: string[], elapsed: number}>}
 *   Fetched pages (each tagged with `category`), the URL bookkeeping lists, and
 *   the elapsed time in seconds rounded to one decimal place. `fetchedUrls`
 *   lists every attempted URL, including attempts that produced no page.
 */
async function crawlFullSite(siteData, options) {
  const startTime = Date.now();
  const maxPages = options?.maxPages ?? 200;
  const timeoutMs = options?.timeoutMs ?? 1e4;
  const respectRobots = options?.respectRobots ?? true;
  // A batch size of 0 (or NaN) would yield empty batches forever, so the main
  // loop below would never terminate; clamp to at least one request per batch.
  const requestedConcurrency = Math.floor(Number(options?.concurrency ?? 5));
  const concurrency = requestedConcurrency >= 1 ? requestedConcurrency : 1;

  const pages = [];
  const discoveredUrls = new Set();
  const fetchedUrls = new Set();
  const skippedUrls = new Set();
  const visited = new Set();
  const queue = [];

  // Every queued URL is also in `discoveredUrls`, so the set doubles as an O(1)
  // "already queued" check instead of scanning the queue array.
  const enqueue = (url) => {
    if (visited.has(normalizeUrl(url)) || discoveredUrls.has(url)) return;
    discoveredUrls.add(url);
    queue.push(url);
  };

  const robotsRules =
    respectRobots && siteData.robotsTxt?.text
      ? parseRobotsTxt(siteData.robotsTxt.text)
      : { disallow: [], allow: [] };

  // Pages the caller already holds must not be fetched again.
  const baseUrl = `${siteData.protocol}://${siteData.domain}`;
  visited.add(normalizeUrl(baseUrl));
  visited.add(normalizeUrl(baseUrl + "/"));
  for (const page of siteData.blogSample || []) {
    if (page.finalUrl) visited.add(normalizeUrl(page.finalUrl));
  }

  if (siteData.sitemapXml?.text) {
    const sitemapUrls = await extractAllUrlsFromSitemap(
      siteData.sitemapXml.text,
      siteData.domain,
      timeoutMs
    );
    for (const url of sitemapUrls) enqueue(url);
  }
  if (siteData.homepage?.text) {
    for (const url of extractInternalLinks(siteData.homepage.text, siteData.domain)) {
      enqueue(url);
    }
  }

  while (queue.length > 0 && fetchedUrls.size < maxPages) {
    const batchSize = Math.min(concurrency, maxPages - fetchedUrls.size, queue.length);
    const batch = [];
    while (batch.length < batchSize && queue.length > 0) {
      const url = queue.shift();
      const norm = normalizeUrl(url);
      if (visited.has(norm)) continue;
      visited.add(norm);
      if (respectRobots) {
        try {
          if (isDisallowedByRobots(new URL(url).pathname, robotsRules)) {
            skippedUrls.add(url);
            continue;
          }
        } catch {
          // Unparseable URL: drop it without recording it anywhere.
          continue;
        }
      }
      batch.push(url);
    }
    // Everything dequeued was filtered out; the queue shrank, so retry.
    if (batch.length === 0) continue;

    const results = await Promise.all(batch.map((url) => fetchPage2(url, timeoutMs)));
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      const url = batch[i];
      fetchedUrls.add(url);
      if (!result) continue;
      // Mark the redirect target as seen so it is not fetched a second time
      // under its own URL later in the crawl.
      if (result.finalUrl) visited.add(normalizeUrl(result.finalUrl));
      result.category = inferCategory(url);
      pages.push(result);
      for (const link of extractInternalLinks(result.text, siteData.domain)) {
        enqueue(link);
      }
    }
  }

  // Whatever is still queued was cut off by the maxPages budget.
  for (const url of queue) {
    if (!fetchedUrls.has(url)) skippedUrls.add(url);
  }

  return {
    pages,
    discoveredUrls: Array.from(discoveredUrls),
    fetchedUrls: Array.from(fetchedUrls),
    skippedUrls: Array.from(skippedUrls),
    elapsed: Math.round((Date.now() - startTime) / 100) / 10
  };
}
291
/**
 * Build the lower-cased de-duplication key for a URL: origin + pathname
 * (trailing slashes removed) + query string, with the fragment dropped.
 * Input that does not parse as a URL is lower-cased and returned as-is.
 *
 * @param {string} url - URL or URL-like string.
 * @returns {string} The normalized key.
 */
function normalizeUrl(url) {
  let canonical = url;
  try {
    const { origin, pathname, search } = new URL(url);
    canonical = `${origin}${pathname.replace(/\/+$/, "")}${search}`;
  } catch {
    // Not parseable: fall through with the raw input.
  }
  return canonical.toLowerCase();
}
299
// Module-level constants for the crawler. They are declared here and only
// assigned when init_full_site_crawler() runs, so they are undefined until then.
+ var RESOURCE_EXTENSIONS, SKIP_PATH_PATTERNS, CATEGORY_PATTERNS;
300
// Initializer for the crawler module, wrapped with __esm.
+ var init_full_site_crawler = __esm({
301
+ "src/full-site-crawler.ts"() {
302
+ "use strict";
303
// File extensions treated as static resources rather than pages; tested
// against URL pathnames by extractLocsFromXml and extractInternalLinks.
+ RESOURCE_EXTENSIONS = /\.(js|css|png|jpg|jpeg|gif|svg|ico|pdf|xml|txt|woff|woff2|ttf|eot|mp4|mp3|webp|avif|zip|gz|tar|json)$/i;
304
// Leading path segments that extractInternalLinks excludes from the crawl.
+ SKIP_PATH_PATTERNS = /^\/(api|wp-admin|wp-json|static|assets|_next|auth|login|signup|cart|checkout|admin|feed|xmlrpc)\b/i;
305
// [pattern, category] pairs scanned in order by inferCategory; the first
// pattern matching the pathname decides the category, so order matters.
+ CATEGORY_PATTERNS = [
306
+ [/\/(blog|articles?|posts?|news|insights|guides)\b/i, "blog"],
307
+ [/\/(about|about-us|company|who-we-are)\b/i, "about"],
308
+ [/\/(pricing|plans|packages)\b/i, "pricing"],
309
+ [/\/(services?|features?|solutions?|products?|what-we-do|offerings?)\b/i, "services"],
310
+ [/\/(contact|contact-us|get-in-touch)\b/i, "contact"],
311
+ [/\/(team|our-team|authors?|people|leadership|staff)\b/i, "team"],
312
+ [/\/(resources?|resource-center|library|downloads?)\b/i, "resources"],
313
+ [/\/(docs?|documentation|help|help-center|support|knowledge-base)\b/i, "docs"],
314
+ [/\/(case-stud\w*|cases|customers?|success-stor\w*|testimonials?)\b/i, "cases"],
315
+ [/\/(faq|frequently-asked|questions)\b/i, "faq"]
316
+ ];
317
+ }
318
+ });
319
+
20
320
  // src/index.ts
21
321
  var index_exports = {};
22
322
  __export(index_exports, {
@@ -30,8 +330,11 @@ __export(index_exports, {
30
330
  calculateOverallScore: () => calculateOverallScore,
31
331
  classifyRendering: () => classifyRendering,
32
332
  compare: () => compare,
333
+ crawlFullSite: () => crawlFullSite,
33
334
  detectParkedDomain: () => detectParkedDomain,
335
+ extractAllUrlsFromSitemap: () => extractAllUrlsFromSitemap,
34
336
  extractContentPagesFromSitemap: () => extractContentPagesFromSitemap,
337
+ extractInternalLinks: () => extractInternalLinks,
35
338
  extractNavLinks: () => extractNavLinks,
36
339
  extractRawDataSummary: () => extractRawDataSummary,
37
340
  fetchMultiPageData: () => fetchMultiPageData,
@@ -42,6 +345,7 @@ __export(index_exports, {
42
345
  generateOpportunities: () => generateOpportunities,
43
346
  generatePitchNumbers: () => generatePitchNumbers,
44
347
  generateVerdict: () => generateVerdict,
348
+ inferCategory: () => inferCategory,
45
349
  isSpaShell: () => isSpaShell,
46
350
  prefetchSiteData: () => prefetchSiteData,
47
351
  scoreToStatus: () => scoreToStatus
@@ -1803,7 +2107,11 @@ function extractRawDataSummary(data) {
1803
2107
  const d = new Date(m[1]);
1804
2108
  if (isNaN(d.getTime())) return null;
1805
2109
  return Math.floor((Date.now() - d.getTime()) / (1e3 * 60 * 60 * 24));
1806
- })()
2110
+ })(),
2111
+ // Full-crawl stats
2112
+ crawl_discovered: data.crawlStats?.discovered ?? 0,
2113
+ crawl_fetched: data.crawlStats?.fetched ?? 0,
2114
+ crawl_skipped: data.crawlStats?.skipped ?? 0
1807
2115
  };
1808
2116
  }
1809
2117
  function auditSiteFromData(data) {
@@ -2799,7 +3107,20 @@ async function audit(domain, options) {
2799
3107
  }
2800
3108
  }
2801
3109
  }
2802
- if (!options?.noMultiPage) {
3110
+ if (options?.fullCrawl) {
3111
+ const { crawlFullSite: crawlFullSite2 } = await Promise.resolve().then(() => (init_full_site_crawler(), full_site_crawler_exports));
3112
+ const crawlResult = await crawlFullSite2(siteData, {
3113
+ maxPages: options.maxPages ?? 200,
3114
+ concurrency: options.concurrency ?? 5
3115
+ });
3116
+ siteData.blogSample = crawlResult.pages;
3117
+ siteData.crawlStats = {
3118
+ discovered: crawlResult.discoveredUrls.length,
3119
+ fetched: crawlResult.fetchedUrls.length,
3120
+ skipped: crawlResult.skippedUrls.length,
3121
+ elapsed: crawlResult.elapsed
3122
+ };
3123
+ } else if (!options?.noMultiPage) {
2803
3124
  await fetchMultiPageData(siteData);
2804
3125
  }
2805
3126
  const results = auditSiteFromData(siteData);
@@ -2832,6 +3153,9 @@ async function audit(domain, options) {
2832
3153
  };
2833
3154
  }
2834
3155
 
3156
+ // src/index.ts
3157
+ init_full_site_crawler();
3158
+
2835
3159
  // src/html-report.ts
2836
3160
  function scoreColor(score) {
2837
3161
  if (score <= 40) return "#F44336";
@@ -3166,8 +3490,11 @@ async function compare(domainA, domainB, options) {
3166
3490
  calculateOverallScore,
3167
3491
  classifyRendering,
3168
3492
  compare,
3493
+ crawlFullSite,
3169
3494
  detectParkedDomain,
3495
+ extractAllUrlsFromSitemap,
3170
3496
  extractContentPagesFromSitemap,
3497
+ extractInternalLinks,
3171
3498
  extractNavLinks,
3172
3499
  extractRawDataSummary,
3173
3500
  fetchMultiPageData,
@@ -3178,6 +3505,7 @@ async function compare(domainA, domainB, options) {
3178
3505
  generateOpportunities,
3179
3506
  generatePitchNumbers,
3180
3507
  generateVerdict,
3508
+ inferCategory,
3181
3509
  isSpaShell,
3182
3510
  prefetchSiteData,
3183
3511
  scoreToStatus