aeorank 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -1
- package/dist/chunk-3IJISYWT.js +291 -0
- package/dist/chunk-3IJISYWT.js.map +1 -0
- package/dist/cli.js +38 -6
- package/dist/cli.js.map +1 -1
- package/dist/full-site-crawler-F7J2HRL4.js +292 -0
- package/dist/full-site-crawler-F7J2HRL4.js.map +1 -0
- package/dist/full-site-crawler-VFARFR2C.js +17 -0
- package/dist/full-site-crawler-VFARFR2C.js.map +1 -0
- package/dist/index.cjs +330 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +64 -3
- package/dist/index.d.ts +64 -3
- package/dist/index.js +30 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -24,6 +24,8 @@ npx aeorank example.com --summary # Human-readable scorecard
|
|
|
24
24
|
npx aeorank example.com --html # Standalone HTML report
|
|
25
25
|
npx aeorank example.com --ci --threshold 80 # CI gate
|
|
26
26
|
npx aeorank site-a.com site-b.com # Side-by-side comparison
|
|
27
|
+
npx aeorank example.com --full-crawl # Crawl all discoverable pages
|
|
28
|
+
npx aeorank example.com --full-crawl --max-pages 50 # Limit to 50 pages
|
|
27
29
|
```
|
|
28
30
|
|
|
29
31
|
### Programmatic
|
|
@@ -84,6 +86,9 @@ Options:
|
|
|
84
86
|
--threshold <N> Score threshold for --ci (default: 70)
|
|
85
87
|
--no-headless Skip Puppeteer SPA rendering
|
|
86
88
|
--no-multi-page Skip extra page discovery (faster)
|
|
89
|
+
--full-crawl BFS crawl all discoverable pages
|
|
90
|
+
--max-pages <N> Max pages for --full-crawl (default: 200)
|
|
91
|
+
--concurrency <N> Parallel fetches for --full-crawl (default: 5)
|
|
87
92
|
--version Print version
|
|
88
93
|
--help Show help
|
|
89
94
|
```
|
|
@@ -114,7 +119,7 @@ Or use `npx` directly:
|
|
|
114
119
|
Run a complete audit. Returns `AuditResult` with:
|
|
115
120
|
|
|
116
121
|
- `overallScore` - 0-100 weighted score
|
|
117
|
-
- `scorecard` -
|
|
122
|
+
- `scorecard` - 26 `ScoreCardItem` entries (criterion, score 0-10, status, key findings)
|
|
118
123
|
- `detailedFindings` - Per-criterion findings with severity
|
|
119
124
|
- `opportunities` - Prioritized improvements with effort/impact
|
|
120
125
|
- `pitchNumbers` - Key metrics (schema types, AI crawler access, etc.)
|
|
@@ -130,6 +135,9 @@ Run a complete audit. Returns `AuditResult` with:
|
|
|
130
135
|
| `noHeadless` | `boolean` | `false` | Skip Puppeteer SPA rendering |
|
|
131
136
|
| `noMultiPage` | `boolean` | `false` | Homepage + blog only |
|
|
132
137
|
| `timeout` | `number` | `15000` | Fetch timeout in ms |
|
|
138
|
+
| `fullCrawl` | `boolean` | `false` | BFS crawl all discoverable pages |
|
|
139
|
+
| `maxPages` | `number` | `200` | Max pages for full crawl |
|
|
140
|
+
| `concurrency` | `number` | `5` | Parallel fetches for full crawl |
|
|
133
141
|
|
|
134
142
|
### Advanced API
|
|
135
143
|
|
|
@@ -165,6 +173,41 @@ npm install puppeteer
|
|
|
165
173
|
|
|
166
174
|
Use `--no-headless` to skip SPA rendering (faster but may produce lower scores for SPAs).
|
|
167
175
|
|
|
176
|
+
## Full-Site Crawl
|
|
177
|
+
|
|
178
|
+
By default, AEORank audits the homepage plus ~20 discovered pages. For deeper analysis, enable `--full-crawl` to BFS-crawl every discoverable page:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
npx aeorank example.com --full-crawl # Up to 200 pages
|
|
182
|
+
npx aeorank example.com --full-crawl --max-pages 50 # Limit to 50
|
|
183
|
+
npx aeorank example.com --full-crawl --concurrency 10 # 10 parallel fetches
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
The crawler seeds from sitemap URLs and homepage links, then follows internal links on each fetched page. It respects `robots.txt` Disallow rules, skips resource files, and tags each page with a category (blog, about, pricing, services, docs, faq, etc.).
|
|
187
|
+
|
|
188
|
+
Programmatic usage:
|
|
189
|
+
|
|
190
|
+
```ts
|
|
191
|
+
import { audit } from 'aeorank';
|
|
192
|
+
|
|
193
|
+
const result = await audit('example.com', {
|
|
194
|
+
fullCrawl: true,
|
|
195
|
+
maxPages: 100,
|
|
196
|
+
concurrency: 5,
|
|
197
|
+
});
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Or use the crawler directly:
|
|
201
|
+
|
|
202
|
+
```ts
|
|
203
|
+
import { crawlFullSite, prefetchSiteData } from 'aeorank';
|
|
204
|
+
|
|
205
|
+
const siteData = await prefetchSiteData('example.com');
|
|
206
|
+
const crawlResult = await crawlFullSite(siteData, { maxPages: 200 });
|
|
207
|
+
console.log(crawlResult.pages.length); // Pages fetched
|
|
208
|
+
console.log(crawlResult.discoveredUrls.length); // Total URLs found
|
|
209
|
+
```
|
|
210
|
+
|
|
168
211
|
## Scoring
|
|
169
212
|
|
|
170
213
|
Each criterion is scored 0-10 by deterministic checks (regex, HTML parsing, HTTP headers). The overall score is a weighted average normalized to 0-100.
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
// src/full-site-crawler.ts
// File extensions that are never crawlable HTML pages — used to filter
// sitemap <loc> entries and extracted internal links.
var RESOURCE_EXTENSIONS = /\.(js|css|png|jpg|jpeg|gif|svg|ico|pdf|xml|txt|woff|woff2|ttf|eot|mp4|mp3|webp|avif|zip|gz|tar|json)$/i;
// Path prefixes skipped outright: admin areas, API/build routes, and
// auth/commerce flows that add noise to an AEO audit.
var SKIP_PATH_PATTERNS = /^\/(api|wp-admin|wp-json|static|assets|_next|auth|login|signup|cart|checkout|admin|feed|xmlrpc)\b/i;
|
|
4
|
+
// Parse robots.txt into { disallow, allow } path-prefix lists, keeping only
// the rules from groups addressed to "*" or "aeo-visibility-bot".
// Empty rule values (e.g. "Disallow:") are ignored.
function parseRobotsTxt(robotsText) {
  const rules = { disallow: [], allow: [] };
  let applies = false;
  for (const rawLine of robotsText.split("\n")) {
    const line = rawLine.trim();
    if (line === "" || line[0] === "#") continue;
    const ua = /^user-agent:\s*(.+)/i.exec(line);
    if (ua) {
      const agent = ua[1].trim().toLowerCase();
      applies = agent === "*" || agent === "aeo-visibility-bot";
      continue;
    }
    if (!applies) continue;
    const dis = /^disallow:\s*(.*)/i.exec(line);
    if (dis) {
      const value = dis[1].trim();
      if (value) rules.disallow.push(value);
      continue;
    }
    const alw = /^allow:\s*(.*)/i.exec(line);
    if (alw) {
      const value = alw[1].trim();
      if (value) rules.allow.push(value);
    }
  }
  return rules;
}
|
|
32
|
+
// Decide whether urlPath is blocked by the parsed robots rules.
// Longest matching prefix wins; on a tie (or no match at all) the path is
// allowed, mirroring common robots.txt precedence.
function isDisallowedByRobots(urlPath, rules) {
  const longestMatch = (patterns) =>
    patterns.reduce(
      (best, prefix) =>
        urlPath.startsWith(prefix) && prefix.length > best ? prefix.length : best,
      0
    );
  const allowLen = longestMatch(rules.allow);
  const disallowLen = longestMatch(rules.disallow);
  if (allowLen === 0 && disallowLen === 0) return false;
  return disallowLen > allowLen;
}
|
|
48
|
+
// Fetch one page as the crawler's bot UA, following redirects.
// Resolves to null on any network error/timeout, non-200 status, or a body
// shorter than 200 chars; successful bodies are capped at 500 KB.
async function fetchPage(url, timeoutMs = 1e4) {
  const requestInit = {
    signal: AbortSignal.timeout(timeoutMs),
    headers: { "User-Agent": "AEO-Visibility-Bot/1.0" },
    redirect: "follow"
  };
  try {
    const response = await fetch(url, requestInit);
    if (response.status !== 200) return null;
    const body = await response.text();
    if (body.length < 200) return null;
    return { text: body.slice(0, 5e5), status: response.status, finalUrl: response.url };
  } catch {
    return null;
  }
}
|
|
63
|
+
// Fetch raw sitemap XML text; null on network failure or non-200 status.
async function fetchSitemapXml(url, timeoutMs = 1e4) {
  try {
    const response = await fetch(url, {
      signal: AbortSignal.timeout(timeoutMs),
      headers: { "User-Agent": "AEO-Visibility-Bot/1.0" },
      redirect: "follow"
    });
    return response.status === 200 ? await response.text() : null;
  } catch {
    return null;
  }
}
|
|
76
|
+
// Collect all same-domain page URLs from sitemap XML. Handles a sitemapindex
// by fetching up to 10 child sitemaps in parallel, then also scans the given
// text itself for <url><loc> entries (it may be a plain urlset).
async function extractAllUrlsFromSitemap(sitemapText, domain, timeoutMs = 1e4) {
  const cleanDomain = domain.replace(/^www\./, "").toLowerCase();
  const urls = new Set();
  const indexBlocks = sitemapText.match(/<sitemap>[\s\S]*?<loc>([^<]+)<\/loc>[\s\S]*?<\/sitemap>/gi) || [];
  if (indexBlocks.length > 0) {
    const childLocs = [];
    for (const block of indexBlocks) {
      const loc = block.match(/<loc>([^<]+)<\/loc>/i);
      if (loc) childLocs.push(loc[1].trim());
    }
    // Fetch child sitemaps concurrently, capped at 10.
    const childTexts = await Promise.all(
      childLocs.slice(0, 10).map((loc) => fetchSitemapXml(loc, timeoutMs))
    );
    for (const childText of childTexts) {
      if (childText) extractLocsFromXml(childText, cleanDomain, urls);
    }
  }
  extractLocsFromXml(sitemapText, cleanDomain, urls);
  return Array.from(urls);
}
|
|
97
|
+
// Add every same-domain, non-resource <url><loc> entry from a urlset XML
// string into the supplied `urls` Set (mutated in place).
function extractLocsFromXml(xml, cleanDomain, urls) {
  const entries = xml.match(/<url>[\s\S]*?<loc>([^<]+)<\/loc>[\s\S]*?<\/url>/gi) || [];
  for (const entry of entries) {
    const loc = entry.match(/<loc>([^<]+)<\/loc>/i);
    if (!loc) continue;
    const candidate = loc[1].trim();
    let parsed;
    try {
      parsed = new URL(candidate);
    } catch {
      continue;
    }
    const host = parsed.hostname.replace(/^www\./, "").toLowerCase();
    if (host !== cleanDomain) continue;
    if (RESOURCE_EXTENSIONS.test(parsed.pathname)) continue;
    urls.add(candidate);
  }
}
|
|
114
|
+
// Extract every deduplicated same-domain link from HTML href attributes.
// Fragments, query-only links, mailto/tel/javascript schemes, resource files,
// and skip-listed paths are dropped; trailing slashes are normalized away.
function extractInternalLinks(html, domain) {
  const cleanDomain = domain.replace(/^www\./, "").toLowerCase();
  const urls = new Set();

  // Resolve a raw href value to an absolute URL string, or null when the
  // link should be ignored outright.
  const toAbsolute = (href) => {
    if (href.startsWith("//")) return `https:${href}`;
    if (href.startsWith("/")) {
      if (href === "/" || href.startsWith("/#")) return null;
      return `https://${domain}${href}`;
    }
    if (href.startsWith("http")) return href;
    if (
      href.startsWith("#") ||
      href.startsWith("?") ||
      href.startsWith("mailto:") ||
      href.startsWith("tel:") ||
      href.startsWith("javascript:")
    ) {
      return null;
    }
    // Bare relative path — resolve against the site root.
    return `https://${domain}/${href}`;
  };

  for (const attr of html.match(/href="([^"]*)"/gi) || []) {
    const href = attr.match(/href="([^"]*)"/i)?.[1];
    if (!href || !href.trim()) continue;
    const absolute = toAbsolute(href);
    if (absolute === null) continue;
    let parsed;
    try {
      parsed = new URL(absolute);
    } catch {
      continue;
    }
    const host = parsed.hostname.replace(/^www\./, "").toLowerCase();
    if (host !== cleanDomain) continue;
    parsed.hash = "";
    const path = parsed.pathname;
    if (path === "/" || path === "") continue;
    if (RESOURCE_EXTENSIONS.test(path)) continue;
    if (SKIP_PATH_PATTERNS.test(path)) continue;
    urls.add(parsed.origin + path.replace(/\/+$/, "") + parsed.search);
  }

  return Array.from(urls);
}
|
|
151
|
+
// URL-path → page-category heuristics used by inferCategory().
// Checked in order; first matching regex wins.
var CATEGORY_PATTERNS = [
  [/\/(blog|articles?|posts?|news|insights|guides)\b/i, "blog"],
  [/\/(about|about-us|company|who-we-are)\b/i, "about"],
  [/\/(pricing|plans|packages)\b/i, "pricing"],
  [/\/(services?|features?|solutions?|products?|what-we-do|offerings?)\b/i, "services"],
  [/\/(contact|contact-us|get-in-touch)\b/i, "contact"],
  [/\/(team|our-team|authors?|people|leadership|staff)\b/i, "team"],
  [/\/(resources?|resource-center|library|downloads?)\b/i, "resources"],
  [/\/(docs?|documentation|help|help-center|support|knowledge-base)\b/i, "docs"],
  [/\/(case-stud\w*|cases|customers?|success-stor\w*|testimonials?)\b/i, "cases"],
  [/\/(faq|frequently-asked|questions)\b/i, "faq"]
];
|
|
163
|
+
// Classify a URL by its path using CATEGORY_PATTERNS; unparsable URLs and
// unmatched paths fall back to the generic "content" category.
function inferCategory(url) {
  try {
    const { pathname } = new URL(url);
    const hit = CATEGORY_PATTERNS.find(([pattern]) => pattern.test(pathname));
    if (hit) return hit[1];
  } catch {
    // Unparsable URL — use the default below.
  }
  return "content";
}
|
|
173
|
+
/**
 * BFS crawl of a site, discovering internal pages up to maxPages.
 * Seeds the queue from sitemap URLs and homepage internal links, then follows
 * links found on each fetched page. URLs the pre-crawl already fetched
 * (homepage, blogSample) are marked visited and never re-fetched.
 *
 * Fix vs. 1.4.0 bundle: enqueueing used `queue.includes(url)` (an O(n) scan
 * per URL — accidental O(n^2) on large sites) and `queue.shift()` (O(n) per
 * dequeue). Dedup now rides the existing `discoveredUrls` Set and the queue
 * uses a head-index pointer; observable behavior is unchanged.
 *
 * @param {object} siteData  Prefetched site data: { protocol, domain,
 *                           robotsTxt?, sitemapXml?, homepage?, blogSample? }.
 * @param {object} [options] { maxPages=200, timeoutMs=10000, concurrency=5,
 *                           respectRobots=true }.
 * @returns {Promise<object>} { pages, discoveredUrls, fetchedUrls,
 *                           skippedUrls, elapsed } — elapsed in seconds
 *                           rounded to one decimal.
 */
async function crawlFullSite(siteData, options) {
  const startTime = Date.now();
  const maxPages = options?.maxPages ?? 200;
  const timeoutMs = options?.timeoutMs ?? 1e4;
  const concurrency = options?.concurrency ?? 5;
  const respectRobots = options?.respectRobots ?? true;

  const pages = [];
  const discoveredUrls = new Set();
  const fetchedUrls = new Set();
  const skippedUrls = new Set();
  const visited = new Set();

  // Honor robots.txt rules when requested and the prefetch captured them.
  let robotsRules = { disallow: [], allow: [] };
  if (respectRobots && siteData.robotsTxt?.text) {
    robotsRules = parseRobotsTxt(siteData.robotsTxt.text);
  }

  // Mark already-fetched URLs as visited so they are never re-fetched.
  const baseUrl = `${siteData.protocol}://${siteData.domain}`;
  visited.add(normalizeUrl(baseUrl));
  visited.add(normalizeUrl(baseUrl + "/"));
  if (siteData.blogSample) {
    for (const page of siteData.blogSample) {
      if (page.finalUrl) visited.add(normalizeUrl(page.finalUrl));
    }
  }

  // FIFO queue with a head pointer; discoveredUrls doubles as the O(1)
  // membership check that replaces the old queue.includes scan.
  const queue = [];
  let head = 0;
  const enqueue = (url) => {
    if (!visited.has(normalizeUrl(url)) && !discoveredUrls.has(url)) {
      discoveredUrls.add(url);
      queue.push(url);
    }
  };

  // Seed from sitemap URLs, then homepage internal links.
  if (siteData.sitemapXml?.text) {
    const sitemapUrls = await extractAllUrlsFromSitemap(
      siteData.sitemapXml.text,
      siteData.domain,
      timeoutMs
    );
    for (const url of sitemapUrls) enqueue(url);
  }
  if (siteData.homepage?.text) {
    for (const url of extractInternalLinks(siteData.homepage.text, siteData.domain)) {
      enqueue(url);
    }
  }

  // BFS: fetch up to `concurrency` pages per batch until the queue drains or
  // the page budget is spent. Failed fetches still count toward maxPages.
  while (head < queue.length && fetchedUrls.size < maxPages) {
    const batchSize = Math.min(concurrency, maxPages - fetchedUrls.size, queue.length - head);
    const batch = [];
    while (batch.length < batchSize && head < queue.length) {
      const url = queue[head++];
      const norm = normalizeUrl(url);
      if (visited.has(norm)) continue;
      visited.add(norm);
      if (respectRobots) {
        try {
          if (isDisallowedByRobots(new URL(url).pathname, robotsRules)) {
            skippedUrls.add(url);
            continue;
          }
        } catch {
          continue; // malformed URL — drop it
        }
      }
      batch.push(url);
    }
    if (batch.length === 0) continue;

    const results = await Promise.all(batch.map((url) => fetchPage(url, timeoutMs)));
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      const url = batch[i];
      fetchedUrls.add(url);
      if (!result) continue;
      result.category = inferCategory(url);
      pages.push(result);
      // Follow new internal links discovered on the fetched page.
      for (const link of extractInternalLinks(result.text, siteData.domain)) {
        enqueue(link);
      }
    }
  }

  // URLs still queued when the budget ran out are discovered-but-skipped.
  for (let i = head; i < queue.length; i++) {
    if (!fetchedUrls.has(queue[i])) skippedUrls.add(queue[i]);
  }

  return {
    pages,
    discoveredUrls: Array.from(discoveredUrls),
    fetchedUrls: Array.from(fetchedUrls),
    skippedUrls: Array.from(skippedUrls),
    elapsed: Math.round((Date.now() - startTime) / 100) / 10
  };
}
|
|
274
|
+
// Canonicalize a URL for visited-set comparisons: strip the hash, drop
// trailing slashes, and lowercase the whole string. Strings that are not
// valid URLs are simply lowercased.
function normalizeUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url.toLowerCase();
  }
  const path = parsed.pathname.replace(/\/+$/, "");
  return `${parsed.origin}${path}${parsed.search}`.toLowerCase();
}
|
|
282
|
+
|
|
283
|
+
// Public surface of the full-site-crawler chunk, re-exported by the
// package entry points (cli.js / index.js).
export {
  parseRobotsTxt,
  isDisallowedByRobots,
  extractAllUrlsFromSitemap,
  extractInternalLinks,
  inferCategory,
  crawlFullSite
};
|
|
291
|
+
//# sourceMappingURL=chunk-3IJISYWT.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/full-site-crawler.ts"],"sourcesContent":["/**\n * Full-site crawler for deep AEO audits.\n * BFS crawl that discovers all internal pages up to a configurable limit.\n */\n\nimport type { FetchResult, SiteData, PageCategory } from './site-crawler.js';\n\n// ─── Types ──────────────────────────────────────────────────────────────────\n\nexport interface CrawlOptions {\n /** Maximum pages to fetch (default 200) */\n maxPages?: number;\n /** Per-page fetch timeout in ms (default 10000) */\n timeoutMs?: number;\n /** Parallel fetches (default 5) */\n concurrency?: number;\n /** Honor robots.txt Disallow rules (default true) */\n respectRobots?: boolean;\n /** Include asset files — skipped by default */\n includeAssets?: boolean;\n}\n\nexport interface CrawlResult {\n pages: FetchResult[];\n discoveredUrls: string[];\n fetchedUrls: string[];\n skippedUrls: string[];\n elapsed: number;\n}\n\n// ─── Resource file extensions to skip ────────────────────────────────────────\n\nconst RESOURCE_EXTENSIONS = /\\.(js|css|png|jpg|jpeg|gif|svg|ico|pdf|xml|txt|woff|woff2|ttf|eot|mp4|mp3|webp|avif|zip|gz|tar|json)$/i;\n\nconst SKIP_PATH_PATTERNS = /^\\/(api|wp-admin|wp-json|static|assets|_next|auth|login|signup|cart|checkout|admin|feed|xmlrpc)\\b/i;\n\n// ─── Robots.txt parsing ─────────────────────────────────────────────────────\n\ninterface RobotsRules {\n disallow: string[];\n allow: string[];\n}\n\nexport function parseRobotsTxt(robotsText: string): RobotsRules {\n const lines = robotsText.split('\\n');\n const rules: RobotsRules = { disallow: [], allow: [] };\n\n // Collect rules for User-agent: * and User-agent: AEO-Visibility-Bot\n let inRelevantSection = false;\n\n for (const rawLine of lines) {\n const line = rawLine.trim();\n if (!line || line.startsWith('#')) continue;\n\n const uaMatch = line.match(/^user-agent:\\s*(.+)/i);\n if (uaMatch) {\n const agent = uaMatch[1].trim().toLowerCase();\n inRelevantSection = agent === '*' || agent === 
'aeo-visibility-bot';\n continue;\n }\n\n if (!inRelevantSection) continue;\n\n const disallowMatch = line.match(/^disallow:\\s*(.*)/i);\n if (disallowMatch) {\n const path = disallowMatch[1].trim();\n if (path) rules.disallow.push(path);\n continue;\n }\n\n const allowMatch = line.match(/^allow:\\s*(.*)/i);\n if (allowMatch) {\n const path = allowMatch[1].trim();\n if (path) rules.allow.push(path);\n }\n }\n\n return rules;\n}\n\nexport function isDisallowedByRobots(urlPath: string, rules: RobotsRules): boolean {\n // Check allow rules first — more specific (longer) rules take precedence\n let longestAllow = 0;\n let longestDisallow = 0;\n\n for (const pattern of rules.allow) {\n if (urlPath.startsWith(pattern) && pattern.length > longestAllow) {\n longestAllow = pattern.length;\n }\n }\n\n for (const pattern of rules.disallow) {\n if (urlPath.startsWith(pattern) && pattern.length > longestDisallow) {\n longestDisallow = pattern.length;\n }\n }\n\n // More specific (longer) rule wins; if equal length, allow wins\n if (longestAllow === 0 && longestDisallow === 0) return false;\n return longestDisallow > longestAllow;\n}\n\n// ─── Fetch helper (matches multi-page-fetcher.ts fetchPage) ──────────────────\n\nasync function fetchPage(url: string, timeoutMs = 10000): Promise<FetchResult | null> {\n try {\n const res = await fetch(url, {\n signal: AbortSignal.timeout(timeoutMs),\n headers: { 'User-Agent': 'AEO-Visibility-Bot/1.0' },\n redirect: 'follow',\n });\n if (res.status !== 200) return null;\n const text = await res.text();\n if (text.length < 200) return null;\n return { text: text.slice(0, 500_000), status: res.status, finalUrl: res.url };\n } catch {\n return null;\n }\n}\n\nasync function fetchSitemapXml(url: string, timeoutMs = 10000): Promise<string | null> {\n try {\n const res = await fetch(url, {\n signal: AbortSignal.timeout(timeoutMs),\n headers: { 'User-Agent': 'AEO-Visibility-Bot/1.0' },\n redirect: 'follow',\n });\n if (res.status !== 200) return 
null;\n return await res.text();\n } catch {\n return null;\n }\n}\n\n// ─── Sitemap parsing ────────────────────────────────────────────────────────\n\n/**\n * Extract all page URLs from sitemap XML (handles sitemapindex with sub-sitemaps).\n * Filters to same domain only, skips resource files.\n */\nexport async function extractAllUrlsFromSitemap(\n sitemapText: string,\n domain: string,\n timeoutMs = 10000,\n): Promise<string[]> {\n const cleanDomain = domain.replace(/^www\\./, '').toLowerCase();\n const urls = new Set<string>();\n\n // Check for sitemapindex — fetch sub-sitemaps\n const subSitemapLocs = sitemapText.match(/<sitemap>[\\s\\S]*?<loc>([^<]+)<\\/loc>[\\s\\S]*?<\\/sitemap>/gi) || [];\n if (subSitemapLocs.length > 0) {\n const subUrls: string[] = [];\n for (const block of subSitemapLocs) {\n const locMatch = block.match(/<loc>([^<]+)<\\/loc>/i);\n if (locMatch) subUrls.push(locMatch[1].trim());\n }\n\n // Fetch sub-sitemaps in parallel (limit to 10)\n const fetches = subUrls.slice(0, 10).map(u => fetchSitemapXml(u, timeoutMs));\n const results = await Promise.all(fetches);\n for (const text of results) {\n if (text) {\n extractLocsFromXml(text, cleanDomain, urls);\n }\n }\n }\n\n // Also extract <url><loc> from the main sitemap text (could be a regular sitemap)\n extractLocsFromXml(sitemapText, cleanDomain, urls);\n\n return Array.from(urls);\n}\n\nfunction extractLocsFromXml(xml: string, cleanDomain: string, urls: Set<string>): void {\n const locMatches = xml.match(/<url>[\\s\\S]*?<loc>([^<]+)<\\/loc>[\\s\\S]*?<\\/url>/gi) || [];\n for (const block of locMatches) {\n const locMatch = block.match(/<loc>([^<]+)<\\/loc>/i);\n if (!locMatch) continue;\n const url = locMatch[1].trim();\n\n try {\n const parsed = new URL(url);\n const urlDomain = parsed.hostname.replace(/^www\\./, '').toLowerCase();\n if (urlDomain !== cleanDomain) continue;\n if (RESOURCE_EXTENSIONS.test(parsed.pathname)) continue;\n urls.add(url);\n } catch {\n continue;\n }\n }\n}\n\n// 
─── Internal link extraction ───────────────────────────────────────────────\n\n/**\n * Extract ALL internal links from HTML (not just nav).\n * Returns deduplicated full URLs for the same domain.\n */\nexport function extractInternalLinks(html: string, domain: string): string[] {\n const cleanDomain = domain.replace(/^www\\./, '').toLowerCase();\n const hrefMatches = html.match(/href=\"([^\"]*)\"/gi) || [];\n const urls = new Set<string>();\n\n for (const match of hrefMatches) {\n const href = match.match(/href=\"([^\"]*)\"/i)?.[1];\n if (!href || !href.trim()) continue;\n\n let fullUrl: string;\n\n if (href.startsWith('//')) {\n fullUrl = `https:${href}`;\n } else if (href.startsWith('/')) {\n // Skip fragment-only, query-only, and anchor links\n if (href === '/' || href.startsWith('/#')) continue;\n fullUrl = `https://${domain}${href}`;\n } else if (href.startsWith('http')) {\n fullUrl = href;\n } else if (href.startsWith('#') || href.startsWith('?') || href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('javascript:')) {\n continue;\n } else {\n // Relative path\n fullUrl = `https://${domain}/${href}`;\n }\n\n try {\n const parsed = new URL(fullUrl);\n const linkDomain = parsed.hostname.replace(/^www\\./, '').toLowerCase();\n if (linkDomain !== cleanDomain) continue;\n\n // Strip hash and normalize\n parsed.hash = '';\n const path = parsed.pathname;\n\n if (path === '/' || path === '') continue;\n if (RESOURCE_EXTENSIONS.test(path)) continue;\n if (SKIP_PATH_PATTERNS.test(path)) continue;\n\n // Normalize: strip trailing slash\n const normalized = parsed.origin + path.replace(/\\/+$/, '') + parsed.search;\n urls.add(normalized);\n } catch {\n continue;\n }\n }\n\n return Array.from(urls);\n}\n\n// ─── Category inference ─────────────────────────────────────────────────────\n\nconst CATEGORY_PATTERNS: Array<[RegExp, PageCategory]> = [\n [/\\/(blog|articles?|posts?|news|insights|guides)\\b/i, 'blog'],\n 
[/\\/(about|about-us|company|who-we-are)\\b/i, 'about'],\n [/\\/(pricing|plans|packages)\\b/i, 'pricing'],\n [/\\/(services?|features?|solutions?|products?|what-we-do|offerings?)\\b/i, 'services'],\n [/\\/(contact|contact-us|get-in-touch)\\b/i, 'contact'],\n [/\\/(team|our-team|authors?|people|leadership|staff)\\b/i, 'team'],\n [/\\/(resources?|resource-center|library|downloads?)\\b/i, 'resources'],\n [/\\/(docs?|documentation|help|help-center|support|knowledge-base)\\b/i, 'docs'],\n [/\\/(case-stud\\w*|cases|customers?|success-stor\\w*|testimonials?)\\b/i, 'cases'],\n [/\\/(faq|frequently-asked|questions)\\b/i, 'faq'],\n];\n\n/**\n * Infer PageCategory from URL path patterns.\n */\nexport function inferCategory(url: string): PageCategory {\n try {\n const path = new URL(url).pathname;\n for (const [pattern, category] of CATEGORY_PATTERNS) {\n if (pattern.test(path)) return category;\n }\n } catch {\n // Fall through to default\n }\n return 'content';\n}\n\n// ─── Main crawler ───────────────────────────────────────────────────────────\n\n/**\n * BFS crawl of a site, discovering all internal pages up to maxPages.\n * Seeds from sitemap URLs + homepage internal links.\n * Skips URLs already in siteData.blogSample and homepage.\n */\nexport async function crawlFullSite(\n siteData: SiteData,\n options?: CrawlOptions,\n): Promise<CrawlResult> {\n const startTime = Date.now();\n const maxPages = options?.maxPages ?? 200;\n const timeoutMs = options?.timeoutMs ?? 10000;\n const concurrency = options?.concurrency ?? 5;\n const respectRobots = options?.respectRobots ?? 
true;\n\n const pages: FetchResult[] = [];\n const discoveredUrls = new Set<string>();\n const fetchedUrls = new Set<string>();\n const skippedUrls = new Set<string>();\n const visited = new Set<string>();\n\n // Parse robots.txt rules\n let robotsRules: RobotsRules = { disallow: [], allow: [] };\n if (respectRobots && siteData.robotsTxt?.text) {\n robotsRules = parseRobotsTxt(siteData.robotsTxt.text);\n }\n\n const baseUrl = `${siteData.protocol}://${siteData.domain}`;\n\n // Mark already-fetched URLs as visited\n visited.add(normalizeUrl(baseUrl));\n visited.add(normalizeUrl(baseUrl + '/'));\n if (siteData.blogSample) {\n for (const page of siteData.blogSample) {\n if (page.finalUrl) visited.add(normalizeUrl(page.finalUrl));\n }\n }\n\n // Seed the queue from sitemap\n const queue: string[] = [];\n if (siteData.sitemapXml?.text) {\n const sitemapUrls = await extractAllUrlsFromSitemap(\n siteData.sitemapXml.text,\n siteData.domain,\n timeoutMs,\n );\n for (const url of sitemapUrls) {\n const norm = normalizeUrl(url);\n if (!visited.has(norm)) {\n discoveredUrls.add(url);\n if (!queue.includes(url)) queue.push(url);\n }\n }\n }\n\n // Seed from homepage internal links\n if (siteData.homepage?.text) {\n const homeLinks = extractInternalLinks(siteData.homepage.text, siteData.domain);\n for (const url of homeLinks) {\n const norm = normalizeUrl(url);\n if (!visited.has(norm) && !discoveredUrls.has(url)) {\n discoveredUrls.add(url);\n if (!queue.includes(url)) queue.push(url);\n }\n }\n }\n\n // BFS loop\n while (queue.length > 0 && fetchedUrls.size < maxPages) {\n // Take a batch\n const batchSize = Math.min(concurrency, maxPages - fetchedUrls.size, queue.length);\n const batch: string[] = [];\n\n while (batch.length < batchSize && queue.length > 0) {\n const url = queue.shift()!;\n const norm = normalizeUrl(url);\n\n if (visited.has(norm)) continue;\n visited.add(norm);\n\n // Check robots.txt\n if (respectRobots) {\n try {\n const path = new URL(url).pathname;\n if 
(isDisallowedByRobots(path, robotsRules)) {\n skippedUrls.add(url);\n continue;\n }\n } catch {\n // Skip malformed URLs\n continue;\n }\n }\n\n batch.push(url);\n }\n\n if (batch.length === 0) continue;\n\n // Fetch batch in parallel\n const results = await Promise.all(batch.map(url => fetchPage(url, timeoutMs)));\n\n for (let i = 0; i < results.length; i++) {\n const result = results[i];\n const url = batch[i];\n fetchedUrls.add(url);\n\n if (!result) continue;\n\n result.category = inferCategory(url);\n pages.push(result);\n\n // Extract new internal links from fetched page\n const newLinks = extractInternalLinks(result.text, siteData.domain);\n for (const link of newLinks) {\n const norm = normalizeUrl(link);\n if (!visited.has(norm) && !discoveredUrls.has(link)) {\n discoveredUrls.add(link);\n queue.push(link);\n }\n }\n }\n }\n\n // Any remaining queued URLs count as discovered but skipped\n for (const url of queue) {\n if (!fetchedUrls.has(url)) {\n skippedUrls.add(url);\n }\n }\n\n return {\n pages,\n discoveredUrls: Array.from(discoveredUrls),\n fetchedUrls: Array.from(fetchedUrls),\n skippedUrls: Array.from(skippedUrls),\n elapsed: Math.round((Date.now() - startTime) / 100) / 10,\n };\n}\n\nfunction normalizeUrl(url: string): string {\n try {\n const parsed = new URL(url);\n // Normalize: lowercase host, strip trailing slash, strip hash\n return (parsed.origin + parsed.pathname.replace(/\\/+$/, '') + parsed.search).toLowerCase();\n } catch {\n return url.toLowerCase();\n 
}\n}\n"],"mappings":";AAgCA,IAAM,sBAAsB;AAE5B,IAAM,qBAAqB;AASpB,SAAS,eAAe,YAAiC;AAC9D,QAAM,QAAQ,WAAW,MAAM,IAAI;AACnC,QAAM,QAAqB,EAAE,UAAU,CAAC,GAAG,OAAO,CAAC,EAAE;AAGrD,MAAI,oBAAoB;AAExB,aAAW,WAAW,OAAO;AAC3B,UAAM,OAAO,QAAQ,KAAK;AAC1B,QAAI,CAAC,QAAQ,KAAK,WAAW,GAAG,EAAG;AAEnC,UAAM,UAAU,KAAK,MAAM,sBAAsB;AACjD,QAAI,SAAS;AACX,YAAM,QAAQ,QAAQ,CAAC,EAAE,KAAK,EAAE,YAAY;AAC5C,0BAAoB,UAAU,OAAO,UAAU;AAC/C;AAAA,IACF;AAEA,QAAI,CAAC,kBAAmB;AAExB,UAAM,gBAAgB,KAAK,MAAM,oBAAoB;AACrD,QAAI,eAAe;AACjB,YAAM,OAAO,cAAc,CAAC,EAAE,KAAK;AACnC,UAAI,KAAM,OAAM,SAAS,KAAK,IAAI;AAClC;AAAA,IACF;AAEA,UAAM,aAAa,KAAK,MAAM,iBAAiB;AAC/C,QAAI,YAAY;AACd,YAAM,OAAO,WAAW,CAAC,EAAE,KAAK;AAChC,UAAI,KAAM,OAAM,MAAM,KAAK,IAAI;AAAA,IACjC;AAAA,EACF;AAEA,SAAO;AACT;AAEO,SAAS,qBAAqB,SAAiB,OAA6B;AAEjF,MAAI,eAAe;AACnB,MAAI,kBAAkB;AAEtB,aAAW,WAAW,MAAM,OAAO;AACjC,QAAI,QAAQ,WAAW,OAAO,KAAK,QAAQ,SAAS,cAAc;AAChE,qBAAe,QAAQ;AAAA,IACzB;AAAA,EACF;AAEA,aAAW,WAAW,MAAM,UAAU;AACpC,QAAI,QAAQ,WAAW,OAAO,KAAK,QAAQ,SAAS,iBAAiB;AACnE,wBAAkB,QAAQ;AAAA,IAC5B;AAAA,EACF;AAGA,MAAI,iBAAiB,KAAK,oBAAoB,EAAG,QAAO;AACxD,SAAO,kBAAkB;AAC3B;AAIA,eAAe,UAAU,KAAa,YAAY,KAAoC;AACpF,MAAI;AACF,UAAM,MAAM,MAAM,MAAM,KAAK;AAAA,MAC3B,QAAQ,YAAY,QAAQ,SAAS;AAAA,MACrC,SAAS,EAAE,cAAc,yBAAyB;AAAA,MAClD,UAAU;AAAA,IACZ,CAAC;AACD,QAAI,IAAI,WAAW,IAAK,QAAO;AAC/B,UAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,QAAI,KAAK,SAAS,IAAK,QAAO;AAC9B,WAAO,EAAE,MAAM,KAAK,MAAM,GAAG,GAAO,GAAG,QAAQ,IAAI,QAAQ,UAAU,IAAI,IAAI;AAAA,EAC/E,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEA,eAAe,gBAAgB,KAAa,YAAY,KAA+B;AACrF,MAAI;AACF,UAAM,MAAM,MAAM,MAAM,KAAK;AAAA,MAC3B,QAAQ,YAAY,QAAQ,SAAS;AAAA,MACrC,SAAS,EAAE,cAAc,yBAAyB;AAAA,MAClD,UAAU;AAAA,IACZ,CAAC;AACD,QAAI,IAAI,WAAW,IAAK,QAAO;AAC/B,WAAO,MAAM,IAAI,KAAK;AAAA,EACxB,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAQA,eAAsB,0BACpB,aACA,QACA,YAAY,KACO;AACnB,QAAM,cAAc,OAAO,QAAQ,UAAU,EAAE,EAAE,YAAY;AAC7D,QAAM,OAAO,oBAAI,IAAY;AAG7B,QAAM,iBAAiB,YAAY,MAAM,2DAA2D,KAAK,CAAC;AAC1G,MAAI,eAAe,SAAS,GAAG;AAC7B,UAAM,UAAoB,CAAC;AAC3B,eAAW,SAAS,gBAAgB;AAClC,YAAM,WAAW,MAAM,MAAM,sBAAsB;AACnD,UAAI,SAAU,SAAQ,KAAK,S
AAS,CAAC,EAAE,KAAK,CAAC;AAAA,IAC/C;AAGA,UAAM,UAAU,QAAQ,MAAM,GAAG,EAAE,EAAE,IAAI,OAAK,gBAAgB,GAAG,SAAS,CAAC;AAC3E,UAAM,UAAU,MAAM,QAAQ,IAAI,OAAO;AACzC,eAAW,QAAQ,SAAS;AAC1B,UAAI,MAAM;AACR,2BAAmB,MAAM,aAAa,IAAI;AAAA,MAC5C;AAAA,IACF;AAAA,EACF;AAGA,qBAAmB,aAAa,aAAa,IAAI;AAEjD,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,mBAAmB,KAAa,aAAqB,MAAyB;AACrF,QAAM,aAAa,IAAI,MAAM,mDAAmD,KAAK,CAAC;AACtF,aAAW,SAAS,YAAY;AAC9B,UAAM,WAAW,MAAM,MAAM,sBAAsB;AACnD,QAAI,CAAC,SAAU;AACf,UAAM,MAAM,SAAS,CAAC,EAAE,KAAK;AAE7B,QAAI;AACF,YAAM,SAAS,IAAI,IAAI,GAAG;AAC1B,YAAM,YAAY,OAAO,SAAS,QAAQ,UAAU,EAAE,EAAE,YAAY;AACpE,UAAI,cAAc,YAAa;AAC/B,UAAI,oBAAoB,KAAK,OAAO,QAAQ,EAAG;AAC/C,WAAK,IAAI,GAAG;AAAA,IACd,QAAQ;AACN;AAAA,IACF;AAAA,EACF;AACF;AAQO,SAAS,qBAAqB,MAAc,QAA0B;AAC3E,QAAM,cAAc,OAAO,QAAQ,UAAU,EAAE,EAAE,YAAY;AAC7D,QAAM,cAAc,KAAK,MAAM,kBAAkB,KAAK,CAAC;AACvD,QAAM,OAAO,oBAAI,IAAY;AAE7B,aAAW,SAAS,aAAa;AAC/B,UAAM,OAAO,MAAM,MAAM,iBAAiB,IAAI,CAAC;AAC/C,QAAI,CAAC,QAAQ,CAAC,KAAK,KAAK,EAAG;AAE3B,QAAI;AAEJ,QAAI,KAAK,WAAW,IAAI,GAAG;AACzB,gBAAU,SAAS,IAAI;AAAA,IACzB,WAAW,KAAK,WAAW,GAAG,GAAG;AAE/B,UAAI,SAAS,OAAO,KAAK,WAAW,IAAI,EAAG;AAC3C,gBAAU,WAAW,MAAM,GAAG,IAAI;AAAA,IACpC,WAAW,KAAK,WAAW,MAAM,GAAG;AAClC,gBAAU;AAAA,IACZ,WAAW,KAAK,WAAW,GAAG,KAAK,KAAK,WAAW,GAAG,KAAK,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,MAAM,KAAK,KAAK,WAAW,aAAa,GAAG;AAClJ;AAAA,IACF,OAAO;AAEL,gBAAU,WAAW,MAAM,IAAI,IAAI;AAAA,IACrC;AAEA,QAAI;AACF,YAAM,SAAS,IAAI,IAAI,OAAO;AAC9B,YAAM,aAAa,OAAO,SAAS,QAAQ,UAAU,EAAE,EAAE,YAAY;AACrE,UAAI,eAAe,YAAa;AAGhC,aAAO,OAAO;AACd,YAAM,OAAO,OAAO;AAEpB,UAAI,SAAS,OAAO,SAAS,GAAI;AACjC,UAAI,oBAAoB,KAAK,IAAI,EAAG;AACpC,UAAI,mBAAmB,KAAK,IAAI,EAAG;AAGnC,YAAM,aAAa,OAAO,SAAS,KAAK,QAAQ,QAAQ,EAAE,IAAI,OAAO;AACrE,WAAK,IAAI,UAAU;AAAA,IACrB,QAAQ;AACN;AAAA,IACF;AAAA,EACF;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;AAIA,IAAM,oBAAmD;AAAA,EACvD,CAAC,qDAAqD,MAAM;AAAA,EAC5D,CAAC,4CAA4C,OAAO;AAAA,EACpD,CAAC,iCAAiC,SAAS;AAAA,EAC3C,CAAC,yEAAyE,UAAU;AAAA,EACpF,CAAC,0CAA0C,SAAS;AAAA,EACpD,CAAC,yDAAyD,MAAM;AAAA,EAChE,CAAC,wDAAwD,WAAW;AAAA,EACpE,CAAC,sEAAsE,MAAM;AAAA,EAC7E,CAAC,s
EAAsE,OAAO;AAAA,EAC9E,CAAC,yCAAyC,KAAK;AACjD;AAKO,SAAS,cAAc,KAA2B;AACvD,MAAI;AACF,UAAM,OAAO,IAAI,IAAI,GAAG,EAAE;AAC1B,eAAW,CAAC,SAAS,QAAQ,KAAK,mBAAmB;AACnD,UAAI,QAAQ,KAAK,IAAI,EAAG,QAAO;AAAA,IACjC;AAAA,EACF,QAAQ;AAAA,EAER;AACA,SAAO;AACT;AASA,eAAsB,cACpB,UACA,SACsB;AACtB,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,WAAW,SAAS,YAAY;AACtC,QAAM,YAAY,SAAS,aAAa;AACxC,QAAM,cAAc,SAAS,eAAe;AAC5C,QAAM,gBAAgB,SAAS,iBAAiB;AAEhD,QAAM,QAAuB,CAAC;AAC9B,QAAM,iBAAiB,oBAAI,IAAY;AACvC,QAAM,cAAc,oBAAI,IAAY;AACpC,QAAM,cAAc,oBAAI,IAAY;AACpC,QAAM,UAAU,oBAAI,IAAY;AAGhC,MAAI,cAA2B,EAAE,UAAU,CAAC,GAAG,OAAO,CAAC,EAAE;AACzD,MAAI,iBAAiB,SAAS,WAAW,MAAM;AAC7C,kBAAc,eAAe,SAAS,UAAU,IAAI;AAAA,EACtD;AAEA,QAAM,UAAU,GAAG,SAAS,QAAQ,MAAM,SAAS,MAAM;AAGzD,UAAQ,IAAI,aAAa,OAAO,CAAC;AACjC,UAAQ,IAAI,aAAa,UAAU,GAAG,CAAC;AACvC,MAAI,SAAS,YAAY;AACvB,eAAW,QAAQ,SAAS,YAAY;AACtC,UAAI,KAAK,SAAU,SAAQ,IAAI,aAAa,KAAK,QAAQ,CAAC;AAAA,IAC5D;AAAA,EACF;AAGA,QAAM,QAAkB,CAAC;AACzB,MAAI,SAAS,YAAY,MAAM;AAC7B,UAAM,cAAc,MAAM;AAAA,MACxB,SAAS,WAAW;AAAA,MACpB,SAAS;AAAA,MACT;AAAA,IACF;AACA,eAAW,OAAO,aAAa;AAC7B,YAAM,OAAO,aAAa,GAAG;AAC7B,UAAI,CAAC,QAAQ,IAAI,IAAI,GAAG;AACtB,uBAAe,IAAI,GAAG;AACtB,YAAI,CAAC,MAAM,SAAS,GAAG,EAAG,OAAM,KAAK,GAAG;AAAA,MAC1C;AAAA,IACF;AAAA,EACF;AAGA,MAAI,SAAS,UAAU,MAAM;AAC3B,UAAM,YAAY,qBAAqB,SAAS,SAAS,MAAM,SAAS,MAAM;AAC9E,eAAW,OAAO,WAAW;AAC3B,YAAM,OAAO,aAAa,GAAG;AAC7B,UAAI,CAAC,QAAQ,IAAI,IAAI,KAAK,CAAC,eAAe,IAAI,GAAG,GAAG;AAClD,uBAAe,IAAI,GAAG;AACtB,YAAI,CAAC,MAAM,SAAS,GAAG,EAAG,OAAM,KAAK,GAAG;AAAA,MAC1C;AAAA,IACF;AAAA,EACF;AAGA,SAAO,MAAM,SAAS,KAAK,YAAY,OAAO,UAAU;AAEtD,UAAM,YAAY,KAAK,IAAI,aAAa,WAAW,YAAY,MAAM,MAAM,MAAM;AACjF,UAAM,QAAkB,CAAC;AAEzB,WAAO,MAAM,SAAS,aAAa,MAAM,SAAS,GAAG;AACnD,YAAM,MAAM,MAAM,MAAM;AACxB,YAAM,OAAO,aAAa,GAAG;AAE7B,UAAI,QAAQ,IAAI,IAAI,EAAG;AACvB,cAAQ,IAAI,IAAI;AAGhB,UAAI,eAAe;AACjB,YAAI;AACF,gBAAM,OAAO,IAAI,IAAI,GAAG,EAAE;AAC1B,cAAI,qBAAqB,MAAM,WAAW,GAAG;AAC3C,wBAAY,IAAI,GAAG;AACnB;AAAA,UACF;AAAA,QACF,QAAQ;AAEN;AAAA,QACF;AAAA,MACF;AAEA,YAAM,KAAK,GAAG;AAAA,IAChB;AAEA,QAAI,MAAM,WAAW,EAAG;AAGxB,UAAM,UAAU,MAAM,
QAAQ,IAAI,MAAM,IAAI,SAAO,UAAU,KAAK,SAAS,CAAC,CAAC;AAE7E,aAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,YAAM,SAAS,QAAQ,CAAC;AACxB,YAAM,MAAM,MAAM,CAAC;AACnB,kBAAY,IAAI,GAAG;AAEnB,UAAI,CAAC,OAAQ;AAEb,aAAO,WAAW,cAAc,GAAG;AACnC,YAAM,KAAK,MAAM;AAGjB,YAAM,WAAW,qBAAqB,OAAO,MAAM,SAAS,MAAM;AAClE,iBAAW,QAAQ,UAAU;AAC3B,cAAM,OAAO,aAAa,IAAI;AAC9B,YAAI,CAAC,QAAQ,IAAI,IAAI,KAAK,CAAC,eAAe,IAAI,IAAI,GAAG;AACnD,yBAAe,IAAI,IAAI;AACvB,gBAAM,KAAK,IAAI;AAAA,QACjB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,aAAW,OAAO,OAAO;AACvB,QAAI,CAAC,YAAY,IAAI,GAAG,GAAG;AACzB,kBAAY,IAAI,GAAG;AAAA,IACrB;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,gBAAgB,MAAM,KAAK,cAAc;AAAA,IACzC,aAAa,MAAM,KAAK,WAAW;AAAA,IACnC,aAAa,MAAM,KAAK,WAAW;AAAA,IACnC,SAAS,KAAK,OAAO,KAAK,IAAI,IAAI,aAAa,GAAG,IAAI;AAAA,EACxD;AACF;AAEA,SAAS,aAAa,KAAqB;AACzC,MAAI;AACF,UAAM,SAAS,IAAI,IAAI,GAAG;AAE1B,YAAQ,OAAO,SAAS,OAAO,SAAS,QAAQ,QAAQ,EAAE,IAAI,OAAO,QAAQ,YAAY;AAAA,EAC3F,QAAQ;AACN,WAAO,IAAI,YAAY;AAAA,EACzB;AACF;","names":[]}
|
package/dist/cli.js
CHANGED
|
@@ -1758,7 +1758,11 @@ function extractRawDataSummary(data) {
|
|
|
1758
1758
|
const d = new Date(m[1]);
|
|
1759
1759
|
if (isNaN(d.getTime())) return null;
|
|
1760
1760
|
return Math.floor((Date.now() - d.getTime()) / (1e3 * 60 * 60 * 24));
|
|
1761
|
-
})()
|
|
1761
|
+
})(),
|
|
1762
|
+
// Full-crawl stats
|
|
1763
|
+
crawl_discovered: data.crawlStats?.discovered ?? 0,
|
|
1764
|
+
crawl_fetched: data.crawlStats?.fetched ?? 0,
|
|
1765
|
+
crawl_skipped: data.crawlStats?.skipped ?? 0
|
|
1762
1766
|
};
|
|
1763
1767
|
}
|
|
1764
1768
|
function auditSiteFromData(data) {
|
|
@@ -2737,7 +2741,20 @@ async function audit(domain, options) {
|
|
|
2737
2741
|
}
|
|
2738
2742
|
}
|
|
2739
2743
|
}
|
|
2740
|
-
if (
|
|
2744
|
+
if (options?.fullCrawl) {
|
|
2745
|
+
const { crawlFullSite } = await import("./full-site-crawler-F7J2HRL4.js");
|
|
2746
|
+
const crawlResult = await crawlFullSite(siteData, {
|
|
2747
|
+
maxPages: options.maxPages ?? 200,
|
|
2748
|
+
concurrency: options.concurrency ?? 5
|
|
2749
|
+
});
|
|
2750
|
+
siteData.blogSample = crawlResult.pages;
|
|
2751
|
+
siteData.crawlStats = {
|
|
2752
|
+
discovered: crawlResult.discoveredUrls.length,
|
|
2753
|
+
fetched: crawlResult.fetchedUrls.length,
|
|
2754
|
+
skipped: crawlResult.skippedUrls.length,
|
|
2755
|
+
elapsed: crawlResult.elapsed
|
|
2756
|
+
};
|
|
2757
|
+
} else if (!options?.noMultiPage) {
|
|
2741
2758
|
await fetchMultiPageData(siteData);
|
|
2742
2759
|
}
|
|
2743
2760
|
const results = auditSiteFromData(siteData);
|
|
@@ -3094,7 +3111,7 @@ function generateComparisonHtmlReport(result) {
|
|
|
3094
3111
|
}
|
|
3095
3112
|
|
|
3096
3113
|
// src/cli.ts
|
|
3097
|
-
var VERSION = "1.
|
|
3114
|
+
var VERSION = "1.4.0";
|
|
3098
3115
|
function printHelp() {
|
|
3099
3116
|
console.log(`
|
|
3100
3117
|
aeorank - AI Engine Optimization audit
|
|
@@ -3111,6 +3128,9 @@ function printHelp() {
|
|
|
3111
3128
|
--threshold <N> Score threshold for --ci (default: 70)
|
|
3112
3129
|
--no-headless Skip Puppeteer SPA rendering
|
|
3113
3130
|
--no-multi-page Skip extra page discovery (faster)
|
|
3131
|
+
--full-crawl BFS crawl all discoverable pages
|
|
3132
|
+
--max-pages <N> Max pages for --full-crawl (default: 200)
|
|
3133
|
+
--concurrency <N> Parallel fetches for --full-crawl (default: 5)
|
|
3114
3134
|
--version Print version
|
|
3115
3135
|
--help Show this help
|
|
3116
3136
|
|
|
@@ -3125,16 +3145,17 @@ function printHelp() {
|
|
|
3125
3145
|
`);
|
|
3126
3146
|
}
|
|
3127
3147
|
function parseArgs(argv) {
|
|
3128
|
-
const defaults = { domain: "", domainB: null, json: false, summary: false, html: false, ci: false, threshold: 70, noHeadless: false, noMultiPage: false, version: false, help: false };
|
|
3148
|
+
const defaults = { domain: "", domainB: null, json: false, summary: false, html: false, ci: false, threshold: 70, noHeadless: false, noMultiPage: false, fullCrawl: false, maxPages: 200, concurrency: 5, version: false, help: false };
|
|
3129
3149
|
if (argv.includes("--version") || argv.includes("-v")) {
|
|
3130
3150
|
return { ...defaults, version: true };
|
|
3131
3151
|
}
|
|
3132
3152
|
if (argv.includes("--help") || argv.includes("-h")) {
|
|
3133
3153
|
return { ...defaults, help: true };
|
|
3134
3154
|
}
|
|
3155
|
+
const valuedFlags = ["--threshold", "--max-pages", "--concurrency"];
|
|
3135
3156
|
const nonFlags = [];
|
|
3136
3157
|
for (let i = 0; i < argv.length; i++) {
|
|
3137
|
-
if (argv[i]
|
|
3158
|
+
if (valuedFlags.includes(argv[i])) {
|
|
3138
3159
|
i++;
|
|
3139
3160
|
continue;
|
|
3140
3161
|
}
|
|
@@ -3150,6 +3171,8 @@ function parseArgs(argv) {
|
|
|
3150
3171
|
return argv[idx + 1];
|
|
3151
3172
|
}
|
|
3152
3173
|
const threshold = parseInt(getArg("threshold") || "70", 10);
|
|
3174
|
+
const maxPages = parseInt(getArg("max-pages") || "200", 10);
|
|
3175
|
+
const concurrency = parseInt(getArg("concurrency") || "5", 10);
|
|
3153
3176
|
return {
|
|
3154
3177
|
domain,
|
|
3155
3178
|
domainB,
|
|
@@ -3160,6 +3183,9 @@ function parseArgs(argv) {
|
|
|
3160
3183
|
threshold: isNaN(threshold) ? 70 : threshold,
|
|
3161
3184
|
noHeadless: argv.includes("--no-headless"),
|
|
3162
3185
|
noMultiPage: argv.includes("--no-multi-page"),
|
|
3186
|
+
fullCrawl: argv.includes("--full-crawl"),
|
|
3187
|
+
maxPages: isNaN(maxPages) ? 200 : maxPages,
|
|
3188
|
+
concurrency: isNaN(concurrency) ? 5 : concurrency,
|
|
3163
3189
|
version: false,
|
|
3164
3190
|
help: false
|
|
3165
3191
|
};
|
|
@@ -3241,7 +3267,13 @@ async function main() {
|
|
|
3241
3267
|
}
|
|
3242
3268
|
const log = (msg) => process.stderr.write(`[aeorank] ${msg}
|
|
3243
3269
|
`);
|
|
3244
|
-
const auditOptions = {
|
|
3270
|
+
const auditOptions = {
|
|
3271
|
+
noHeadless: args.noHeadless,
|
|
3272
|
+
noMultiPage: args.noMultiPage,
|
|
3273
|
+
fullCrawl: args.fullCrawl,
|
|
3274
|
+
maxPages: args.maxPages,
|
|
3275
|
+
concurrency: args.concurrency
|
|
3276
|
+
};
|
|
3245
3277
|
try {
|
|
3246
3278
|
if (args.domainB) {
|
|
3247
3279
|
log(`Comparing ${args.domain} vs ${args.domainB}...`);
|