seo-intel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +41 -0
- package/LICENSE +75 -0
- package/README.md +243 -0
- package/Start SEO Intel.bat +9 -0
- package/Start SEO Intel.command +8 -0
- package/cli.js +3727 -0
- package/config/example.json +29 -0
- package/config/setup-wizard.js +522 -0
- package/crawler/index.js +566 -0
- package/crawler/robots.js +103 -0
- package/crawler/sanitize.js +124 -0
- package/crawler/schema-parser.js +168 -0
- package/crawler/sitemap.js +103 -0
- package/crawler/stealth.js +393 -0
- package/crawler/subdomain-discovery.js +341 -0
- package/db/db.js +213 -0
- package/db/schema.sql +120 -0
- package/exports/competitive.js +186 -0
- package/exports/heuristics.js +67 -0
- package/exports/queries.js +197 -0
- package/exports/suggestive.js +230 -0
- package/exports/technical.js +180 -0
- package/exports/templates.js +77 -0
- package/lib/gate.js +204 -0
- package/lib/license.js +369 -0
- package/lib/oauth.js +432 -0
- package/lib/updater.js +324 -0
- package/package.json +68 -0
- package/reports/generate-html.js +6194 -0
- package/reports/generate-site-graph.js +949 -0
- package/reports/gsc-loader.js +190 -0
- package/scheduler.js +142 -0
- package/seo-audit.js +619 -0
- package/seo-intel.png +0 -0
- package/server.js +602 -0
- package/setup/ROADMAP.md +109 -0
- package/setup/checks.js +483 -0
- package/setup/config-builder.js +227 -0
- package/setup/engine.js +65 -0
- package/setup/installers.js +197 -0
- package/setup/models.js +328 -0
- package/setup/openclaw-bridge.js +329 -0
- package/setup/validator.js +395 -0
- package/setup/web-routes.js +688 -0
- package/setup/wizard.html +2920 -0
- package/start-seo-intel.sh +8 -0
package/crawler/index.js
ADDED
|
@@ -0,0 +1,566 @@
|
|
|
1
|
+
import { createHash } from 'crypto';
|
|
2
|
+
import { chromium } from 'playwright';
|
|
3
|
+
import { sanitize, extractSelective, extractAsMarkdown } from './sanitize.js';
|
|
4
|
+
import { checkRobots, getCrawlDelay } from './robots.js';
|
|
5
|
+
import { fetchSitemap } from './sitemap.js';
|
|
6
|
+
import { parseJsonLd } from './schema-parser.js';
|
|
7
|
+
import { loadSessionState, saveSessionState, discardSession } from './stealth.js';
|
|
8
|
+
|
|
9
|
+
// ── Crawl tuning (env-overridable) ──────────────────────────────────────
// Each value falls back to a hard-coded default when the env var is unset.
// Fix: parse with an explicit base-10 radix (parseInt without a radix is an
// idiom hazard), via Number.parseInt.
const CRAWL_DELAY = Number.parseInt(process.env.CRAWL_DELAY_MS || '1500', 10);  // politeness delay between pages (ms)
const MAX_PAGES = Number.parseInt(process.env.CRAWL_MAX_PAGES || '50', 10);     // overall page cap per crawl
const MAX_DEPTH = Number.parseInt(process.env.CRAWL_MAX_DEPTH || '3', 10);      // link-follow depth from the start URL
const TIMEOUT = Number.parseInt(process.env.CRAWL_TIMEOUT_MS || '12000', 10);   // per-navigation timeout (ms)
const PAGE_BUDGET = Number.parseInt(process.env.PAGE_BUDGET_MS || '25000', 10); // hard per-page wall-clock limit (ms)
|
|
14
|
+
|
|
15
|
+
// ── Content quality gate ────────────────────────────────────────────────
// Markers of an unrendered JS app shell: an empty framework root element
// plus a noscript "enable javascript" notice.
const SHELL_PATTERNS = /id=["'](root|app|__next|__nuxt)["']|<noscript[^>]*>.*enable javascript/i;
// Markers of bot-challenge / CAPTCHA interstitials (Cloudflare et al.).
const CAPTCHA_PATTERNS = /cf-browser-verification|checking your browser|just a moment|verify you are human|challenge-platform/i;

/**
 * Decide whether an extracted page is worth keeping.
 * Checks run in priority order: bot-challenge page, unrendered JS shell,
 * then near-empty content.
 *
 * @param {{wordCount: number, bodyText: string, title: string}} page
 * @returns {{ok: boolean, reason: ?string}} reason is one of
 *          'blocked' | 'js-shell' | 'empty', or null when the page passes.
 */
function assessQuality({ wordCount, bodyText, title }) {
  if (CAPTCHA_PATTERNS.test(bodyText)) {
    return { ok: false, reason: 'blocked' };
  }
  const looksLikeShell = wordCount < 30 && Boolean(title) && SHELL_PATTERNS.test(bodyText);
  if (looksLikeShell) {
    return { ok: false, reason: 'js-shell' };
  }
  if (wordCount < 10) {
    return { ok: false, reason: 'empty' };
  }
  return { ok: true, reason: null };
}
|
|
25
|
+
|
|
26
|
+
// ── SECTION TIERS — smart crawl priorities ──────────────────────────────
// Pages are not created equal: section-aware crawling captures ~90% of the
// SEO insight at roughly 15% of full-crawl cost. Each tier carries a
// path-prefix list, a crawl depth, and a per-section page budget.
const SECTION_TIERS = {
  skip: {
    // These sections have no SEO value — skip entirely
    patterns: ['/changelog', '/legal', '/tos', '/terms', '/privacy', '/cookie',
               '/cdn-cgi', '/wp-admin', '/wp-json', '/wp-content', '/wp-includes',
               '/_next', '/__', '/admin', '/console', '/account', '/auth',
               '/login', '/signup', '/register', '/onboarding', '/settings'],
    depth: 0,
    budget: 0,
  },
  high: {
    // Conversion-critical — always crawl, moderate depth
    patterns: ['/', '/pricing', '/plans', '/features', '/product', '/solutions',
               '/services', '/about', '/contact', '/demo'],
    depth: 2,
    budget: Infinity, // always included
  },
  core: {
    // Core product content — full depth
    patterns: ['/api', '/rpc', '/platform', '/tools', '/integrations',
               '/resources', '/use-cases', '/customers', '/case-studies'],
    depth: 3,
    budget: 30,
  },
  docs: {
    // Documentation — index + 1 level (skip deep API refs)
    patterns: ['/docs', '/documentation', '/reference', '/guides', '/tutorials',
               '/learn', '/help', '/support', '/knowledge-base', '/kb'],
    depth: 2,
    budget: 15,
  },
  blog: {
    // Blog/news — latest posts only, not full archive
    patterns: ['/blog', '/news', '/articles', '/posts', '/journal',
               '/updates', '/insights', '/content'],
    depth: 1,
    budget: 10,
  },
  default: {
    // Everything else — standard depth
    depth: 3,
    budget: 20,
  },
};

/**
 * Classify a URL into a section tier.
 * Resolution order: homepage → skip tier → high/core/docs/blog → default
 * (bucketed by first path segment). An unparsable URL falls back to the
 * default tier rooted at '/'.
 *
 * @param {string} urlStr
 * @returns {{tier: string, section: string, depth: number, budget: number}}
 */
function classifyUrl(urlStr) {
  const fallback = (section) => ({
    tier: 'default',
    section,
    depth: SECTION_TIERS.default.depth,
    budget: SECTION_TIERS.default.budget,
  });

  let pathname;
  try {
    pathname = new URL(urlStr).pathname.toLowerCase();
  } catch {
    return fallback('/');
  }

  // Homepage is always top priority.
  if (pathname === '' || pathname === '/') {
    return { tier: 'high', section: '/', depth: SECTION_TIERS.high.depth, budget: SECTION_TIERS.high.budget };
  }

  // Skip tier wins over everything (plain prefix match, no segment boundary).
  const skipHit = SECTION_TIERS.skip.patterns.find(p => pathname.startsWith(p));
  if (skipHit) {
    return { tier: 'skip', section: skipHit, depth: 0, budget: 0 };
  }

  // Named tiers, most valuable first: exact path or segment-boundary prefix.
  for (const name of ['high', 'core', 'docs', 'blog']) {
    const { patterns, depth, budget } = SECTION_TIERS[name];
    for (const p of patterns) {
      if (p === '/') continue; // homepage already handled above
      if (pathname === p || pathname.startsWith(`${p}/`) || pathname.startsWith(`${p}?`)) {
        return { tier: name, section: p, depth, budget };
      }
    }
  }

  // Anything else: bucket by first path segment.
  const head = pathname.split('/').filter(Boolean)[0] || '';
  return fallback(`/${head}`);
}
|
|
112
|
+
|
|
113
|
+
/**
 * Apply section-aware ordering and per-section caps to sitemap URLs:
 * high-value sections come first, blog/docs are capped, junk sections are
 * dropped entirely.
 *
 * @param {Array<{url: string, lastmod?: string}>} sitemapUrls
 * @param {number} maxPages - accepted for interface compatibility; the
 *        overall page cap is enforced by the caller, not here.
 * @returns {Array} budgeted, priority-ordered entries (classification fields merged in)
 */
function applySectionBudgets(sitemapUrls, maxPages) {
  const TIER_RANK = { high: 0, core: 1, docs: 2, blog: 3, default: 4 };

  // Tag every entry with its tier classification, then drop skip sections.
  const keepable = sitemapUrls
    .map(entry => ({ ...entry, ...classifyUrl(entry.url) }))
    .filter(entry => entry.tier !== 'skip');

  // Bucket entries by section path.
  const bySection = new Map();
  for (const entry of keepable) {
    const bucket = bySection.get(entry.section);
    if (bucket) {
      bucket.push(entry);
    } else {
      bySection.set(entry.section, [entry]);
    }
  }

  // Highest-value tiers first (stable sort: within a tier, sitemap order holds).
  const ordered = [...bySection.entries()].sort(
    (a, b) => (TIER_RANK[a[1][0]?.tier] ?? 4) - (TIER_RANK[b[1][0]?.tier] ?? 4)
  );

  // Cap each section at its tier budget.
  const picked = [];
  for (const [, urls] of ordered) {
    const tier = urls[0]?.tier || 'default';
    const cap = SECTION_TIERS[tier]?.budget ?? SECTION_TIERS.default.budget;

    // Blog sections: newest first, so the cap keeps the most recent posts
    // (entries without a lastmod sink to the end).
    if (tier === 'blog') {
      urls.sort((a, b) => {
        if (!a.lastmod && !b.lastmod) return 0;
        if (!a.lastmod) return 1;
        if (!b.lastmod) return -1;
        return b.lastmod.localeCompare(a.lastmod);
      });
    }

    picked.push(...(Number.isFinite(cap) ? urls.slice(0, cap) : urls));
  }

  return picked;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Race a promise against a deadline.
 * Settles with the promise's own outcome if it finishes within `ms`,
 * otherwise rejects with `Error("Timeout: <label> after <ms>ms")`.
 *
 * Fix: the timeout timer is now cleared once the race settles — previously a
 * fast promise left a dangling setTimeout keeping the event loop alive (and a
 * pending rejection scheduled) for up to `ms`.
 *
 * @param {Promise} promise - operation to bound
 * @param {number} ms - deadline in milliseconds
 * @param {string} [label='operation'] - used in the timeout error message
 * @returns {Promise} outcome of `promise`, or a timeout rejection
 */
function withTimeout(promise, ms, label = 'operation') {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(`Timeout: ${label} after ${ms}ms`)), ms);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
|
|
173
|
+
|
|
174
|
+
/**
 * 16-hex-char SHA-256 prefix of `text` (falsy input hashes the empty string).
 * Used to detect unchanged pages between crawl runs.
 *
 * @param {?string} text
 * @returns {string} first 16 hex chars of the digest
 */
function contentHash(text) {
  const digest = createHash('sha256').update(text || '').digest('hex');
  return digest.slice(0, 16);
}
|
|
178
|
+
|
|
179
|
+
/**
 * Breadth-first crawl of a single domain, yielding one result object per
 * successfully processed page (shape produced by processPage). When the crawl
 * aborts due to repeated 403/429/503 responses, the final yielded object
 * carries `_blocked: true` and `_blockReason`.
 *
 * Seeding order: llms.txt links (if present) → sitemap.xml (section-budgeted
 * when tiered) → link discovery during the crawl itself.
 *
 * @param {string} startUrl - absolute URL where the crawl begins
 * @param {object} [opts]
 * @param {number} [opts.maxPages] - overrides CRAWL_MAX_PAGES env default
 * @param {number} [opts.maxDepth] - overrides CRAWL_MAX_DEPTH env default
 * @param {boolean} [opts.tiered=true] - section-aware priorities and budgets
 * @param {number} [opts.sitemapSeedLimit] - cap on sitemap URLs enqueued
 * @param {boolean} [opts.stealth] - full stealth browser mode: robots.txt is
 *        NOT consulted, sessions persist per domain, delays are jittered
 */
export async function* crawlDomain(startUrl, opts = {}) {
  const base = new URL(startUrl);
  const visited = new Set();
  // BFS frontier; NOTE(review): membership checks use queue.some (O(n) per
  // enqueue) — fine at these page budgets, quadratic on very large seeds.
  const queue = [{ url: startUrl, depth: 0 }];
  let count = 0;

  // ── Docs domains: some hosted docs platforms block unknown bots.
  // When hostname contains "docs.", spoof Googlebot UA to reduce WAF friction.
  const isDocsHostname = base.hostname.toLowerCase().includes('docs.');
  const GOOGLEBOT_UA = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
  const defaultUA = 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://froggo.pro/seo-intel/bot)';
  const effectiveUA = isDocsHostname ? GOOGLEBOT_UA : defaultUA;

  // Best-effort fetch of /llms.txt; any absolute http(s) links found there are
  // enqueued at depth 1. Failures are silent — the file is optional.
  async function tryLoadLlmsTxt() {
    const llmsUrl = `https://${base.hostname}/llms.txt`;
    try {
      // Abort the fetch after min(TIMEOUT, 8s).
      const controller = new AbortController();
      const t = setTimeout(() => controller.abort(), Math.min(TIMEOUT, 8000));
      const res = await fetch(llmsUrl, {
        method: 'GET',
        redirect: 'follow',
        signal: controller.signal,
        headers: {
          'user-agent': effectiveUA,
          'accept': 'text/plain,text/markdown;q=0.9,*/*;q=0.1',
        },
      }).finally(() => clearTimeout(t));

      if (!res?.ok) return;
      const text = await res.text();
      if (!text || text.length < 5) return;

      // Extract markdown links: - [Title](url): description
      const urls = [];
      const linkRe = /\[[^\]]*\]\(([^)\s]+)\)/g;
      let m;
      while ((m = linkRe.exec(text))) {
        const u = m[1];
        if (!u) continue;
        // allow absolute http(s) only
        if (!/^https?:\/\//i.test(u)) continue;
        urls.push(u);
      }

      // De-dupe and enqueue
      const unique = [...new Set(urls)];
      let added = 0;
      for (const u of unique) {
        if (!queue.some(q => q.url === u)) {
          queue.push({ url: u, depth: 1 });
          added++;
        }
      }
      if (unique.length > 0) {
        console.log(`[llms.txt] ${base.hostname} — discovered ${unique.length} URLs (${added} added to queue)`);
      }
    } catch {
      // silent: llms.txt is optional
    }
  }

  // ── llms.txt: if present, use it to seed crawl queue first ──
  await tryLoadLlmsTxt();

  const maxPages = Number.isFinite(opts.maxPages) ? opts.maxPages : MAX_PAGES;
  const maxDepth = Number.isFinite(opts.maxDepth) ? opts.maxDepth : MAX_DEPTH;

  // ── Section budget tracking ──
  const sectionCounts = new Map(); // section → pages crawled
  const tiered = opts.tiered !== false; // tiered crawling on by default

  // ── Sitemap-first: seed queue from sitemap.xml (section-aware) ──
  try {
    const sitemapUrls = await fetchSitemap(startUrl);
    if (sitemapUrls.length > 0) {
      // Apply section budgets if tiered crawling is enabled
      const budgeted = tiered ? applySectionBudgets(sitemapUrls, maxPages) : sitemapUrls;

      const skipped = sitemapUrls.length - budgeted.length;
      console.log(`[sitemap] Found ${sitemapUrls.length} URLs — ${budgeted.length} after section budgets` +
        (skipped > 0 ? ` (${skipped} skipped)` : ''));

      if (tiered && budgeted.length > 0) {
        // Show section breakdown (first 8 sections only, to keep logs short)
        const sections = new Map();
        for (const u of budgeted) {
          const { tier, section } = classifyUrl(u.url);
          const key = `${section} [${tier}]`;
          sections.set(key, (sections.get(key) || 0) + 1);
        }
        for (const [sec, cnt] of [...sections.entries()].slice(0, 8)) {
          console.log(`  ${sec}: ${cnt} URLs`);
        }
        if (sections.size > 8) console.log(`  ... and ${sections.size - 8} more sections`);
      }

      // Don't enqueue 10k URLs if the crawl budget is tiny.
      const seedLimit = Number.isFinite(opts.sitemapSeedLimit)
        ? opts.sitemapSeedLimit
        : Math.max(maxPages * 2, 50);

      for (const entry of budgeted.slice(0, seedLimit)) {
        if (!queue.some(q => q.url === entry.url) && entry.url !== startUrl) {
          queue.push({ url: entry.url, depth: 1 }); // treat sitemap URLs as depth 1
        }
      }
    }
  } catch (err) {
    console.log(`[sitemap] Could not fetch sitemap: ${err.message}`);
  }

  // ── Backoff tracking per domain ──
  let consecutiveErrors = 0;      // resets to 0 on any successful page
  let currentDelay = CRAWL_DELAY; // doubles on 429/503, capped at 30s
  let blocked = false;            // once true, the crawl loop stops
  const MAX_CONSECUTIVE_ERRORS = 5;

  // ── Advanced mode: full browser rendering with enhanced compatibility ──
  let browser, context;
  if (opts.stealth) {
    // Lazy import keeps stealth machinery out of the standard path.
    const { getStealthConfig, STEALTH_INIT_SCRIPT, applyStealthRoutes } = await import('./stealth.js');
    const stealthCfg = getStealthConfig();
    browser = await chromium.launch({ headless: true, ...stealthCfg.launchArgs });
    // Try to load a saved session for this domain (returning visitor = less WAF friction)
    const sessionPath = loadSessionState(base.hostname);
    const contextOpts = { ...stealthCfg.contextOpts, userAgent: effectiveUA };
    if (sessionPath) contextOpts.storageState = sessionPath;
    context = await browser.newContext(contextOpts);
    await context.addInitScript(STEALTH_INIT_SCRIPT);
    await applyStealthRoutes(context);
    console.log(`[stealth] 🥷 Advanced mode — full browser rendering, persistent sessions`);
  } else {
    browser = await chromium.launch({ headless: true });
    context = await browser.newContext({
      userAgent: effectiveUA,
      ignoreHTTPSErrors: true,
    });
  }

  try {
    while (queue.length > 0 && count < maxPages && !blocked) {
      const { url, depth } = queue.shift();
      if (visited.has(url)) continue;
      visited.add(url);

      // ── Section tier check — skip junk sections, respect depth limits ──
      if (tiered) {
        const { tier, section, depth: sectionMaxDepth, budget: sectionBudget } = classifyUrl(url);

        // Skip banned sections entirely
        if (tier === 'skip') continue;

        // Check per-section depth limit (section-relative depth, not global)
        if (depth > sectionMaxDepth + 1) continue; // +1 because sitemap URLs start at depth 1

        // Check per-section budget
        if (Number.isFinite(sectionBudget)) {
          const currentCount = sectionCounts.get(section) || 0;
          if (currentCount >= sectionBudget) continue;
        }
      }

      // In stealth mode, skip robots.txt — user explicitly opted into bypass
      let crawlDelayMs = 0;
      if (!opts.stealth) {
        const robotsResult = await checkRobots(url).catch(() => ({ allowed: true, crawlDelayMs: 0 }));
        if (!robotsResult.allowed) {
          console.log(`[robots] Skipping disallowed: ${url}`);
          continue;
        }
        crawlDelayMs = robotsResult.crawlDelayMs || 0;
      }

      const page = await context.newPage();

      try {
        // Hard per-page deadline wrapping everything
        const pageResult = await withTimeout(processPage(page, url, base, depth, queue, maxDepth), PAGE_BUDGET, url);

        if (pageResult) {
          // ── Backoff: check for rate limit / WAF responses ──
          if (pageResult.status === 429 || pageResult.status === 503) {
            consecutiveErrors++;
            currentDelay = Math.min(currentDelay * 2, 30000); // exponential backoff, max 30s
            console.log(`[backoff] ${pageResult.status} on ${url} — delay now ${currentDelay}ms (${consecutiveErrors}/${MAX_CONSECUTIVE_ERRORS})`);
            if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
              blocked = true;
              console.log(`[blocked] ${base.hostname} — too many ${pageResult.status} errors, stopping crawl`);
              // Yield a blocked marker
              yield { ...pageResult, _blocked: true, _blockReason: `${MAX_CONSECUTIVE_ERRORS}x ${pageResult.status}` };
            }
            continue; // don't count rate-limited pages
          }

          if (pageResult.status === 403) {
            consecutiveErrors++;
            // If stealth session caused 3+ consecutive 403s, discard it
            if (opts.stealth && consecutiveErrors >= 3) discardSession(base.hostname);
            console.log(`[blocked] 403 on ${url} (${consecutiveErrors}/${MAX_CONSECUTIVE_ERRORS})`);
            if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
              blocked = true;
              console.log(`[blocked] ${base.hostname} — likely WAF/firewall, stopping crawl`);
              yield { ...pageResult, _blocked: true, _blockReason: `${MAX_CONSECUTIVE_ERRORS}x 403 Forbidden` };
            }
            continue;
          }

          // Success — reset backoff
          consecutiveErrors = 0;
          currentDelay = CRAWL_DELAY;

          // Track section budget
          if (tiered) {
            const { section } = classifyUrl(url);
            sectionCounts.set(section, (sectionCounts.get(section) || 0) + 1);
          }

          count++;
          yield pageResult;
        }
      } catch (err) {
        // Navigation failure or PAGE_BUDGET timeout — counts toward the
        // consecutive-error circuit breaker.
        console.error(`[crawler] Error on ${url}: ${err.message}`);
        consecutiveErrors++;
        if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
          blocked = true;
          console.log(`[blocked] ${base.hostname} — ${MAX_CONSECUTIVE_ERRORS} consecutive failures, stopping`);
        }
      } finally {
        await page.close().catch(() => {});
      }

      // Stealth: jittered human-like delays (2-5s), Standard: configured crawl delay
      const delay = opts.stealth
        ? 2000 + Math.random() * 3000
        : Math.max(crawlDelayMs, currentDelay);
      await new Promise(r => setTimeout(r, delay));
    }
  } finally {
    // Persist stealth session cookies for next run (returning visitor)
    if (opts.stealth && !blocked) await saveSessionState(context, base.hostname);
    await browser.close().catch(() => {});
  }
}
|
|
422
|
+
|
|
423
|
+
/**
 * Load one URL in the given Playwright page and extract its SEO signals:
 * title, meta description, headings, links, body text (markdown-first),
 * JSON-LD schema, LCP, published/modified dates, and a content hash.
 * Newly discovered internal links are pushed onto the shared `queue`.
 *
 * FIX: added the trailing `opts = {}` parameter. The function previously
 * referenced `opts.strictHost` without `opts` being in scope anywhere,
 * throwing a ReferenceError on every successfully loaded page. The default
 * keeps the existing 6-argument call site working (strictHost falsy →
 * root-domain matching, the documented non-strict default).
 *
 * @param {import('playwright').Page} page - fresh page from the crawl context
 * @param {string} url - absolute URL to fetch
 * @param {URL} base - parsed start URL; anchors internal-link detection
 * @param {number} depth - crawl depth of `url`
 * @param {Array<{url: string, depth: number}>} queue - shared frontier (mutated)
 * @param {number} maxDepth - global cap for enqueuing discovered links
 * @param {{strictHost?: boolean}} [opts] - `strictHost` restricts "internal"
 *        to the exact start hostname (--domain flag); NOTE(review): the call
 *        site in crawlDomain does not yet forward its opts — confirm and pass
 *        them through so strictHost takes effect.
 * @returns {Promise<?object>} page result, or null for other 4xx statuses
 */
async function processPage(page, url, base, depth, queue, maxDepth, opts = {}) {
  let status = 0;
  const t0 = Date.now();

  // Try the fast 'domcontentloaded' wait first, fall back to full 'load'.
  let res;
  for (const waitUntil of ['domcontentloaded', 'load']) {
    try {
      res = await page.goto(url, { waitUntil, timeout: TIMEOUT });
      break;
    } catch (err) {
      if (waitUntil === 'load') throw err;
      console.log(`[crawler] ${waitUntil} failed for ${url}, retrying with load...`);
    }
  }

  status = res?.status() || 0;
  const loadMs = Date.now() - t0;

  // ── Rate-limit / WAF statuses are returned (not dropped) so the caller's
  // backoff logic can react to them ──
  if (status === 429 || status === 503 || status === 403) {
    return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null };
  }
  if (status >= 400) return null;

  const title = await page.title().catch(() => '');
  const metaDesc = await page.$eval('meta[name="description"]', el => el.content).catch(() => '');

  const headings = await page.$$eval('h1,h2,h3,h4,h5,h6', els =>
    els.map(el => ({ level: parseInt(el.tagName[1], 10), text: el.innerText?.trim().slice(0, 200) })).filter(h => h.text)
  ).catch(() => []);

  const links = await page.$$eval('a[href]', (els, baseHref) =>
    els.map(el => {
      try { return { url: new URL(el.href, baseHref).href, anchor: el.innerText?.trim().slice(0, 100) || '' }; }
      catch { return null; }
    }).filter(Boolean), base.href
  ).catch(() => []);

  // Naive root domain: last two labels. NOTE(review): wrong for multi-part
  // public suffixes like .co.uk — confirm acceptable for target domains.
  const getRootDomain = h => h.split(".").slice(-2).join(".");
  // BUG-006: When strictHost is set (--domain flag), only exact hostname match is internal.
  // Otherwise, same root domain = internal (so blog.x and docs.x are internal to x).
  const isInternal = (h) => opts.strictHost
    ? h === base.hostname
    : (h === base.hostname || getRootDomain(h) === getRootDomain(base.hostname));
  const internalLinks = links.filter(l => { try { return isInternal(new URL(l.url).hostname); } catch { return false; } }).map(l => ({ ...l, isInternal: true }));
  const externalLinks = links.filter(l => { try { return !isInternal(new URL(l.url).hostname); } catch { return false; } }).map(l => ({ ...l, isInternal: false }));

  // Markdown-first extraction — preserves headings, lists, emphasis. Falls back to selector-based.
  const bodyText = await extractAsMarkdown(page).catch(() => '')
    || await extractSelective(page, ['h1','h2','h3','p','li','span.hero','div.tagline']).catch(() => '');

  // Shallow JSON-LD @type survey (deep parse happens below via parseJsonLd).
  const schemaTypes = await page.$$eval('script[type="application/ld+json"]', els => {
    const types = [];
    for (const el of els) { try { const d = JSON.parse(el.textContent); types.push(d['@type']); } catch {} }
    return types.filter(Boolean);
  }).catch(() => []);

  // LCP with a hard 1.5s cap so a silent PerformanceObserver can't hang the crawl.
  const vitals = await Promise.race([
    page.evaluate(() => new Promise(resolve => {
      let lcp = null;
      try {
        new PerformanceObserver(list => { lcp = list.getEntries().at(-1)?.startTime || null; })
          .observe({ type: 'largest-contentful-paint', buffered: true });
      } catch {}
      setTimeout(() => resolve({ lcp }), 1000);
    })),
    new Promise(resolve => setTimeout(() => resolve({}), 1500)),
  ]).catch(() => ({}));

  const wordCount = await page.$eval('body', el => el.innerText.split(/\s+/).filter(Boolean).length).catch(() => 0);

  const robotsMeta = await page.$eval('meta[name="robots"]', el => el.content).catch(() => '');
  const isIndexable = !robotsMeta.toLowerCase().includes('noindex');

  // Published date: meta tags first, then JSON-LD datePublished.
  const publishedDate = await page.evaluate(() => {
    for (const sel of ['meta[property="article:published_time"]','meta[name="date"]','meta[itemprop="datePublished"]']) {
      const el = document.querySelector(sel);
      if (el?.content) return el.content;
    }
    for (const el of document.querySelectorAll('script[type="application/ld+json"]')) {
      try { const d = JSON.parse(el.textContent); if (d.datePublished) return d.datePublished; } catch {}
    }
    return null;
  }).catch(() => null);

  // Modified date: same two-step strategy as publishedDate.
  const modifiedDate = await page.evaluate(() => {
    for (const sel of ['meta[property="article:modified_time"]','meta[name="last-modified"]','meta[itemprop="dateModified"]']) {
      const el = document.querySelector(sel);
      if (el?.content) return el.content;
    }
    for (const el of document.querySelectorAll('script[type="application/ld+json"]')) {
      try { const d = JSON.parse(el.textContent); if (d.dateModified) return d.dateModified; } catch {}
    }
    return null;
  }).catch(() => null);

  // Queue new internal links (section-aware: junk sections never enter the frontier).
  if (depth < maxDepth) {
    for (const link of internalLinks) {
      try {
        const u = new URL(link.url);
        // Skip obvious asset URLs.
        if (/\.(pdf|png|jpg|jpeg|gif|svg|css|js|woff|ico)$/i.test(u.pathname)) continue;
        // Pre-filter: don't even enqueue URLs from skip sections.
        const { tier } = classifyUrl(link.url);
        if (tier === 'skip') continue;
        if (!queue.some(q => q.url === link.url)) {
          queue.push({ url: link.url, depth: depth + 1 });
        }
      } catch {
        // unparsable href — ignore
      }
    }
  }

  // ── Deep JSON-LD parsing — extract structured schema data from raw HTML ──
  const rawHtml = await page.content().catch(() => '');
  const parsedSchemas = parseJsonLd(rawHtml);

  // ── Content hash for incremental crawling ──
  const hash = contentHash(bodyText);

  // ── Quality gate — detect shells, blocked pages, empty content ──
  const quality = assessQuality({ wordCount, bodyText, title, status });

  return {
    url, depth, status, loadMs, wordCount, isIndexable,
    title, metaDesc, headings,
    links: [...internalLinks, ...externalLinks],
    bodyText: sanitize(bodyText, 2000),
    schemaTypes, parsedSchemas, vitals, publishedDate, modifiedDate,
    contentHash: hash,
    quality: quality.ok, qualityReason: quality.reason,
  };
}
|
|
558
|
+
|
|
559
|
+
export async function crawlAll(startUrl) {
|
|
560
|
+
const pages = [];
|
|
561
|
+
for await (const page of crawlDomain(startUrl)) pages.push(page);
|
|
562
|
+
return pages;
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
// Export for use by other modules (content velocity, weekly brief, etc.)
|
|
566
|
+
export { classifyUrl, SECTION_TIERS };
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* robots.txt fetcher + parser
|
|
3
|
+
* Checks if we're allowed to crawl a URL and what delay to respect.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import fetch from 'node-fetch';
|
|
7
|
+
|
|
8
|
+
// In-memory per-domain cache of parsed robots.txt results.
const cache = new Map(); // domain → { rules, crawlDelay, fetchedAt }
const CACHE_TTL = 24 * 60 * 60 * 1000; // 24h
// User-agent token this crawler matches against robots.txt groups (also sent
// as the User-Agent header when fetching robots.txt itself).
const OUR_AGENT = 'SEOIntelBot';
|
|
11
|
+
|
|
12
|
+
/**
 * Fetch and parse robots.txt for a domain, with a 24h in-memory cache.
 * Always resolves to { disallowed, allowed, crawlDelay, fetchedAt }; a fetch
 * failure or missing robots.txt yields empty rule lists (everything allowed).
 *
 * Fix (consistency): the freshly-fetched entry is now returned in the same
 * shape as a cache hit — previously the first call returned an object
 * without `fetchedAt` while subsequent calls included it.
 *
 * @param {string} domain - bare hostname, e.g. "example.com"
 */
async function fetchRobots(domain) {
  const cached = cache.get(domain);
  if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) return cached;

  const url = `https://${domain}/robots.txt`;
  let text = '';
  try {
    // NOTE(review): `timeout` is a node-fetch v2 option (ignored by v3 /
    // global fetch) — confirm the installed node-fetch major version.
    const res = await fetch(url, { timeout: 8000, headers: { 'User-Agent': OUR_AGENT } });
    if (res.ok) text = await res.text();
  } catch {
    // Unreachable or missing robots.txt — treated as "everything allowed".
  }

  const entry = { ...parseRobots(text), fetchedAt: Date.now() };
  cache.set(domain, entry);
  return entry;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Parse robots.txt text into the rules that apply to our crawler.
 *
 * Fixes (per RFC 9309 group semantics):
 * - User-agent matching is now case-insensitive (tokens are case-insensitive
 *   in the spec; the old exact compare missed e.g. "seointelbot").
 * - Consecutive `User-agent:` lines now form a single shared rule group; the
 *   old parser reset the match on every UA line, so a group listing our agent
 *   first and another agent second lost its rules.
 * - Dropped the redundant `inAllBlock` flag (it was always implied by the
 *   `*` branch of the main flag).
 *
 * Generalization: the agent token is an optional parameter (defaults to the
 * module's OUR_AGENT) so the parser can be exercised for other agents.
 *
 * @param {string} text - raw robots.txt contents ('' for none)
 * @param {string} [agent=OUR_AGENT] - user-agent token to match
 * @returns {{disallowed: string[], allowed: string[], crawlDelay: ?number}}
 *          crawlDelay is in seconds, floored at 1, or null when absent.
 */
function parseRobots(text, agent = OUR_AGENT) {
  const lines = text.split('\n').map(l => l.trim()).filter(l => l && !l.startsWith('#'));
  const agentLower = agent.toLowerCase();

  let crawlDelay = null;
  const disallowed = [];
  const allowed = [];
  let applies = false;   // does the current rule group apply to us?
  let prevWasUa = false; // true while inside a run of consecutive User-agent lines

  for (const line of lines) {
    const [key, ...rest] = line.split(':');
    const val = rest.join(':').trim();
    const k = key.toLowerCase().trim();

    if (k === 'user-agent') {
      const matches = val === '*' || val.toLowerCase() === agentLower;
      // Consecutive UA lines extend the same group; a UA line after rules
      // starts a fresh group.
      applies = prevWasUa ? (applies || matches) : matches;
      prevWasUa = true;
      continue;
    }
    prevWasUa = false;

    if (!applies || !val) continue; // empty Disallow: means "allow all" — skip

    if (k === 'disallow') {
      disallowed.push(val);
    } else if (k === 'allow') {
      allowed.push(val);
    } else if (k === 'crawl-delay') {
      const d = parseFloat(val);
      if (!Number.isNaN(d)) crawlDelay = Math.max(d, 1); // minimum 1s
    }
  }

  return { disallowed, allowed, crawlDelay };
}
|
|
65
|
+
|
|
66
|
+
/**
 * Check whether we're allowed to crawl a URL per its host's robots.txt.
 * Fails open: any error (bad URL, fetch failure) yields { allowed: true }.
 *
 * Fixes: the URL is now parsed once instead of twice; env fallback parses
 * base-10 explicitly.
 *
 * NOTE(review): rule matching is plain prefix matching — robots.txt `*` and
 * `$` wildcards are not supported; confirm target sites don't rely on them.
 *
 * @param {string} url
 * @returns {Promise<{allowed: boolean, crawlDelayMs: number}>}
 */
export async function checkRobots(url) {
  try {
    const { hostname, pathname } = new URL(url);
    const { disallowed, allowed, crawlDelay } = await fetchRobots(hostname);

    // A Disallow prefix blocks the path unless a longer Allow prefix overrides it.
    for (const rule of disallowed) {
      if (pathname.startsWith(rule)) {
        const overridden = allowed.some(a => a.length > rule.length && pathname.startsWith(a));
        if (!overridden) return { allowed: false, crawlDelayMs: 0 };
      }
    }

    // crawlDelay from robots.txt takes priority, min 1.5s always.
    const crawlDelayMs = crawlDelay
      ? Math.max(crawlDelay * 1000, 1500)
      : Number.parseInt(process.env.CRAWL_DELAY_MS || '1500', 10);

    return { allowed: true, crawlDelayMs };
  } catch {
    return { allowed: true, crawlDelayMs: 1500 };
  }
}
|
|
96
|
+
|
|
97
|
+
/**
 * Recommended inter-request delay for a domain, in milliseconds.
 * Uses the robots.txt Crawl-delay when available (floored at 1.5s);
 * defaults to 1.5s when absent or unfetchable.
 *
 * @param {string} domain - bare hostname
 * @returns {Promise<number>} delay in ms
 */
export async function getCrawlDelay(domain) {
  const { crawlDelay } = await fetchRobots(domain).catch(() => ({ crawlDelay: null }));
  if (!crawlDelay) return 1500;
  return Math.max(crawlDelay * 1000, 1500);
}
|