seo-intel 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/.env.example +41 -0
  2. package/LICENSE +75 -0
  3. package/README.md +243 -0
  4. package/Start SEO Intel.bat +9 -0
  5. package/Start SEO Intel.command +8 -0
  6. package/cli.js +3727 -0
  7. package/config/example.json +29 -0
  8. package/config/setup-wizard.js +522 -0
  9. package/crawler/index.js +566 -0
  10. package/crawler/robots.js +103 -0
  11. package/crawler/sanitize.js +124 -0
  12. package/crawler/schema-parser.js +168 -0
  13. package/crawler/sitemap.js +103 -0
  14. package/crawler/stealth.js +393 -0
  15. package/crawler/subdomain-discovery.js +341 -0
  16. package/db/db.js +213 -0
  17. package/db/schema.sql +120 -0
  18. package/exports/competitive.js +186 -0
  19. package/exports/heuristics.js +67 -0
  20. package/exports/queries.js +197 -0
  21. package/exports/suggestive.js +230 -0
  22. package/exports/technical.js +180 -0
  23. package/exports/templates.js +77 -0
  24. package/lib/gate.js +204 -0
  25. package/lib/license.js +369 -0
  26. package/lib/oauth.js +432 -0
  27. package/lib/updater.js +324 -0
  28. package/package.json +68 -0
  29. package/reports/generate-html.js +6194 -0
  30. package/reports/generate-site-graph.js +949 -0
  31. package/reports/gsc-loader.js +190 -0
  32. package/scheduler.js +142 -0
  33. package/seo-audit.js +619 -0
  34. package/seo-intel.png +0 -0
  35. package/server.js +602 -0
  36. package/setup/ROADMAP.md +109 -0
  37. package/setup/checks.js +483 -0
  38. package/setup/config-builder.js +227 -0
  39. package/setup/engine.js +65 -0
  40. package/setup/installers.js +197 -0
  41. package/setup/models.js +328 -0
  42. package/setup/openclaw-bridge.js +329 -0
  43. package/setup/validator.js +395 -0
  44. package/setup/web-routes.js +688 -0
  45. package/setup/wizard.html +2920 -0
  46. package/start-seo-intel.sh +8 -0
@@ -0,0 +1,566 @@
1
+ import { createHash } from 'crypto';
2
+ import { chromium } from 'playwright';
3
+ import { sanitize, extractSelective, extractAsMarkdown } from './sanitize.js';
4
+ import { checkRobots, getCrawlDelay } from './robots.js';
5
+ import { fetchSitemap } from './sitemap.js';
6
+ import { parseJsonLd } from './schema-parser.js';
7
+ import { loadSessionState, saveSessionState, discardSession } from './stealth.js';
8
+
9
// Crawl tuning knobs — all overridable via environment variables.
// FIX: parseInt now always passes an explicit radix of 10.
const CRAWL_DELAY = parseInt(process.env.CRAWL_DELAY_MS || '1500', 10);  // ms between page visits
const MAX_PAGES = parseInt(process.env.CRAWL_MAX_PAGES || '50', 10);     // overall page budget per crawl
const MAX_DEPTH = parseInt(process.env.CRAWL_MAX_DEPTH || '3', 10);      // max link depth from start URL
const TIMEOUT = parseInt(process.env.CRAWL_TIMEOUT_MS || '12000', 10);   // per-navigation timeout (ms)
const PAGE_BUDGET = parseInt(process.env.PAGE_BUDGET_MS || '25000', 10); // hard per-page wall-clock limit (ms)
14
+
15
// ── Content quality gate ────────────────────────────────────────────────
// Heuristics for recognizing pages whose extracted text is not real content:
// bare JS app shells, bot-challenge interstitials, and near-empty pages.
const SHELL_PATTERNS = /id=["'](root|app|__next|__nuxt)["']|<noscript[^>]*>.*enable javascript/i;
const CAPTCHA_PATTERNS = /cf-browser-verification|checking your browser|just a moment|verify you are human|challenge-platform/i;

/**
 * Judge whether an extracted page is worth keeping.
 * @param {{wordCount: number, bodyText: string, title: string}} page
 * @returns {{ok: boolean, reason: string|null}} when `ok` is false, `reason`
 *   is one of 'blocked' (bot challenge), 'js-shell' (unrendered SPA shell),
 *   or 'empty' (fewer than 10 words)
 */
function assessQuality({ wordCount, bodyText, title }) {
  const reject = (reason) => ({ ok: false, reason });

  if (CAPTCHA_PATTERNS.test(bodyText)) return reject('blocked');

  // A titled page with almost no text that still contains SPA mount-point
  // markup is an unrendered JS shell rather than genuinely thin content.
  const looksLikeShell = wordCount < 30 && Boolean(title) && SHELL_PATTERNS.test(bodyText);
  if (looksLikeShell) return reject('js-shell');

  if (wordCount < 10) return reject('empty');

  return { ok: true, reason: null };
}
25
+
26
// ── SECTION TIERS — smart crawl priorities ──────────────────────────────
// Not all pages are equal. Section-aware crawling gets 90% of SEO insight
// at ~15% of full-crawl cost. Per tier: `depth` is the maximum crawl depth
// inside the section; `budget` caps how many of its pages get crawled.
const SECTION_TIERS = {
  skip: {
    // These sections have no SEO value — skip entirely
    patterns: ['/changelog', '/legal', '/tos', '/terms', '/privacy', '/cookie',
               '/cdn-cgi', '/wp-admin', '/wp-json', '/wp-content', '/wp-includes',
               '/_next', '/__', '/admin', '/console', '/account', '/auth',
               '/login', '/signup', '/register', '/onboarding', '/settings'],
    depth: 0,
    budget: 0,
  },
  high: {
    // Conversion-critical — always crawl, moderate depth
    patterns: ['/', '/pricing', '/plans', '/features', '/product', '/solutions',
               '/services', '/about', '/contact', '/demo'],
    depth: 2,
    budget: Infinity, // always included
  },
  core: {
    // Core product content — full depth
    patterns: ['/api', '/rpc', '/platform', '/tools', '/integrations',
               '/resources', '/use-cases', '/customers', '/case-studies'],
    depth: 3,
    budget: 30,
  },
  docs: {
    // Documentation — index + 1 level (skip deep API refs)
    patterns: ['/docs', '/documentation', '/reference', '/guides', '/tutorials',
               '/learn', '/help', '/support', '/knowledge-base', '/kb'],
    depth: 2,
    budget: 15,
  },
  blog: {
    // Blog/news — latest posts only, not full archive
    patterns: ['/blog', '/news', '/articles', '/posts', '/journal',
               '/updates', '/insights', '/content'],
    depth: 1,
    budget: 10,
  },
  default: {
    // Everything else — standard depth
    depth: 3,
    budget: 20,
  },
};

/**
 * Classify a URL into a section tier.
 * @param {string} urlStr absolute URL
 * @returns {{tier: string, section: string, depth: number, budget: number}}
 *   `section` is the matched pattern (or the first path segment for the
 *   `default` tier). Unparseable URLs fall back to `default` with section '/'.
 */
function classifyUrl(urlStr) {
  try {
    const pathname = new URL(urlStr).pathname.toLowerCase();

    // Exact homepage match
    if (pathname === '/' || pathname === '') {
      return { tier: 'high', section: '/', depth: SECTION_TIERS.high.depth, budget: SECTION_TIERS.high.budget };
    }

    // Skip tier wins over everything — never crawl these
    for (const pattern of SECTION_TIERS.skip.patterns) {
      if (pathname.startsWith(pattern)) {
        return { tier: 'skip', section: pattern, depth: 0, budget: 0 };
      }
    }

    // Named tiers in priority order. A pattern matches exactly or as a path
    // prefix. (URL.pathname never contains '?', so the previous
    // `pathname.startsWith(pattern + '?')` check was dead code — removed.)
    for (const tierName of ['high', 'core', 'docs', 'blog']) {
      const tier = SECTION_TIERS[tierName];
      for (const pattern of tier.patterns) {
        if (pattern === '/') continue; // homepage already handled
        if (pathname === pattern || pathname.startsWith(pattern + '/')) {
          return { tier: tierName, section: pattern, depth: tier.depth, budget: tier.budget };
        }
      }
    }

    // Default tier — keyed by first path segment
    const firstSegment = '/' + (pathname.split('/').filter(Boolean)[0] || '');
    return { tier: 'default', section: firstSegment, depth: SECTION_TIERS.default.depth, budget: SECTION_TIERS.default.budget };
  } catch {
    return { tier: 'default', section: '/', depth: SECTION_TIERS.default.depth, budget: SECTION_TIERS.default.budget };
  }
}

/**
 * Apply section-aware ordering + budgeting to sitemap URLs.
 * Prioritizes high-value sections, limits blog/docs, drops junk sections.
 * @param {Array<{url: string, lastmod?: string}>} sitemapUrls
 * @param {number} maxPages overall crawl budget — currently unused here
 *   (retained for interface stability; per-section budgets do the limiting)
 * @returns {Array} classified entries, tier-ordered and per-section capped
 */
function applySectionBudgets(sitemapUrls, maxPages) {
  // 1. Classify each URL, then drop skip-tier entries outright
  const classified = sitemapUrls.map(entry => ({
    ...entry,
    ...classifyUrl(entry.url),
  }));
  const allowed = classified.filter(u => u.tier !== 'skip');

  // 2. Group by section
  const sectionMap = new Map();
  for (const u of allowed) {
    if (!sectionMap.has(u.section)) sectionMap.set(u.section, []);
    sectionMap.get(u.section).push(u);
  }

  // 3. Order sections by tier priority (high first)
  const tierOrder = { high: 0, core: 1, docs: 2, blog: 3, default: 4 };
  const sortedSections = [...sectionMap.entries()].sort((a, b) => {
    const tierA = tierOrder[a[1][0]?.tier] ?? 4;
    const tierB = tierOrder[b[1][0]?.tier] ?? 4;
    return tierA - tierB;
  });

  // 4. Apply per-section budgets
  const result = [];
  for (const [, urls] of sortedSections) {
    const tier = urls[0]?.tier || 'default';
    const budget = SECTION_TIERS[tier]?.budget ?? SECTION_TIERS.default.budget;

    // Blog: newest posts first (lastmod descending; missing lastmod sorts last)
    if (tier === 'blog') {
      urls.sort((a, b) => {
        if (!a.lastmod && !b.lastmod) return 0;
        if (!a.lastmod) return 1;
        if (!b.lastmod) return -1;
        return b.lastmod.localeCompare(a.lastmod);
      });
    }

    const limited = Number.isFinite(budget) ? urls.slice(0, budget) : urls;
    result.push(...limited);
  }

  return result;
}
165
+
166
/**
 * Race `promise` against a deadline of `ms` milliseconds.
 * Settles with the promise's own outcome if it finishes first; otherwise
 * rejects with `Error("Timeout: <label> after <ms>ms")`.
 * FIX: the pending timer is now cleared once the race settles, so a fast
 * promise no longer leaves a live timer keeping the event loop alive.
 * @param {Promise} promise work to bound
 * @param {number} ms deadline in milliseconds
 * @param {string} [label='operation'] name used in the timeout message
 * @returns {Promise} same outcome as `promise`, or a timeout rejection
 */
function withTimeout(promise, ms, label = 'operation') {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(`Timeout: ${label} after ${ms}ms`)), ms);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
173
+
174
/**
 * Short fingerprint of page text for incremental crawling.
 * @param {string} text body text (falsy values hash as the empty string)
 * @returns {string} first 16 hex chars of the SHA-256 digest
 */
function contentHash(text) {
  const hex = createHash('sha256').update(text || '').digest('hex');
  return hex.slice(0, 16);
}
178
+
179
/**
 * Breadth-first crawl of a domain, yielded page-by-page as an async generator.
 *
 * Queue seeding order: llms.txt links (if the site publishes one) →
 * sitemap.xml URLs (section-budgeted when tiered) → links discovered while
 * crawling. Applies exponential backoff on 429/503, stops after repeated
 * WAF-style failures, and respects robots.txt unless stealth mode is on.
 *
 * FIX: `opts` is now passed through to processPage — processPage reads
 * `opts.strictHost`, and previously referenced an out-of-scope `opts`
 * (ReferenceError on every successfully loaded page).
 *
 * @param {string} startUrl fully-qualified URL to start from
 * @param {object} [opts]
 * @param {boolean} [opts.stealth] full-browser stealth mode: skips robots.txt,
 *   persists sessions, uses human-like jittered delays
 * @param {boolean} [opts.tiered=true] section-aware budgets/depth limits
 * @param {boolean} [opts.strictHost] only the exact start hostname is internal
 * @param {number} [opts.maxPages] override CRAWL_MAX_PAGES
 * @param {number} [opts.maxDepth] override CRAWL_MAX_DEPTH
 * @param {number} [opts.sitemapSeedLimit] cap on sitemap URLs enqueued
 * @yields page result objects from processPage; rate-limited pages are not
 *   counted/yielded except a final `{ _blocked: true }` marker on abort
 */
export async function* crawlDomain(startUrl, opts = {}) {
  const base = new URL(startUrl);
  const visited = new Set();
  const queue = [{ url: startUrl, depth: 0 }];
  let count = 0;

  // Some hosted docs platforms block unknown bots. When the hostname contains
  // "docs.", spoof Googlebot's UA to reduce WAF friction.
  const isDocsHostname = base.hostname.toLowerCase().includes('docs.');
  const GOOGLEBOT_UA = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
  const defaultUA = 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://froggo.pro/seo-intel/bot)';
  const effectiveUA = isDocsHostname ? GOOGLEBOT_UA : defaultUA;

  // Optional llms.txt: a curated markdown link index some sites publish.
  // Absolute http(s) links found there seed the crawl queue at depth 1.
  async function tryLoadLlmsTxt() {
    const llmsUrl = `https://${base.hostname}/llms.txt`;
    try {
      const controller = new AbortController();
      const t = setTimeout(() => controller.abort(), Math.min(TIMEOUT, 8000));
      const res = await fetch(llmsUrl, {
        method: 'GET',
        redirect: 'follow',
        signal: controller.signal,
        headers: {
          'user-agent': effectiveUA,
          'accept': 'text/plain,text/markdown;q=0.9,*/*;q=0.1',
        },
      }).finally(() => clearTimeout(t));

      if (!res?.ok) return;
      const text = await res.text();
      if (!text || text.length < 5) return;

      // Extract markdown links: - [Title](url): description
      const urls = [];
      const linkRe = /\[[^\]]*\]\(([^)\s]+)\)/g;
      let m;
      while ((m = linkRe.exec(text))) {
        const u = m[1];
        if (!u) continue;
        // allow absolute http(s) only
        if (!/^https?:\/\//i.test(u)) continue;
        urls.push(u);
      }

      // De-dupe and enqueue
      const unique = [...new Set(urls)];
      let added = 0;
      for (const u of unique) {
        if (!queue.some(q => q.url === u)) {
          queue.push({ url: u, depth: 1 });
          added++;
        }
      }
      if (unique.length > 0) {
        console.log(`[llms.txt] ${base.hostname} — discovered ${unique.length} URLs (${added} added to queue)`);
      }
    } catch {
      // silent: llms.txt is optional
    }
  }

  // ── llms.txt: if present, use it to seed crawl queue first ──
  await tryLoadLlmsTxt();

  const maxPages = Number.isFinite(opts.maxPages) ? opts.maxPages : MAX_PAGES;
  const maxDepth = Number.isFinite(opts.maxDepth) ? opts.maxDepth : MAX_DEPTH;

  // Section budget tracking: section → pages successfully crawled
  const sectionCounts = new Map();
  const tiered = opts.tiered !== false; // tiered crawling on by default

  // ── Sitemap-first: seed queue from sitemap.xml (section-aware) ──
  try {
    const sitemapUrls = await fetchSitemap(startUrl);
    if (sitemapUrls.length > 0) {
      // Apply section budgets if tiered crawling is enabled
      const budgeted = tiered ? applySectionBudgets(sitemapUrls, maxPages) : sitemapUrls;

      const skipped = sitemapUrls.length - budgeted.length;
      console.log(`[sitemap] Found ${sitemapUrls.length} URLs — ${budgeted.length} after section budgets` +
        (skipped > 0 ? ` (${skipped} skipped)` : ''));

      if (tiered && budgeted.length > 0) {
        // Show section breakdown (first 8 sections)
        const sections = new Map();
        for (const u of budgeted) {
          const { tier, section } = classifyUrl(u.url);
          const key = `${section} [${tier}]`;
          sections.set(key, (sections.get(key) || 0) + 1);
        }
        for (const [sec, cnt] of [...sections.entries()].slice(0, 8)) {
          console.log(`  ${sec}: ${cnt} URLs`);
        }
        if (sections.size > 8) console.log(`  ... and ${sections.size - 8} more sections`);
      }

      // Don't enqueue 10k URLs if the crawl budget is tiny.
      const seedLimit = Number.isFinite(opts.sitemapSeedLimit)
        ? opts.sitemapSeedLimit
        : Math.max(maxPages * 2, 50);

      for (const entry of budgeted.slice(0, seedLimit)) {
        if (!queue.some(q => q.url === entry.url) && entry.url !== startUrl) {
          queue.push({ url: entry.url, depth: 1 }); // treat sitemap URLs as depth 1
        }
      }
    }
  } catch (err) {
    console.log(`[sitemap] Could not fetch sitemap: ${err.message}`);
  }

  // ── Backoff tracking per domain ──
  let consecutiveErrors = 0;
  let currentDelay = CRAWL_DELAY;
  let blocked = false;
  const MAX_CONSECUTIVE_ERRORS = 5;

  // ── Browser setup: stealth (full rendering, persisted session) or standard ──
  let browser, context;
  if (opts.stealth) {
    const { getStealthConfig, STEALTH_INIT_SCRIPT, applyStealthRoutes } = await import('./stealth.js');
    const stealthCfg = getStealthConfig();
    browser = await chromium.launch({ headless: true, ...stealthCfg.launchArgs });
    // Try to load a saved session for this domain (returning visitor = less WAF friction)
    const sessionPath = loadSessionState(base.hostname);
    const contextOpts = { ...stealthCfg.contextOpts, userAgent: effectiveUA };
    if (sessionPath) contextOpts.storageState = sessionPath;
    context = await browser.newContext(contextOpts);
    await context.addInitScript(STEALTH_INIT_SCRIPT);
    await applyStealthRoutes(context);
    console.log(`[stealth] 🥷 Advanced mode — full browser rendering, persistent sessions`);
  } else {
    browser = await chromium.launch({ headless: true });
    context = await browser.newContext({
      userAgent: effectiveUA,
      ignoreHTTPSErrors: true,
    });
  }

  try {
    while (queue.length > 0 && count < maxPages && !blocked) {
      const { url, depth } = queue.shift();
      if (visited.has(url)) continue;
      visited.add(url);

      // ── Section tier check — skip junk sections, respect depth limits ──
      if (tiered) {
        const { tier, section, depth: sectionMaxDepth, budget: sectionBudget } = classifyUrl(url);

        // Skip banned sections entirely
        if (tier === 'skip') continue;

        // Per-section depth limit (+1 because sitemap URLs start at depth 1)
        if (depth > sectionMaxDepth + 1) continue;

        // Per-section budget
        if (Number.isFinite(sectionBudget)) {
          const currentCount = sectionCounts.get(section) || 0;
          if (currentCount >= sectionBudget) continue;
        }
      }

      // In stealth mode, skip robots.txt — user explicitly opted into bypass
      let crawlDelayMs = 0;
      if (!opts.stealth) {
        const robotsResult = await checkRobots(url).catch(() => ({ allowed: true, crawlDelayMs: 0 }));
        if (!robotsResult.allowed) {
          console.log(`[robots] Skipping disallowed: ${url}`);
          continue;
        }
        crawlDelayMs = robotsResult.crawlDelayMs || 0;
      }

      const page = await context.newPage();

      try {
        // Hard per-page deadline wrapping everything.
        // FIX: opts is passed so processPage can honor opts.strictHost.
        const pageResult = await withTimeout(
          processPage(page, url, base, depth, queue, maxDepth, opts),
          PAGE_BUDGET,
          url
        );

        if (pageResult) {
          // ── Backoff: check for rate limit / WAF responses ──
          if (pageResult.status === 429 || pageResult.status === 503) {
            consecutiveErrors++;
            currentDelay = Math.min(currentDelay * 2, 30000); // exponential backoff, max 30s
            console.log(`[backoff] ${pageResult.status} on ${url} — delay now ${currentDelay}ms (${consecutiveErrors}/${MAX_CONSECUTIVE_ERRORS})`);
            if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
              blocked = true;
              console.log(`[blocked] ${base.hostname} — too many ${pageResult.status} errors, stopping crawl`);
              // Yield a blocked marker
              yield { ...pageResult, _blocked: true, _blockReason: `${MAX_CONSECUTIVE_ERRORS}x ${pageResult.status}` };
            }
            continue; // don't count rate-limited pages
          }

          if (pageResult.status === 403) {
            consecutiveErrors++;
            // If stealth session caused 3+ consecutive 403s, discard it
            if (opts.stealth && consecutiveErrors >= 3) discardSession(base.hostname);
            console.log(`[blocked] 403 on ${url} (${consecutiveErrors}/${MAX_CONSECUTIVE_ERRORS})`);
            if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
              blocked = true;
              console.log(`[blocked] ${base.hostname} — likely WAF/firewall, stopping crawl`);
              yield { ...pageResult, _blocked: true, _blockReason: `${MAX_CONSECUTIVE_ERRORS}x 403 Forbidden` };
            }
            continue;
          }

          // Success — reset backoff
          consecutiveErrors = 0;
          currentDelay = CRAWL_DELAY;

          // Track section budget
          if (tiered) {
            const { section } = classifyUrl(url);
            sectionCounts.set(section, (sectionCounts.get(section) || 0) + 1);
          }

          count++;
          yield pageResult;
        }
      } catch (err) {
        console.error(`[crawler] Error on ${url}: ${err.message}`);
        consecutiveErrors++;
        if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
          blocked = true;
          console.log(`[blocked] ${base.hostname} — ${MAX_CONSECUTIVE_ERRORS} consecutive failures, stopping`);
        }
      } finally {
        await page.close().catch(() => {});
      }

      // Stealth: jittered human-like delays (2-5s), Standard: configured crawl delay
      const delay = opts.stealth
        ? 2000 + Math.random() * 3000
        : Math.max(crawlDelayMs, currentDelay);
      await new Promise(r => setTimeout(r, delay));
    }
  } finally {
    // Persist stealth session cookies for next run (returning visitor)
    if (opts.stealth && !blocked) await saveSessionState(context, base.hostname);
    await browser.close().catch(() => {});
  }
}
422
+
423
/**
 * Visit one URL with Playwright and extract its SEO-relevant data.
 * Discovered internal links are pushed onto `queue` as a side effect.
 *
 * FIX: previously this function read `opts.strictHost` while `opts` was not
 * in scope (ReferenceError on every page that loaded). `opts` is now an
 * explicit parameter with a backward-compatible `{}` default.
 *
 * @param {import('playwright').Page} page fresh page from the crawl context
 * @param {string} url URL to visit
 * @param {URL} base parsed start URL (internal/external link decisions)
 * @param {number} depth current crawl depth of `url`
 * @param {Array<{url: string, depth: number}>} queue crawl queue (mutated)
 * @param {number} maxDepth links are only enqueued while depth < maxDepth
 * @param {object} [opts] crawl options; only `opts.strictHost` is read here
 * @returns {Promise<object|null>} page result, a stub for 403/429/503 (so the
 *   caller's backoff logic can see the status), or null for other 4xx/5xx
 */
async function processPage(page, url, base, depth, queue, maxDepth, opts = {}) {
  let status = 0;
  const t0 = Date.now();

  // Try domcontentloaded first, fall back to load
  let res;
  for (const waitUntil of ['domcontentloaded', 'load']) {
    try {
      res = await page.goto(url, { waitUntil, timeout: TIMEOUT });
      break;
    } catch (err) {
      if (waitUntil === 'load') throw err;
      console.log(`[crawler] ${waitUntil} failed for ${url}, retrying with load...`);
    }
  }

  status = res?.status() || 0;
  const loadMs = Date.now() - t0;

  // ── Return status for backoff logic (don't silently drop 4xx) ──
  if (status === 429 || status === 503 || status === 403) {
    return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null };
  }
  if (status >= 400) return null;

  const title = await page.title().catch(() => '');
  const metaDesc = await page.$eval('meta[name="description"]', el => el.content).catch(() => '');

  const headings = await page.$$eval('h1,h2,h3,h4,h5,h6', els =>
    els.map(el => ({ level: parseInt(el.tagName[1], 10), text: el.innerText?.trim().slice(0, 200) })).filter(h => h.text)
  ).catch(() => []);

  const links = await page.$$eval('a[href]', (els, baseHref) =>
    els.map(el => {
      try { return { url: new URL(el.href, baseHref).href, anchor: el.innerText?.trim().slice(0, 100) || '' }; }
      catch { return null; }
    }).filter(Boolean), base.href
  ).catch(() => []);

  // NOTE(review): naive root-domain extraction — multi-part TLDs like
  // .co.uk collapse to "co.uk". Acceptable for most targets; confirm if
  // crawling such domains matters.
  const getRootDomain = h => h.split(".").slice(-2).join(".");
  // When strictHost is set (--domain flag), only the exact hostname is internal.
  // Otherwise same root domain counts (so blog.x and docs.x are internal to x).
  const isInternal = (h) => opts.strictHost
    ? h === base.hostname
    : (h === base.hostname || getRootDomain(h) === getRootDomain(base.hostname));
  const internalLinks = links.filter(l => { try { return isInternal(new URL(l.url).hostname); } catch { return false; } }).map(l => ({ ...l, isInternal: true }));
  const externalLinks = links.filter(l => { try { return !isInternal(new URL(l.url).hostname); } catch { return false; } }).map(l => ({ ...l, isInternal: false }));

  // Markdown-first extraction — preserves headings, lists, emphasis. Falls back to selector-based.
  const bodyText = await extractAsMarkdown(page).catch(() => '')
    || await extractSelective(page, ['h1','h2','h3','p','li','span.hero','div.tagline']).catch(() => '');

  const schemaTypes = await page.$$eval('script[type="application/ld+json"]', els => {
    const types = [];
    for (const el of els) { try { const d = JSON.parse(el.textContent); types.push(d['@type']); } catch {} }
    return types.filter(Boolean);
  }).catch(() => []);

  // LCP with a hard 1.5s cap (the observer never completes on its own)
  const vitals = await Promise.race([
    page.evaluate(() => new Promise(resolve => {
      let lcp = null;
      try {
        new PerformanceObserver(list => { lcp = list.getEntries().at(-1)?.startTime || null; })
          .observe({ type: 'largest-contentful-paint', buffered: true });
      } catch {}
      setTimeout(() => resolve({ lcp }), 1000);
    })),
    new Promise(resolve => setTimeout(() => resolve({}), 1500)),
  ]).catch(() => ({}));

  const wordCount = await page.$eval('body', el => el.innerText.split(/\s+/).filter(Boolean).length).catch(() => 0);

  const robotsMeta = await page.$eval('meta[name="robots"]', el => el.content).catch(() => '');
  const isIndexable = !robotsMeta.toLowerCase().includes('noindex');

  // Published date: meta tags first, then JSON-LD
  const publishedDate = await page.evaluate(() => {
    for (const sel of ['meta[property="article:published_time"]','meta[name="date"]','meta[itemprop="datePublished"]']) {
      const el = document.querySelector(sel);
      if (el?.content) return el.content;
    }
    for (const el of document.querySelectorAll('script[type="application/ld+json"]')) {
      try { const d = JSON.parse(el.textContent); if (d.datePublished) return d.datePublished; } catch {}
    }
    return null;
  }).catch(() => null);

  // Modified date: same strategy
  const modifiedDate = await page.evaluate(() => {
    for (const sel of ['meta[property="article:modified_time"]','meta[name="last-modified"]','meta[itemprop="dateModified"]']) {
      const el = document.querySelector(sel);
      if (el?.content) return el.content;
    }
    for (const el of document.querySelectorAll('script[type="application/ld+json"]')) {
      try { const d = JSON.parse(el.textContent); if (d.dateModified) return d.dateModified; } catch {}
    }
    return null;
  }).catch(() => null);

  // Queue new URLs (section-aware: skip junk links early)
  if (depth < maxDepth) {
    for (const link of internalLinks) {
      try {
        const u = new URL(link.url);
        if (/\.(pdf|png|jpg|jpeg|gif|svg|css|js|woff|ico)$/i.test(u.pathname)) continue;
        // Pre-filter: don't even enqueue URLs from skip sections
        const { tier } = classifyUrl(link.url);
        if (tier === 'skip') continue;
        if (!queue.some(q => q.url === link.url)) {
          queue.push({ url: link.url, depth: depth + 1 });
        }
      } catch {
        // unparseable href — best-effort, skip it
      }
    }
  }

  // ── Deep JSON-LD parsing — extract structured schema data from raw HTML ──
  const rawHtml = await page.content().catch(() => '');
  const parsedSchemas = parseJsonLd(rawHtml);

  // ── Content hash for incremental crawling ──
  const hash = contentHash(bodyText);

  // ── Quality gate — detect shells, blocked pages, empty content ──
  const quality = assessQuality({ wordCount, bodyText, title, status });

  return {
    url, depth, status, loadMs, wordCount, isIndexable,
    title, metaDesc, headings,
    links: [...internalLinks, ...externalLinks],
    bodyText: sanitize(bodyText, 2000),
    schemaTypes, parsedSchemas, vitals, publishedDate, modifiedDate,
    contentHash: hash,
    quality: quality.ok, qualityReason: quality.reason,
  };
}
558
+
559
/**
 * Convenience wrapper: run a full crawl and collect every yielded page
 * result into an array (same options/defaults as crawlDomain with none set).
 * @param {string} startUrl fully-qualified URL to start from
 * @returns {Promise<object[]>} all page results in crawl order
 */
export async function crawlAll(startUrl) {
  const results = [];
  for await (const result of crawlDomain(startUrl)) {
    results.push(result);
  }
  return results;
}
564
+
565
+ // Export for use by other modules (content velocity, weekly brief, etc.)
566
+ export { classifyUrl, SECTION_TIERS };
@@ -0,0 +1,103 @@
1
+ /**
2
+ * robots.txt fetcher + parser
3
+ * Checks if we're allowed to crawl a URL and what delay to respect.
4
+ */
5
+
6
+ import fetch from 'node-fetch';
7
+
8
// Per-domain robots.txt cache: hostname → { disallowed, allowed, crawlDelay, fetchedAt }
const cache = new Map();
const CACHE_TTL = 24 * 60 * 60 * 1000; // refetch a domain's robots.txt at most once per 24h
const OUR_AGENT = 'SEOIntelBot'; // token matched against User-agent lines ('*' also applies to us)

/**
 * Fetch and parse robots.txt for a domain, with a 24h in-memory cache.
 * A missing or unreachable robots.txt is treated as "everything allowed"
 * (empty rule set) and cached too, so we don't re-fetch per URL.
 * FIX: the freshly fetched entry is now returned in the same shape as a
 * cache hit (previously the first call lacked `fetchedAt`).
 * @param {string} domain hostname, e.g. "example.com"
 * @returns {Promise<{disallowed: string[], allowed: string[], crawlDelay: number|null, fetchedAt: number}>}
 */
async function fetchRobots(domain) {
  const cached = cache.get(domain);
  if (cached && Date.now() - cached.fetchedAt < CACHE_TTL) return cached;

  const url = `https://${domain}/robots.txt`;
  let text = '';
  try {
    const res = await fetch(url, { timeout: 8000, headers: { 'User-Agent': OUR_AGENT } });
    if (res.ok) text = await res.text();
  } catch {
    // No robots.txt (or network error) = everything allowed
  }

  const entry = { ...parseRobots(text), fetchedAt: Date.now() };
  cache.set(domain, entry);
  return entry;
}

/**
 * Parse robots.txt text into the rule lists that apply to us.
 * Only groups addressed to OUR_AGENT or '*' are honored.
 * FIX: user-agent tokens are now compared case-insensitively, as required
 * by RFC 9309 ("user-agent: seointelbot" previously failed to match).
 * @param {string} text raw robots.txt contents (may be empty)
 * @returns {{disallowed: string[], allowed: string[], crawlDelay: number|null}}
 */
function parseRobots(text) {
  const lines = text.split('\n').map(l => l.trim()).filter(l => l && !l.startsWith('#'));

  let crawlDelay = null;
  const disallowed = [];
  const allowed = [];
  let applies = false; // true while inside a group addressed to us (or '*')

  for (const line of lines) {
    const [key, ...rest] = line.split(':');
    const val = rest.join(':').trim(); // rejoin — values may themselves contain ':'
    const k = key.toLowerCase().trim();

    if (k === 'user-agent') {
      // RFC 9309: product-token matching is case-insensitive
      const agent = val.toLowerCase();
      applies = agent === OUR_AGENT.toLowerCase() || agent === '*';
    } else if (!applies) {
      continue;
    } else if (k === 'disallow' && val) {
      disallowed.push(val);
    } else if (k === 'allow' && val) {
      allowed.push(val);
    } else if (k === 'crawl-delay' && val) {
      const d = parseFloat(val);
      if (!Number.isNaN(d)) crawlDelay = Math.max(d, 1); // clamp to minimum 1s
    }
  }

  return { disallowed, allowed, crawlDelay };
}
65
+
66
/**
 * Check whether we may crawl `url` per its domain's robots.txt.
 * Fails open: any error (bad URL, unreachable robots.txt, parse failure)
 * yields `{ allowed: true, crawlDelayMs: 1500 }` so a broken robots.txt
 * cannot stall the crawl.
 * @param {string} url absolute URL to test
 * @returns {Promise<{allowed: boolean, crawlDelayMs: number}>}
 */
export async function checkRobots(url) {
  try {
    // Parse once — previously the URL was parsed twice.
    const { hostname, pathname } = new URL(url);
    const { disallowed, allowed, crawlDelay } = await fetchRobots(hostname);

    // Disallow rules are simple path prefixes here (no wildcard support).
    for (const rule of disallowed) {
      if (!pathname.startsWith(rule)) continue;
      // A longer (more specific) Allow rule overrides the Disallow.
      const overridden = allowed.some(a => a.length > rule.length && pathname.startsWith(a));
      if (!overridden) return { allowed: false, crawlDelayMs: 0 };
    }

    // robots.txt crawl-delay takes priority, but never go below 1.5s
    const crawlDelayMs = crawlDelay
      ? Math.max(crawlDelay * 1000, 1500)
      : parseInt(process.env.CRAWL_DELAY_MS || '1500', 10);

    return { allowed: true, crawlDelayMs };
  } catch {
    return { allowed: true, crawlDelayMs: 1500 };
  }
}
96
+
97
/**
 * Recommended crawl delay for a domain, in milliseconds.
 * Honors robots.txt Crawl-delay when present; floor of 1500ms either way.
 * @param {string} domain hostname, e.g. "example.com"
 * @returns {Promise<number>} delay in ms, always >= 1500
 */
export async function getCrawlDelay(domain) {
  const robots = await fetchRobots(domain).catch(() => ({ crawlDelay: null }));
  const seconds = robots.crawlDelay;
  if (!seconds) return 1500;
  return Math.max(seconds * 1000, 1500);
}