seo-intel 1.5.45 → 1.5.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/mcp/server.js CHANGED
@@ -32,8 +32,17 @@ import { readProgress } from '../lib/progress.js';
32
32
  import { getProblems, getProblemCounts, markProblemStatus, getActiveStatusMap, PROBLEM_CATEGORIES, PROBLEM_STATUSES } from '../lib/problems.js';
33
33
 
34
34
  import { runAeoAnalysis, persistAeoScores, upsertCitabilityInsights } from '../analyses/aeo/index.js';
35
+ import { fetchAiAccessForDomains } from '../analyses/aeo/ai-access.js';
36
+ import { runTechnicalAudit } from '../analysis/technical-audit.js';
37
+ // NOTE: model-suggestion helpers (setup/models.js, setup/checks.js) are loaded
38
+ // lazily inside the suggest_models handler, NOT imported at top level — to keep
39
+ // the setup subtree (and anything it transitively pulls) off the MCP boot path.
35
40
  import { prescore, extractDraftTopic } from '../analyses/blog-draft/prescorer.js';
36
- import { lightCrawl } from '../crawler/light.js';
41
+ // NOTE: lightCrawl (crawler/light.js) is loaded lazily inside the crawl_site
42
+ // handler, NOT imported at top level. Its chain pulls turndown
43
+ // (light.js → html-extract.js → sanitize.js → turndown), and a slow/hanging
44
+ // turndown import would otherwise block the entire MCP stdio boot — no tools,
45
+ // no banner, no handshake. Keep the crawler subtree off the boot path.
37
46
  import { runContentLoop } from '../analyses/loop/orchestrator.js';
38
47
  import { gatherBlogDraftContext, buildBlogDraftPrompt } from '../analyses/blog-draft/index.js';
39
48
 
@@ -383,6 +392,9 @@ server.registerTool(
383
392
  },
384
393
  async ({ url, max_pages, include_citability, same_origin }) => {
385
394
  try {
395
+ // Lazy-load the crawler subtree (pulls turndown) only when crawl_site is
396
+ // actually invoked — keeps it off the MCP boot path. See note at top.
397
+ const { lightCrawl } = await import('../crawler/light.js');
386
398
  const r = await lightCrawl(url, {
387
399
  maxPages: max_pages ?? 10,
388
400
  includeCitability: include_citability ?? false,
@@ -502,21 +514,31 @@ server.registerTool(
502
514
  server.registerTool(
503
515
  'run_citability_audit',
504
516
  {
505
- description: 'Run AEO citability scoring across all crawled pages (6 signals: entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage). Persists scores to citability_scores and upserts citability_gap insights into the ledger. Pure function — fast, no LLM calls. Free tier — analysis of your own site is free.',
517
+ description: 'Run AEO citability scoring across all crawled pages (7 signals: entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage, and AI-crawler access). Also checks robots.txt per target domain — if it blocks answer-engine crawlers (ClaudeBot / GPTBot / PerplexityBot / Google-Extended), affected pages are gated low because AI assistants literally cannot read them. Persists scores to citability_scores and upserts citability_gap insights into the ledger. Free tier — analysis of your own site is free.',
506
518
  inputSchema: {
507
519
  project: z.string(),
508
520
  include_competitors: z.boolean().optional().describe('Score competitor pages too (default true)'),
521
+ check_ai_access: z.boolean().optional().describe('Fetch robots.txt per target domain to score AI-crawler access (default true). The only network call this tool makes; set false to keep it fully offline.'),
509
522
  },
510
523
  },
511
- async ({ project, include_competitors = true }) => {
524
+ async ({ project, include_competitors = true, check_ai_access = true }) => {
512
525
  if (!loadProjectConfig(project)) {
513
526
  return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
514
527
  }
515
528
  try {
516
529
  const db = getDb();
517
- const results = runAeoAnalysis(db, project, { includeCompetitors: include_competitors, log: () => {} });
530
+ let aiAccessByDomain = null;
531
+ if (check_ai_access) {
532
+ const targetDomains = db
533
+ .prepare("SELECT DISTINCT domain FROM domains WHERE project = ? AND role IN ('target','owned')")
534
+ .all(project).map(r => r.domain);
535
+ if (targetDomains.length) {
536
+ try { aiAccessByDomain = await fetchAiAccessForDomains(targetDomains); } catch { /* best-effort */ }
537
+ }
538
+ }
539
+ const results = runAeoAnalysis(db, project, { includeCompetitors: include_competitors, aiAccessByDomain, log: () => {} });
518
540
  persistAeoScores(db, results);
519
- upsertCitabilityInsights(db, project, results.target);
541
+ upsertCitabilityInsights(db, project, results.target, results.summary.aiAccess);
520
542
  const competitorPageCount = [...results.competitors.values()].reduce((a, list) => a + list.length, 0);
521
543
  const avgTargetScore = results.target.length
522
544
  ? Math.round(results.target.reduce((s, p) => s + p.score, 0) / results.target.length)
@@ -532,6 +554,8 @@ server.registerTool(
532
554
  target_pages_scored: results.target.length,
533
555
  competitor_pages_scored: competitorPageCount,
534
556
  avg_target_score: avgTargetScore,
557
+ ai_access: results.summary.aiAccess,
558
+ ai_access_gated_pages: results.summary.gatedPages,
535
559
  low_score_target_pages: lowScorePages,
536
560
  hint: 'Scores persisted to DB. Call get_intel(project, for=audit) to see the full citability matrix + insights ledger.',
537
561
  };
@@ -545,6 +569,226 @@ server.registerTool(
545
569
  }
546
570
  );
547
571
 
572
+ // ── Tool: tech_audit (FREE) ───────────────────────────────────────────────
573
// Registers the `tech_audit` MCP tool: runs the technical audit over data that
// is already in the local DB and returns per-domain findings sorted by severity.
server.registerTool(
  'tech_audit',
  {
    description: [
      'Run the technical SEO audit on already-crawled data for a project — titles, meta descriptions, noindex/robots conflicts, redirect chains, canonical issues, and sitemap-vs-crawl diff. Returns severity-sorted findings (error / warn / info) with the affected URL and a description each.',
      '',
      'Reads from the local DB (no re-crawl). Optionally runs live HEAD checks against sitemap URLs (network) to catch broken/redirected entries. Free tier — covers your own target/owned domains.',
    ].join('\n'),
    inputSchema: {
      project: z.string().describe('Project slug. Use list_projects to discover.'),
      domain: z.string().optional().describe('Audit a single domain. Omit to audit all target/owned domains in the project.'),
      sitemap_head: z.boolean().optional().describe('Also run live HEAD checks against sitemap URLs (network-heavy). Default false.'),
      limit: z.number().int().positive().max(200).optional().describe('Max findings to return per domain (default 60).'),
    },
  },
  /**
   * Tool handler.
   *
   * @param {object} args
   * @param {string} args.project - project slug; must resolve via loadProjectConfig.
   * @param {string} [args.domain] - audit only this domain instead of the project's target/owned set.
   * @param {boolean} [args.sitemap_head] - forwarded to runTechnicalAudit as `runSitemapHead`.
   * @param {number} [args.limit=60] - max findings returned per domain.
   * @returns MCP tool result; `isError: true` for an unknown project, no domains, or a thrown error.
   */
  async ({ project, domain, sitemap_head, limit = 60 }) => {
    if (!loadProjectConfig(project)) {
      return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
    }
    try {
      const db = getDb();
      // DISTINCT mirrors the domain query used by run_citability_audit, so a
      // duplicated domains row can never trigger the same audit twice.
      const domainRows = domain
        ? [{ domain }]
        : db.prepare("SELECT DISTINCT domain FROM domains WHERE project = ? AND role IN ('target','owned')").all(project);
      if (!domainRows.length) {
        return { content: [{ type: 'text', text: `No target/owned domains found for project "${project}".` }], isError: true };
      }
      // Lower rank sorts first; unknown severities sink to the bottom (rank 3).
      const order = { error: 0, warn: 1, info: 2 };
      const domains = [];
      // Sequential on purpose: one audit at a time against the shared DB handle.
      for (const { domain: d } of domainRows) {
        const res = await runTechnicalAudit(db, { project, domain: d, runSitemapHead: !!sitemap_head });
        if (res.error) {
          // A per-domain failure is reported in place; the other domains still run.
          domains.push({ domain: d, error: res.error });
          continue;
        }
        const allFindings = res.findings || [];
        const findings = [...allFindings]
          .sort((a, b) => (order[a.severity] ?? 3) - (order[b.severity] ?? 3))
          .slice(0, limit)
          .map(f => ({ severity: f.severity, type: f.type, url: f.url || null, details: f.details }));
        domains.push({ domain: d, stats: res.stats, findings, findings_truncated: allFindings.length > limit });
      }
      const out = {
        ok: true,
        project,
        domains,
        hint: 'Findings read from the local crawl DB. Re-run `run_crawl` then this tool to verify fixes cleared. For AI-citability gaps, use run_citability_audit; for the prioritized fix queue, use list_problems.',
      };
      return { content: [{ type: 'text', text: JSON.stringify(out, null, 2) }], structuredContent: out };
    } catch (err) {
      return { content: [{ type: 'text', text: `seo-intel error: ${err.message}` }], isError: true };
    }
  }
);
623
+
624
+ // ── Tool: suggest_models (FREE) ───────────────────────────────────────────
625
// Registers the `suggest_models` MCP tool: recommends a local extraction model
// from detected (or user-supplied) VRAM and the models Ollama reports installed.
server.registerTool(
  'suggest_models',
  {
    description: [
      'Suggest LOCAL extraction models for the user\'s machine — the small models seo-intel runs once per crawled page to pull structured SEO data. Detects GPU/VRAM and which models are already in Ollama, then recommends from the curated set (Gemma 4 E2B / E4B / 12B, Qwen 3.5 4B / 9B).',
      '',
      'IMPORTANT: extraction should be done with a LOCAL model. The response always includes a cloud disclaimer — surface it to the user. Cloud extraction sends every page off-machine, costs money at scale, and rate-limits; a 4–8B local model handles this task well, offline. Free tier.',
    ].join('\n'),
    inputSchema: {
      vram_gb: z.number().positive().optional().describe('Override detected VRAM (GB). Omit to auto-detect the host GPU/unified memory.'),
    },
  },
  /**
   * Tool handler.
   *
   * @param {object} args
   * @param {number} [args.vram_gb] - VRAM override in GB; when omitted, detectVRAM() is tried.
   * @returns MCP tool result with hardware, recommendation, suggestions and the cloud disclaimer.
   */
  async ({ vram_gb }) => {
    try {
      // Setup helpers are imported lazily so they stay off the MCP boot path.
      const { suggestExtractionModels, CLOUD_EXTRACTION_DISCLAIMER } = await import('../setup/models.js');

      let vramMB = 0;
      let gpuName = null;
      if (vram_gb) {
        vramMB = Math.round(vram_gb * 1024);
        gpuName = 'user-specified';
      } else {
        try {
          const { detectVRAM } = await import('../setup/checks.js');
          const detected = detectVRAM();
          vramMB = detected.vramMB || 0;
          gpuName = detected.gpuName || null;
        } catch { /* detection unavailable — VRAM stays unknown (0) */ }
      }

      // Best-effort probe of the local Ollama daemon. The timer is cleared in
      // `finally`, so it neither leaks when fetch rejects nor stops guarding
      // before the response body has been read.
      let installed = [];
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), 1500);
      try {
        const r = await fetch('http://localhost:11434/api/tags', { signal: controller.signal });
        if (r.ok) {
          const d = await r.json();
          installed = (d.models || []).map(m => m.name);
        }
      } catch {
        /* Ollama not reachable */
      } finally {
        clearTimeout(timer);
      }

      const { suggestions, recommendedId } = suggestExtractionModels(vramMB, installed);
      const out = {
        hardware: { gpu: gpuName, vram_gb: vramMB ? +(vramMB / 1024).toFixed(1) : null },
        recommended: recommendedId,
        install_hint: recommendedId ? `ollama pull ${recommendedId}` : null,
        suggestions,
        cloud_disclaimer: CLOUD_EXTRACTION_DISCLAIMER,
        note: 'Extraction should be done with a LOCAL model — show cloud_disclaimer to the user before suggesting any cloud option.',
      };
      return { content: [{ type: 'text', text: JSON.stringify(out, null, 2) }], structuredContent: out };
    } catch (err) {
      return { content: [{ type: 'text', text: `seo-intel error: ${err.message}` }], isError: true };
    }
  }
);
668
+
669
+ // ── Tool: setup_project (FREE — project creation from chat) ───────────────
670
+ // Closes the "setting up" gap: before this, projects could only be created via
671
+ // the CLI/web wizard. An agent can now take a user from zero → configured →
672
+ // crawled → audited entirely in chat.
673
// Registers the `setup_project` MCP tool: builds, validates and writes a project
// config from chat-supplied fields, refusing to clobber an existing one unless asked.
server.registerTool(
  'setup_project',
  {
    description: [
      'Create (or update) a SEO Intel project from chat — no CLI wizard needed. Writes the project config that run_crawl / run_citability_audit / tech_audit / get_intel operate on.',
      '',
      'Minimum: project_name + target_url. Add competitors to unlock the Solo competitive surface later. Industry/audience/goal feed the analysis prompts — better context, better insights. Use suggest_models first to pick a local extraction model for the user\'s hardware.',
      '',
      'Refuses to overwrite an existing project unless overwrite=true. Free tier.',
    ].join('\n'),
    inputSchema: {
      project_name: z.string().describe('Human name — slugified for the project id (e.g. "Carbium Docs" → carbium-docs).'),
      target_url: z.string().describe('The site to optimize (scheme optional).'),
      site_name: z.string().optional().describe('Brand/site display name (defaults to project_name).'),
      industry: z.string().optional().describe('What the site/business does — feeds analysis context.'),
      audience: z.string().optional().describe('Who the site serves — feeds analysis context.'),
      goal: z.string().optional().describe('What success looks like — feeds analysis context.'),
      competitors: z.array(z.string()).optional().describe('Competitor URLs/domains to track (Solo features use these).'),
      owned: z.array(z.string()).optional().describe('Other owned domains/subdomains to include.'),
      pages_per_domain: z.number().int().positive().optional().describe('Max pages per domain per crawl (default 50).'),
      extraction_model: z.string().optional().describe('Local extraction model tag (e.g. gemma4:e4b). Get a recommendation from suggest_models.'),
      overwrite: z.boolean().optional().describe('Allow overwriting an existing project config (default false).'),
    },
  },
  /**
   * Tool handler. Returns an MCP error result when the project already exists
   * (and overwrite is false), when validation fails, or when anything throws.
   */
  async ({ project_name, target_url, site_name, industry, audience, goal, competitors = [], owned = [], pages_per_domain, extraction_model, overwrite = false }) => {
    // Shared shape for every failure this handler reports.
    const failure = (text) => ({ content: [{ type: 'text', text }], isError: true });
    // Config-builder expects `{ url }` entries rather than bare strings.
    const asUrlEntries = (list) => list.map(u => ({ url: u }));

    try {
      // Loaded lazily so the setup subtree stays off the MCP boot path.
      const { buildProjectConfig, writeProjectConfig, validateConfig, slugify } = await import('../setup/config-builder.js');

      const slug = slugify(project_name);
      const configFile = join(CONFIG_DIR, `${slug}.json`);
      if (existsSync(configFile) && !overwrite) {
        return failure(`Project "${slug}" already exists. Pass overwrite=true to replace it, or use list_projects to see what's configured.`);
      }

      const config = buildProjectConfig({
        projectName: project_name,
        targetUrl: target_url,
        siteName: site_name || project_name,
        industry: industry || '',
        audience: audience || '',
        goal: goal || '',
        competitors: asUrlEntries(competitors),
        owned: asUrlEntries(owned),
        pagesPerDomain: pages_per_domain || 50,
        extractionModel: extraction_model,
      });

      const { valid, errors } = validateConfig(config);
      if (!valid) {
        return failure(`Config validation failed: ${errors.join('; ')}`);
      }

      const written = writeProjectConfig(config, ROOT);
      const domainsOf = (entries) => (entries || []).map(e => e.domain);
      const out = {
        ok: true,
        project: config.project,
        config_path: written.path,
        overwritten: written.overwritten,
        target: config.target?.domain,
        competitors: domainsOf(config.competitors),
        owned: domainsOf(config.owned),
        extraction_model: config.crawl?.extractionModel || '(default)',
        hint: `Project ready. Next: run_crawl("${config.project}") to crawl, then run_citability_audit + tech_audit + list_problems. For a local extraction model, see suggest_models.`,
      };
      return { content: [{ type: 'text', text: JSON.stringify(out, null, 2) }], structuredContent: out };
    } catch (err) {
      return failure(`seo-intel error: ${err.message}`);
    }
  }
);
742
+
743
+ // ── Tool: scan_site (PAID — one-shot full audit, no config) ───────────────
744
+ // Mirrors `seo-intel scan <domain>`: crawl → extract → analyze → export. It is
745
+ // heavyweight (browser crawl + extraction + cloud analysis), so it runs as a
746
+ // detached subprocess like run_crawl and returns the report path to poll.
747
// Registers the `scan_site` MCP tool: spawns `cli.js scan <domain>` as a detached
// background process and returns immediately with the expected report path.
server.registerTool(
  'scan_site',
  {
    description: [
      'One-shot full SEO audit of any domain with no project setup — crawl → extract → analyze → export. Spawns a detached background job (like run_crawl) and returns immediately with the report path; poll get_crawl_status for progress.',
      '',
      'Heavyweight: full browser crawl, local extraction, and cloud analysis. For a fast, ephemeral, offline read of a single URL use crawl_site instead. Paid tier (Solo).',
    ].join('\n'),
    inputSchema: {
      domain: z.string().describe('Domain or URL to audit (e.g. "docs.carbium.sh").'),
      pages: z.number().int().positive().max(500).optional().describe('Max pages to crawl (default 100).'),
      stealth: z.boolean().optional().describe('Enable stealth browser mode for JS-heavy / anti-bot sites.'),
      no_ai: z.boolean().optional().describe('Skip the AI-enriched export (deterministic markdown only).'),
      model: z.enum(['gemini', 'claude', 'gpt']).optional().describe('Model for analysis + AI export (default gemini).'),
    },
  },
  /**
   * Tool handler.
   *
   * @param {object} args
   * @param {string} args.domain - domain or URL; scheme, path and leading `www.` are stripped.
   * @param {number} [args.pages] - forwarded as `--pages`.
   * @param {boolean} [args.stealth] - forwarded as `--stealth`.
   * @param {boolean} [args.no_ai] - forwarded as `--no-ai`.
   * @param {string} [args.model] - forwarded as `--model`.
   * @returns MCP tool result: the paid gate, an error, or `{ started, pid, domain, command, report_path, hint }`.
   */
  async ({ domain, pages, stealth, no_ai, model }) => {
    if (!isPro()) return paidGate('scan_site');
    const progress = readProgress();
    if (progress?.status === 'running') {
      return { content: [{ type: 'text', text: `A seo-intel job is already running (command="${progress.command}", pid=${progress.pid}). Wait or call get_crawl_status.` }], isError: true };
    }

    const bare = domain.trim().replace(/^https?:\/\//, '').replace(/\/.*$/, '').replace(/^www\./, '');
    // `bare` becomes a positional CLI argument. An empty value would start a scan
    // with no target, and a leading "-" would be parsed by the CLI as a flag
    // (e.g. "--help") instead of a domain, so refuse both before spawning.
    if (!bare || bare.startsWith('-')) {
      return { content: [{ type: 'text', text: `Invalid domain "${domain}". Pass a hostname or URL such as "docs.carbium.sh".` }], isError: true };
    }

    const args = ['cli.js', 'scan', bare];
    if (pages) args.push('--pages', String(pages));
    if (stealth) args.push('--stealth');
    if (no_ai) args.push('--no-ai');
    if (model) args.push('--model', model);

    // Detached with stdio ignored and unref'd, so the scan is not tied to this
    // server's event loop or output streams.
    const child = spawn(process.execPath, args, { cwd: ROOT, detached: true, stdio: 'ignore' });
    child.unref();

    // NOTE(review): this path is derived here from the domain and today's UTC date,
    // not reported by the CLI; confirm it matches the name `cli.js scan` writes.
    const reportPath = join(ROOT, 'reports', `scan-${bare.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-${new Date().toISOString().slice(0, 10)}.md`);
    const result = {
      started: true,
      pid: child.pid,
      domain: bare,
      command: `node ${args.join(' ')}`,
      report_path: reportPath,
      hint: 'Scan is running detached (crawl → extract → analyze → export). Poll get_crawl_status; when status="completed" read the markdown at report_path. The ephemeral project is "_scan-<domain>" — tech_audit/run_citability_audit can be run against it once the crawl lands.',
    };
    return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }], structuredContent: result };
  }
);
791
+
548
792
  // ── Tool: get_competitor_positioning (PAID) ───────────────────────────────
549
793
  server.registerTool(
550
794
  'get_competitor_positioning',
@@ -989,7 +1233,7 @@ async function main() {
989
1233
  const transport = new StdioServerTransport();
990
1234
  await server.connect(transport);
991
1235
  // stderr is fine; the host typically surfaces this in its MCP logs panel.
992
- console.error(`[seo-intel-mcp] v${VERSION} ready on stdio. 17 tools — free: crawl_site (ad-hoc, any URL, no config), run_content_loop (gap→draft→close), list_projects, list_problems, mark_problem_status, get_intel(raw/audit/blog), get_pages, list_keywords, get_headings, run_crawl, get_crawl_status, ingest_insight, run_citability_audit, prescore_draft, draft_blog_prompt, export_intel (own-site tables); Solo (competitor synthesis): get_competitor_positioning, get_intel(competitor), export_intel (analyses table).`);
1236
+ console.error(`[seo-intel-mcp] v${VERSION} ready on stdio. 21 tools — free: setup_project (zero→configured from chat), crawl_site (ad-hoc, any URL, no config), run_content_loop (gap→draft→close), list_projects, list_problems, mark_problem_status, get_intel(raw/audit/blog), get_pages, list_keywords, get_headings, run_crawl, get_crawl_status, ingest_insight, run_citability_audit (now with AI-crawler access), tech_audit, suggest_models (local-first), prescore_draft, draft_blog_prompt, export_intel (own-site tables); Solo: scan_site (one-shot full audit), get_competitor_positioning, get_intel(competitor), export_intel (analyses table).`);
993
1237
  }
994
1238
 
995
1239
  main().catch(err => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "seo-intel",
3
- "version": "1.5.45",
3
+ "version": "1.5.50",
4
4
  "description": "Local Ahrefs-style SEO competitor intelligence. Crawl → SQLite → cloud analysis.",
5
5
  "type": "module",
6
6
  "license": "SEE LICENSE IN LICENSE",
package/seo-intel.png CHANGED
Binary file
package/server.js CHANGED
@@ -140,11 +140,55 @@ function getProjects() {
140
140
  .filter(Boolean);
141
141
  }
142
142
 
143
+ // ── Security: loopback-only gate (anti DNS-rebinding + cross-origin/CSRF) ──
144
+ //
145
+ // This server binds 127.0.0.1, but any web page you visit can still fire
146
+ // requests at localhost. Two checks close that whole class:
147
+ // • Host — a DNS-rebinding request arrives carrying the ATTACKER's domain as
148
+ // Host (not localhost), so requiring a loopback Host defeats it.
149
+ // • Origin — a cross-origin page sends its own Origin; requiring a loopback
150
+ // Origin (when present) blocks cross-origin reads and CSRF.
151
+ // Same-origin dashboard use is unaffected: same-origin GET/SSE either sends no
152
+ // Origin or sends our own loopback Origin.
153
// Hostnames accepted as "this machine" by the loopback gate.
const LOCAL_HOSTS = new Set(['localhost', '127.0.0.1', '::1']);

/**
 * Reduce a Host header value (or a URL's `host`) to a bare, lowercase hostname.
 *
 * Handles `host`, `host:port`, `[ipv6]`, `[ipv6]:port`, an unbracketed IPv6
 * literal, and a fully-qualified name with a trailing dot (`localhost.`).
 *
 * @param {unknown} h - raw header value; falsy input yields ''.
 * @returns {string} the normalised hostname, or '' when nothing usable was given.
 */
function normHost(h) {
  if (!h) return '';
  const raw = String(h).trim().toLowerCase();

  // [::1]:port → ::1 (an unterminated bracket keeps everything after "[").
  if (raw.startsWith('[')) {
    const close = raw.indexOf(']');
    return close > 0 ? raw.slice(1, close) : raw.slice(1);
  }

  // More than one ":" without brackets cannot be host:port, so treat it as a
  // bare IPv6 literal. Splitting on ":" would reduce "::1" to ''.
  if (raw.indexOf(':') !== raw.lastIndexOf(':')) return raw;

  const host = raw.split(':')[0]; // host:port → host
  // "localhost." is the fully-qualified spelling of the same name.
  return host.endsWith('.') ? host.slice(0, -1) : host;
}

/**
 * Loopback-only gate: true when the request targets a loopback Host and, if an
 * Origin header is present, that Origin is loopback too.
 *
 * @param {{ headers: Record<string, string | undefined> }} req - incoming request.
 * @returns {boolean} whether the request should be served.
 */
function isLocalRequest(req) {
  // A DNS-rebinding request carries the attacker's hostname in Host.
  const host = normHost(req.headers.host);
  if (!host || !LOCAL_HOSTS.has(host)) return false;

  // NOTE(review): the literal Origin "null" is accepted here. Browsers send it
  // from opaque origins (sandboxed iframes, data:/file: documents), which a
  // hostile page can create, so state-changing routes get no CSRF protection
  // from this check for such requests. Confirm whether that exemption is needed.
  const origin = req.headers.origin;
  if (origin && origin !== 'null') {
    try {
      if (!LOCAL_HOSTS.has(normHost(new URL(origin).host))) return false;
    } catch {
      return false; // unparseable Origin → reject
    }
  }
  return true;
}
172
+
143
173
  // ── Request handler ──
144
174
  async function handleRequest(req, res) {
145
175
  const url = new URL(req.url, `http://localhost:${PORT}`);
146
176
  const path = url.pathname;
147
177
 
178
+ // Security headers on every response (clickjacking + MIME-sniffing). The
179
+ // frame-ancestors directive only governs who may iframe us — it does NOT
180
+ // restrict the dashboard's own CDN resources, so it is safe to set globally.
181
+ res.setHeader('X-Frame-Options', 'DENY');
182
+ res.setHeader('X-Content-Type-Options', 'nosniff');
183
+ res.setHeader('Content-Security-Policy', "frame-ancestors 'none'");
184
+
185
+ // Loopback-only gate — reject anything not local before any routing happens.
186
+ if (!isLocalRequest(req)) {
187
+ res.writeHead(403, { 'Content-Type': 'text/plain' });
188
+ res.end('Forbidden: SEO Intel only accepts requests from localhost.');
189
+ return;
190
+ }
191
+
148
192
  // ─── Setup wizard routes ───
149
193
  if (path.startsWith('/setup') || path.startsWith('/api/setup/')) {
150
194
  try {
@@ -1162,12 +1206,13 @@ ${md}`;
1162
1206
  args.push('--save');
1163
1207
  }
1164
1208
 
1165
- // SSE headers
1209
+ // SSE headers — no CORS: the dashboard is same-origin, and the loopback
1210
+ // gate already blocks cross-origin callers. (Removed Access-Control-Allow-Origin:*
1211
+ // which previously let any website read this command-execution stream.)
1166
1212
  res.writeHead(200, {
1167
1213
  'Content-Type': 'text/event-stream',
1168
1214
  'Cache-Control': 'no-cache',
1169
1215
  'Connection': 'keep-alive',
1170
- 'Access-Control-Allow-Origin': '*',
1171
1216
  });
1172
1217
 
1173
1218
  const send = (type, data) => {
package/setup/engine.js CHANGED
@@ -35,6 +35,9 @@ export {
35
35
  recommendExtractionModel,
36
36
  recommendAnalysisModel,
37
37
  getModelRecommendations,
38
+ suggestExtractionModels,
39
+ CLOUD_EXTRACTION_DISCLAIMER,
40
+ CLOUD_EXTRACTION_DISCLAIMER_SHORT,
38
41
  } from './models.js';
39
42
 
40
43
  // Auto-installers
package/setup/models.js CHANGED
@@ -42,6 +42,18 @@ export const EXTRACTION_MODELS = [
42
42
  description: 'Default recommendation. MoE (8B total, 4.5B active) — excellent extraction quality at edge-model speed. Best quality/speed ratio.',
43
43
  recommended: true,
44
44
  },
45
+ {
46
+ id: 'gemma4:12b',
47
+ name: 'Gemma 4 12B',
48
+ family: 'gemma4',
49
+ tier: 'quality',
50
+ vram: '~10 GB',
51
+ minVramMB: 8500,
52
+ speed: '~3s/page',
53
+ quality: 'excellent',
54
+ description: 'Dense 12B — a clear quality step up from E4B for tricky pages, still fast. Needs RTX 3080+/M-series 16GB+.',
55
+ recommended: false,
56
+ },
45
57
  {
46
58
  id: 'gemma4:26b',
47
59
  name: 'Gemma 4 26B',
@@ -261,8 +273,8 @@ export const ANALYSIS_MODELS = [
261
273
  description: 'Google\'s latest frontier model. Massive 2M context handles the largest competitive datasets. Best value for cloud analysis.',
262
274
  },
263
275
  {
264
- id: 'claude-opus-4.6',
265
- name: 'Claude Opus 4.6',
276
+ id: 'claude-opus-4-8',
277
+ name: 'Claude Opus 4.8',
266
278
  family: 'claude',
267
279
  type: 'cloud',
268
280
  provider: 'anthropic',
@@ -430,6 +442,82 @@ export function recommendAnalysisModel(availableModels = [], vramMB = 0) {
430
442
  };
431
443
  }
432
444
 
445
+ // ── Cloud-extraction disclaimer ─────────────────────────────────────────────
446
+ //
447
+ // Extraction runs once per crawled page. At scale (thousands of pages) a cloud
448
+ // provider means every page's content leaves the machine, costs real money, and
449
+ // hits rate limits — for a task a small local model handles well. Surface this
450
+ // EVERYWHERE a cloud model is offered/selected for extraction. Non-negotiable.
451
+
452
/**
 * Full-length warning shown wherever a cloud model is offered or selected for
 * extraction. The fragments are joined with single spaces into one paragraph.
 */
export const CLOUD_EXTRACTION_DISCLAIMER = [
  'Extraction should be done with a LOCAL model. Cloud is a fallback, not the default:',
  'it sends every page\'s content to a third-party API, costs money at scale (a 10k-page',
  'site is real spend), and hits rate limits — all for a task a 4–8B local model does well,',
  'offline, with your data never leaving the machine. Use cloud only if you have no local',
  'GPU/Ollama and accept those tradeoffs.',
].join(' ');
458
+
459
+ // Short one-liner for tight UIs (status bars, JSON `notice` fields).
460
/** Single-line variant of the cloud-extraction warning, for space-constrained output. */
export const CLOUD_EXTRACTION_DISCLAIMER_SHORT = `⚠ Use a LOCAL model for extraction — cloud sends page content off-machine, costs money at scale, and rate-limits. Local is private, free, and offline.`;
462
+
463
+ // ── Curated extraction suggestions ──────────────────────────────────────────
464
+ //
465
+ // The headline "what should I run locally" set — the families we actually
466
+ // recommend, smallest → largest. Drawn from EXTRACTION_MODELS so VRAM/speed/
467
+ // quality stay in one place. Used by `seo-intel models` and the suggest_models
468
+ // MCP tool so an agent in chat can recommend a model without the full wizard.
469
+
470
/**
 * Ids of the curated local extraction models, ordered smallest → largest.
 * suggestExtractionModels depends on this order: the last entry that fits is
 * treated as the largest one that fits.
 */
const SUGGESTED_EXTRACTION_IDS = [
  // laptop / low VRAM
  'gemma4:e2b',
  // budget alt
  'qwen3.5:4b',
  // default — best quality/speed
  'gemma4:e4b',
  // ~8B-class alt
  'qwen3.5:9b',
  // step-up quality, still fast
  'gemma4:12b',
];
477
+
478
/**
 * Curated local extraction-model suggestions, annotated for the detected
 * hardware. Always returns the local set; the cloud disclaimer is attached so
 * callers can surface it alongside.
 *
 * Install detection compares the Ollama model name exactly and the tag either
 * exactly or as a `<size>-…` variant (e.g. `gemma4:e4b-it-q4_K_M`). Substring
 * matching is avoided on purpose: it would count `qwen3.5:14b` as `qwen3.5:4b`.
 *
 * Recommendation order:
 *   1. the largest suggested model that fits VRAM (the default id when VRAM is unknown);
 *   2. if that model is not installed but another fitting one is, the largest
 *      installed fitting model, so an installed recommendation is never
 *      replaced by a smaller one.
 *
 * @param {number} [vramMB] - detected VRAM in MB (0/unknown = show all, no fit filter)
 * @param {string[]} [installed] - model tags currently in Ollama; non-string entries are ignored
 * @returns {{ suggestions: object[], recommendedId: string|null, vramMB: number, disclaimer: string }}
 */
export function suggestExtractionModels(vramMB = 0, installed = []) {
  // Split each installed tag into { name, variant } once, at the last ":".
  const installedTags = installed
    .filter(tag => typeof tag === 'string')
    .map(tag => {
      const sep = tag.lastIndexOf(':');
      return sep === -1
        ? { name: tag, variant: 'latest' }
        : { name: tag.slice(0, sep), variant: tag.slice(sep + 1) };
    });

  const isInstalled = (id) => {
    const [fam, size] = id.split(':');
    return installedTags.some(({ name, variant }) =>
      name === fam && (!size || variant === size || variant.startsWith(`${size}-`)));
  };

  const suggestions = SUGGESTED_EXTRACTION_IDS
    .map(id => EXTRACTION_MODELS.find(m => m.id === id))
    .filter(Boolean)
    .map(m => ({
      id: m.id,
      name: m.name,
      tier: m.tier,
      vram: m.vram,
      speed: m.speed,
      quality: m.quality,
      fitsVram: !vramMB || vramMB >= m.minVramMB,
      installed: isInstalled(m.id),
      note: m.description,
    }));

  // Recommend the largest suggested model that fits VRAM (or the default if VRAM unknown).
  let recommendedId = 'gemma4:e4b';
  if (vramMB) {
    const fitting = suggestions.filter(s => s.fitsVram);
    recommendedId = fitting.length ? fitting[fitting.length - 1].id : (suggestions[0]?.id ?? null);
  }

  // Prefer something already installed, but only when the current pick is not
  // installed itself, and then take the largest installed model that fits.
  const pickIsInstalled = suggestions.some(s => s.id === recommendedId && s.installed);
  if (!pickIsInstalled) {
    const installedFitting = suggestions.filter(s => s.installed && s.fitsVram);
    if (installedFitting.length) {
      recommendedId = installedFitting[installedFitting.length - 1].id;
    }
  }

  return { suggestions, recommendedId, vramMB, disclaimer: CLOUD_EXTRACTION_DISCLAIMER };
}
520
+
433
521
  /**
434
522
  * Get all model recommendations for display.
435
523
  *