npm - seo-intel - Versions diffs - 1.5.26 → 1.5.28 - Mend

seo-intel 1.5.26 → 1.5.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,31 @@
 # Changelog
+## 1.5.28 (2026-05-17)
+### MCP — agents can now trigger crawls and watch progress
+The MCP server gains its first **active** tools — agents move from read-only to actually doing work on the user's machine.
+- **`run_crawl(project, stealth?, max_pages?)`** — spawn a crawl as a detached subprocess. Returns immediately with `{ started, pid, command, hint }`. Free tier — crawl page limits still apply (Solo unlocks unlimited). Refuses to start if any seo-intel job is already running (conflict guard mirrors the existing HTTP `/api/crawl` behaviour).
+- **`get_crawl_status()`** — read the most recent job's progress: status (`running` / `completed` / `crashed` / `stopped` / `idle`), command, project, pid, timestamps. PID liveness is verified — a "running" job whose process died gets re-tagged as `crashed`.
+A natural session now looks like: agent calls `run_crawl(carbium)` → polls `get_crawl_status()` every minute → once `completed`, calls `get_intel(carbium, for=raw)` and `get_pages(carbium)` to see new data. Free tier, end to end.
+### Internal — shared progress reader
+`server.js` and `mcp/server.js` now both read job state from `lib/progress.js` (the canonical implementation, with PID liveness detection). Eliminates a duplicate `readProgress()` and ensures any future progress-file schema changes propagate automatically.
+## 1.5.27 (2026-05-16)
+### MCP — three new free-tier read tools
+The MCP server (`seo-intel-mcp`) now exposes individual records, not just summaries. AI agents can drill from inventory into actual pages, keywords, and heading structures without leaving the agent chat.
+- **`get_pages(project, role?, limit?, offset?)`** — paginated page list with url, title, word count, status, click depth, and domain role. Filterable by role (target / owned / competitor). Returns total count for pagination math.
+- **`list_keywords(project, domain?, limit?)`** — top extracted keywords grouped by domain + location (title / h1 / h2 / meta / body). Use to surface what each site is targeting before running gap analysis.
+- **`get_headings(project, url, limit?)`** — heading structure (H1–H6) for a specific page. Returns ordered `{ level, text }` list. Useful for content-architecture comparisons between target and competitor pages.
+All three are **free tier** — no license required. Pairs naturally with the existing `list_projects` and `get_intel(raw)` to give AI agents a complete free-tier read surface: list projects → inspect inventory → drill into pages → read headings → analyze with the agent's own flagship LLM.
+Errors are returned as proper MCP `isError: true` responses with helpful guidance (e.g. `get_headings` on an unknown URL points the agent at `get_pages`).
 ## 1.5.26 (2026-05-16)
 ### New — MCP server (`seo-intel-mcp`)

package/lib/progress.js ADDED Viewed

@@ -0,0 +1,37 @@
+/**
+ * lib/progress.js — Single source of truth for the seo-intel job progress file.
+ *
+ * The CLI's crawl/extract/analyze/aeo/... commands all write their state to
+ * `.extraction-progress.json` in the project root. Server.js, mcp/server.js,
+ * and any future consumer can read job status from here without spawning a
+ * subprocess.
+ */
+import { readFileSync, existsSync } from 'fs';
+import { dirname, join } from 'path';
+import { fileURLToPath } from 'url';
+const __dirname = dirname(fileURLToPath(import.meta.url));
+export const PROGRESS_FILE = join(__dirname, '..', '.extraction-progress.json');
+/**
+ * Read the current job progress from PROGRESS_FILE, with PID liveness
+ * detection so a "running" job whose process died gets re-tagged as "crashed".
+ *
+ * Best-effort by design: a missing, unreadable, or malformed progress file
+ * yields `null` instead of an exception, so callers can treat "no data" and
+ * "bad data" the same way.
+ *
+ * @returns {object|null} The parsed progress record, or `null` when no usable
+ *   record exists (file absent, read/parse failure, or JSON that is not a
+ *   plain object).
+ */
+export function readProgress() {
+  try {
+    if (!existsSync(PROGRESS_FILE)) return null;
+    const data = JSON.parse(readFileSync(PROGRESS_FILE, 'utf8'));
+    // Valid JSON is not necessarily a record: `[]`, `"x"` or `42` would
+    // otherwise reach callers that expect `.status`, `.pid`, etc.
+    if (data === null || typeof data !== 'object' || Array.isArray(data)) return null;
+    if (data.status === 'running' && data.pid) {
+      try {
+        // Signal 0 delivers nothing; it only tests whether the PID exists.
+        process.kill(data.pid, 0);
+      } catch (e) {
+        // ESRCH = no such process. Any other failure (e.g. EPERM: the process
+        // exists but is owned by someone else) leaves the status untouched.
+        if (e.code === 'ESRCH') {
+          data.status = 'crashed';
+          data.crashed_at = data.updated_at;
+        }
+      }
+    }
+    return data;
+  } catch { return null; }
+}

package/mcp/server.js CHANGED Viewed

@@ -21,12 +21,14 @@ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
 import * as z from 'zod/v4';
 import { readFileSync, readdirSync, existsSync } from 'fs';
+import { spawn } from 'child_process';
 import { dirname, join } from 'path';
 import { fileURLToPath } from 'url';
 import { getDb } from '../db/db.js';
 import { getIntel, INTEL_SLICES, FREE_SLICES } from '../lib/intel.js';
 import { isPro } from '../lib/license.js';
+import { readProgress } from '../lib/progress.js';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const ROOT = join(__dirname, '..');
@@ -116,11 +118,200 @@ server.registerTool(
   }
 );
+// ── Tool: get_pages (free) ────────────────────────────────────────────────
+// Lists crawled pages for one project, optionally narrowed to a domain role,
+// together with the total match count so the caller can page through results.
+server.registerTool(
+  'get_pages',
+  {
+    description: 'Paginated list of crawled pages for a project, with url, title, word count, status, and domain role. Use this to drill into individual pages after seeing the inventory summary from get_intel. Free tier.',
+    inputSchema: {
+      project: z.string().describe('Project slug'),
+      role: z.enum(['target', 'owned', 'competitor']).optional().describe('Filter by domain role'),
+      limit: z.number().int().positive().max(500).optional().describe('Max pages to return (default 50, max 500)'),
+      offset: z.number().int().nonnegative().optional().describe('Offset for pagination (default 0)'),
+    },
+  },
+  async ({ project, role, limit = 50, offset = 0 }) => {
+    try {
+      const db = getDb();
+      // One WHERE clause and one set of bind values, shared by the page query
+      // and the count query so both always describe the same result set.
+      const filterSql = role ? 'd.project = ? AND d.role = ?' : 'd.project = ?';
+      const bindings = [project];
+      if (role) bindings.push(role);
+      const pageRows = db.prepare(
+        `SELECT p.url, p.title, p.word_count, p.status_code, p.click_depth,
+                d.domain, d.role
+         FROM pages p JOIN domains d ON d.id = p.domain_id
+         WHERE ${filterSql}
+         ORDER BY d.role, d.domain, p.url
+         LIMIT ? OFFSET ?`
+      ).all(...bindings, limit, offset);
+      const countRow = db.prepare(
+        `SELECT COUNT(*) AS n FROM pages p JOIN domains d ON d.id = p.domain_id WHERE ${filterSql}`
+      ).get(...bindings);
+      const total = countRow?.n || 0;
+      const payload = {
+        project,
+        role: role || 'any',
+        total,
+        returned: pageRows.length,
+        offset,
+        pages: pageRows,
+      };
+      const text = JSON.stringify(payload, null, 2);
+      return { content: [{ type: 'text', text }], structuredContent: payload };
+    } catch (err) {
+      // Surface the failure to the agent as an MCP error result.
+      const text = `seo-intel error: ${err.message}`;
+      return { content: [{ type: 'text', text }], isError: true };
+    }
+  }
+);
+// ── Tool: list_keywords (free) ────────────────────────────────────────────
+// Counts keyword occurrences per (keyword, location, domain) for a project and
+// returns the most frequent ones, optionally restricted to a single domain.
+server.registerTool(
+  'list_keywords',
+  {
+    description: 'Top extracted keywords for a project, grouped by domain. Each keyword has frequency, location (title/h1/h2/meta/body), and source domain. Use this to surface what each site is targeting before running gap analysis. Free tier.',
+    inputSchema: {
+      project: z.string().describe('Project slug'),
+      domain: z.string().optional().describe('Optional: filter to a single domain'),
+      limit: z.number().int().positive().max(1000).optional().describe('Max keywords to return (default 100, max 1000)'),
+    },
+  },
+  async ({ project, domain, limit = 100 }) => {
+    try {
+      const db = getDb();
+      // Bind values follow the placeholder order: project, [domain], limit.
+      const filterSql = domain ? 'd.project = ? AND d.domain = ?' : 'd.project = ?';
+      const bindings = domain ? [project, domain, limit] : [project, limit];
+      const keywordRows = db.prepare(
+        `SELECT k.keyword, k.location, d.domain, d.role, COUNT(*) AS freq
+         FROM keywords k
+           JOIN pages p ON p.id = k.page_id
+           JOIN domains d ON d.id = p.domain_id
+         WHERE ${filterSql}
+         GROUP BY k.keyword, k.location, d.domain
+         ORDER BY freq DESC
+         LIMIT ?`
+      ).all(...bindings);
+      const payload = {
+        project,
+        domain: domain || 'all',
+        returned: keywordRows.length,
+        keywords: keywordRows,
+      };
+      const text = JSON.stringify(payload, null, 2);
+      return { content: [{ type: 'text', text }], structuredContent: payload };
+    } catch (err) {
+      // Surface the failure to the agent as an MCP error result.
+      const text = `seo-intel error: ${err.message}`;
+      return { content: [{ type: 'text', text }], isError: true };
+    }
+  }
+);
+// ── Tool: get_headings (free) ─────────────────────────────────────────────
+// Looks up one crawled page by exact URL within a project and returns its
+// headings in stored (id) order, alongside a few page-level fields.
+server.registerTool(
+  'get_headings',
+  {
+    description: 'Heading structure (H1–H6) for a specific page. Returns ordered list of { level, text }. Useful for content architecture comparisons between target and competitor pages. Free tier.',
+    inputSchema: {
+      project: z.string().describe('Project slug'),
+      url: z.string().describe('Exact page URL (as crawled). Get URLs from get_pages.'),
+      limit: z.number().int().positive().max(200).optional().describe('Max headings (default 50)'),
+    },
+  },
+  async ({ project, url, limit = 50 }) => {
+    try {
+      const db = getDb();
+      const pageRow = db.prepare(
+        `SELECT p.id, p.title, p.word_count, d.domain, d.role
+         FROM pages p JOIN domains d ON d.id = p.domain_id
+         WHERE d.project = ? AND p.url = ?`
+      ).get(project, url);
+      // Unknown URL: answer with an MCP error that points the agent at get_pages.
+      if (!pageRow) {
+        const text = `No crawled page found for url="${url}" in project "${project}". Use get_pages to discover URLs.`;
+        return { content: [{ type: 'text', text }], isError: true };
+      }
+      const headings = db.prepare(
+        `SELECT level, text FROM headings WHERE page_id = ? ORDER BY id LIMIT ?`
+      ).all(pageRow.id, limit);
+      const payload = {
+        project,
+        url,
+        page_title: pageRow.title,
+        domain: pageRow.domain,
+        role: pageRow.role,
+        word_count: pageRow.word_count,
+        headings,
+      };
+      const text = JSON.stringify(payload, null, 2);
+      return { content: [{ type: 'text', text }], structuredContent: payload };
+    } catch (err) {
+      // Surface the failure to the agent as an MCP error result.
+      const text = `seo-intel error: ${err.message}`;
+      return { content: [{ type: 'text', text }], isError: true };
+    }
+  }
+);
+// ── Tool: run_crawl (free) ────────────────────────────────────────────────
+// Starts `node cli.js crawl <project>` as a detached child process and returns
+// its PID. Responds with an MCP error (isError: true) when the project has no
+// config file, when readProgress() reports a running job, or when the child
+// process cannot be started.
+server.registerTool(
+  'run_crawl',
+  {
+    description: [
+      'Trigger a background crawl for an existing project. Spawns the crawl as a detached subprocess and returns immediately — the crawl will keep running even if this MCP server exits. Use get_crawl_status to monitor progress, or call get_intel/get_pages once the crawl completes to see results.',
+      '',
+      'Conflict guard: refuses to start if any seo-intel job is already running. Free tier — crawl page limits still apply (configurable via setup / Solo license unlocks unlimited).',
+    ].join('\n'),
+    inputSchema: {
+      project: z.string().describe('Existing project slug. Use list_projects to discover.'),
+      stealth: z.boolean().optional().describe('Enable stealth browser mode for JS-heavy or anti-bot sites'),
+      max_pages: z.number().int().positive().optional().describe('Override max pages per domain'),
+    },
+  },
+  async ({ project, stealth, max_pages }) => {
+    // The slug becomes both a file name and a CLI argument. A value containing
+    // a path separator (e.g. "../package") would resolve outside CONFIG_DIR and
+    // could pass the existence check, so it is treated as an unknown project.
+    const isPlainSlug = !/[\\/]/.test(project);
+    const configPath = join(CONFIG_DIR, `${project}.json`);
+    if (!isPlainSlug || !existsSync(configPath)) {
+      const available = listConfigProjects().map(p => p.project).join(', ') || '(none configured)';
+      return {
+        content: [{ type: 'text', text: `Project "${project}" not found. Available: ${available}. Use list_projects to discover, or run \`seo-intel setup\` to add a new project.` }],
+        isError: true,
+      };
+    }
+    // Conflict guard, based on whatever the progress file currently says.
+    // NOTE(review): a child started moments ago may not have written its
+    // progress record yet, so two back-to-back calls could both pass — confirm
+    // whether the CLI guards against this itself.
+    const progress = readProgress();
+    if (progress?.status === 'running') {
+      return {
+        content: [{ type: 'text', text: `A seo-intel job is already running (command="${progress.command}", project="${progress.project}", pid=${progress.pid}). Call get_crawl_status to monitor, or wait for it to finish before starting another.` }],
+        isError: true,
+      };
+    }
+    const args = ['cli.js', 'crawl', project];
+    if (stealth) args.push('--stealth');
+    if (max_pages) args.push('--max-pages', String(max_pages));
+    let child;
+    try {
+      child = spawn(process.execPath, args, {
+        cwd: ROOT,
+        detached: true,
+        stdio: 'ignore',
+      });
+      // A failed spawn is reported asynchronously through the 'error' event;
+      // with no listener attached that event is thrown and takes the whole MCP
+      // server down. Wait for either outcome so `started: true` (and a real
+      // pid) is only ever reported for a child that actually started.
+      await new Promise((resolve, reject) => {
+        child.once('error', reject);
+        child.once('spawn', resolve);
+      });
+    } catch (err) {
+      return {
+        content: [{ type: 'text', text: `seo-intel error: could not start crawl: ${err.message}` }],
+        isError: true,
+      };
+    }
+    // Drop our reference so this server can exit while the crawl keeps running.
+    child.unref();
+    const result = {
+      started: true,
+      pid: child.pid,
+      project,
+      command: `node ${args.join(' ')}`,
+      hint: 'Crawl is running detached. Call get_crawl_status to check progress (updates every few seconds), or call get_intel(project, for=raw) in a minute or two to see new data.',
+    };
+    return {
+      content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
+      structuredContent: result,
+    };
+  }
+);
+// ── Tool: get_crawl_status (free) ─────────────────────────────────────────
+// Reports whatever readProgress() returns; when it returns nothing, answers
+// with a synthetic "idle" record instead.
+server.registerTool(
+  'get_crawl_status',
+  {
+    description: 'Read the current state of the most recent seo-intel job (crawl/extract/analyze/etc). Returns status: running | completed | crashed | stopped | idle, plus project/command/pid/timestamps when available. Use this after run_crawl to monitor progress. Free tier.',
+  },
+  async () => {
+    let progress = readProgress();
+    if (!progress) {
+      progress = {
+        status: 'idle',
+        note: 'No seo-intel job has been recorded since startup. Use run_crawl to start one.',
+      };
+    }
+    const text = JSON.stringify(progress, null, 2);
+    return { content: [{ type: 'text', text }], structuredContent: progress };
+  }
+);
+/**
+ * Entry point: connect the MCP server to a stdio transport, then write a
+ * one-line readiness banner (version plus the registered tool names) to stderr.
+ */
 async function main() {
   const transport = new StdioServerTransport();
   await server.connect(transport);
   // stderr is fine; the host typically surfaces this in its MCP logs panel.
-  console.error(`[seo-intel-mcp] v${VERSION} ready on stdio. Tools: list_projects, get_intel.`);
+  console.error(`[seo-intel-mcp] v${VERSION} ready on stdio. Tools: list_projects, get_intel, get_pages, list_keywords, get_headings, run_crawl, get_crawl_status.`);
 }
 main().catch(err => {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "seo-intel",
-  "version": "1.5.26",
+  "version": "1.5.28",
   "description": "Local Ahrefs-style SEO competitor intelligence. Crawl → SQLite → cloud analysis.",
   "type": "module",
   "license": "SEE LICENSE IN LICENSE",

package/server.js CHANGED Viewed

@@ -4,10 +4,10 @@ import { spawn } from 'child_process';
 import { dirname, join, extname } from 'path';
 import { fileURLToPath } from 'url';
 import { checkForUpdates, getUpdateInfo } from './lib/updater.js';
+import { readProgress, PROGRESS_FILE } from './lib/progress.js';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const PORT = parseInt(process.env.PORT || '3000', 10);
-const PROGRESS_FILE = join(__dirname, '.extraction-progress.json');
 const REPORTS_DIR = join(__dirname, 'reports');
@@ -100,23 +100,6 @@ const MIME = {
   '.zip': 'application/zip',
 };
-// ── Read progress with PID liveness check (mirrors cli.js) ──
-function readProgress() {
-  try {
-    if (!existsSync(PROGRESS_FILE)) return null;
-    const data = JSON.parse(readFileSync(PROGRESS_FILE, 'utf8'));
-    if (data.status === 'running' && data.pid) {
-      try { process.kill(data.pid, 0); } catch (e) {
-        if (e.code === 'ESRCH') {
-          data.status = 'crashed';
-          data.crashed_at = data.updated_at;
-        }
-      }
-    }
-    return data;
-  } catch { return null; }
-}
 // ── Parse JSON body from request ──
 function readBody(req) {
   return new Promise((resolve, reject) => {