npm - seo-intel - Versions diffs - 1.5.39 → 1.5.45 - Mend

seo-intel 1.5.39 → 1.5.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/CHANGELOG.md +76 -0
package/analyses/blog-draft/prescorer.js +17 -0
package/analyses/loop/orchestrator.js +179 -0
package/cli.js +197 -6
package/crawler/html-extract.js +127 -0
package/crawler/light.js +169 -0
package/db/db.js +66 -0
package/lib/cron.js +108 -0
package/lib/gate.js +33 -1
package/lib/intel.js +9 -3
package/mcp/server.js +172 -17
package/package.json +1 -1
package/reports/generate-html.js +42 -404
package/setup/web-routes.js +39 -0
package/setup/wizard.html +73 -0

package/mcp/server.js CHANGED Viewed

@@ -25,14 +25,16 @@ import { spawn } from 'child_process';
 import { dirname, join } from 'path';
 import { fileURLToPath } from 'url';
-import { getDb, insertAgentInsight, AGENT_INSIGHT_TYPES, getActiveInsights, getCompetitorSummary } from '../db/db.js';
+import { getDb, insertAgentInsight, AGENT_INSIGHT_TYPES, getActiveInsights, getCompetitorSummary, recordDraftCreated, markGapsInProgress } from '../db/db.js';
 import { getIntel, INTEL_SLICES, FREE_SLICES } from '../lib/intel.js';
 import { isPro } from '../lib/license.js';
 import { readProgress } from '../lib/progress.js';
 import { getProblems, getProblemCounts, markProblemStatus, getActiveStatusMap, PROBLEM_CATEGORIES, PROBLEM_STATUSES } from '../lib/problems.js';
 import { runAeoAnalysis, persistAeoScores, upsertCitabilityInsights } from '../analyses/aeo/index.js';
-import { prescore } from '../analyses/blog-draft/prescorer.js';
+import { prescore, extractDraftTopic } from '../analyses/blog-draft/prescorer.js';
+import { lightCrawl } from '../crawler/light.js';
+import { runContentLoop } from '../analyses/loop/orchestrator.js';
 import { gatherBlogDraftContext, buildBlogDraftPrompt } from '../analyses/blog-draft/index.js';
 // ── Helpers ────────────────────────────────────────────────────────────────
@@ -357,6 +359,88 @@ server.registerTool(
   }
 );
+// ── Tool: crawl_site (free — zero-config, zero-signup, local, lightweight) ──
+// "Crawl for all Claude users": point it at a URL and it BFS-crawls same-origin
+// pages with plain fetch (no browser, no project config, nothing persisted,
+// nothing leaves the machine). For deep/JS-rendered/persistent crawls, the user
+// installs seo-intel and runs `seo-intel crawl`.
+server.registerTool(
+  'crawl_site',
+  {
+    description: [ // agent-facing help text; the entries are joined with newlines below
+      'Crawl a website ad-hoc and return structured SEO/AEO data — no project setup, no account, no API key, nothing saved. Point it at any URL.',
+      '',
+      'Lightweight by design: plain HTTP fetch (no browser/JS rendering), same-origin BFS, honours robots.txt + crawl-delay, small page budget (default 10, hard cap 50). Returns title, meta, headings, links, JSON-LD schema types, word count, indexability — optionally a per-page AI-citability (AEO) score.',
+      '',
+      'Limits: JS-rendered/SPA pages under-report content (use the full `seo-intel crawl` with Playwright for those). Results are ephemeral — for persistent history, the Intelligence Ledger, and competitor analysis, install seo-intel (still local, own-site free). Free tier.',
+    ].join('\n'),
+    inputSchema: { // only `url` is required; the other three fields are declared .optional()
+      url: z.string().describe('Start URL (scheme optional — "example.com" works). The crawl follows same-origin links from here.'),
+      max_pages: z.number().int().positive().optional().describe('Pages to fetch (default 10, hard cap 50).'),
+      include_citability: z.boolean().optional().describe('Run the AEO citability scorer per page (default false). Note: light mode does no entity extraction, so entity-authority is under-counted — run `seo-intel aeo` for the full score.'),
+      same_origin: z.boolean().optional().describe('Only follow links on the start site (default true). www/non-www and http/https are treated as the same site.'),
+    },
+  },
+  async ({ url, max_pages, include_citability, same_origin }) => { // handler: run lightCrawl, then reshape its result into a compact summary
+    try { // anything thrown below is reported as an isError tool result by the catch at the bottom
+      const r = await lightCrawl(url, { // NOTE(review): the 50-page hard cap is not applied here — presumably lightCrawl enforces it; confirm
+        maxPages: max_pages ?? 10, // omitted → 10
+        includeCitability: include_citability ?? false, // omitted → false
+        sameOrigin: same_origin ?? true, // omitted → true
+      });
+      // Compact, token-aware shape: drop body_text + the full per-page link lists
+      // (return counts + a deduped discovered-URL list instead).
+      const pages = r.pages.map(p => ({ // one summary object per crawled page
+        url: p.url,
+        status_code: p.status_code,
+        title: p.title,
+        meta_desc: p.meta_desc,
+        canonical: p.canonical || null, // any falsy canonical is reported as null
+        is_indexable: p.is_indexable,
+        word_count: p.word_count,
+        headings: p.headings.slice(0, 40), // at most the first 40 headings per page
+        schema_types: p.schema_types,
+        published_date: p.published_date,
+        modified_date: p.modified_date,
+        internal_links: p.links.filter(l => l.internal).length, // count only; the link list itself is not returned
+        external_links: p.links.filter(l => !l.internal).length, // every link whose `internal` flag is falsy
+        ...(p.citability ? { citability: p.citability } : {}), // key is added only when the page carries a truthy citability value
+      }));
+      // Deduped internal URLs discovered but not crawled (structure peek); capped at 50 entries.
+      const crawled = new Set(r.pages.map(p => p.url)); // URLs of the pages that were actually fetched
+      const discovered = []; // result list, in first-seen order
+      const seen = new Set(); // hrefs already pushed to `discovered`
+      for (const p of r.pages) {
+        for (const l of p.links) {
+          if (l.internal && !crawled.has(l.href) && !seen.has(l.href)) { // internal, not crawled, not yet listed
+            seen.add(l.href); discovered.push(l.href);
+            if (discovered.length >= 50) break; // cap reached — leave the link loop
+          }
+        }
+        if (discovered.length >= 50) break; // cap reached — leave the page loop as well
+      }
+      const out = { // payload returned to the agent
+        start: r.start,
+        origin: r.origin,
+        stats: r.stats,
+        pages,
+        discovered_internal_urls: discovered,
+        skipped: r.skipped,
+        notice: 'Ephemeral + local — nothing was saved and nothing left this machine. Light mode does not render JavaScript, so SPA/JS-built pages under-report content; use `seo-intel crawl` (Playwright) for those. For persistent history, the Intelligence Ledger, AI-citability over time, and competitor analysis, install seo-intel — own-site stays free.',
+      };
+      return { // the same object is sent twice: pretty-printed JSON text and structuredContent
+        content: [{ type: 'text', text: JSON.stringify(out, null, 2) }],
+        structuredContent: out,
+      };
+    } catch (err) { // covers a failed lightCrawl call as well as errors thrown while reshaping its result
+      return { content: [{ type: 'text', text: `seo-intel crawl_site error: ${err.message}` }], isError: true };
+    }
+  }
+);
 // ── Tool: ingest_insight (free — write-back closes the loop) ──────────────
 server.registerTool(
   'ingest_insight',
@@ -414,18 +498,17 @@ server.registerTool(
   }
 );
-// ── Tool: run_citability_audit (PAID) ─────────────────────────────────────
+// ── Tool: run_citability_audit (FREE) ─────────────────────────────────────
 server.registerTool(
   'run_citability_audit',
   {
-    description: 'Run AEO citability scoring across all crawled pages (6 signals: entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage). Persists scores to citability_scores and upserts citability_gap insights into the ledger. Pure function — fast, no LLM calls. Paid tier.',
+    description: 'Run AEO citability scoring across all crawled pages (6 signals: entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage). Persists scores to citability_scores and upserts citability_gap insights into the ledger. Pure function — fast, no LLM calls. Free tier — analysis of your own site is free.',
     inputSchema: {
       project: z.string(),
       include_competitors: z.boolean().optional().describe('Score competitor pages too (default true)'),
     },
   },
   async ({ project, include_competitors = true }) => {
-    if (!isPro()) return paidGate('run_citability_audit');
     if (!loadProjectConfig(project)) {
       return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
     }
@@ -497,17 +580,18 @@ server.registerTool(
   }
 );
-// ── Tool: prescore_draft (PAID) ───────────────────────────────────────────
+// ── Tool: prescore_draft (FREE) ───────────────────────────────────────────
 server.registerTool(
   'prescore_draft',
   {
-    description: 'Run the AEO scorer on a markdown draft before publishing. Returns the same 6-signal breakdown the dashboard uses (entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage) plus the overall 0-100 score and tier (excellent / good / fair / poor). Use this as a pre-publish gate when drafting via draft_blog_prompt — score < 60 means revise. Paid tier.',
+    description: 'Run the AEO scorer on a markdown draft before publishing. Returns the same 6-signal breakdown the dashboard uses (entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage) plus the overall 0-100 score and tier (excellent / good / fair / poor). Use this as a pre-publish gate when drafting via draft_blog_prompt — score < 60 means revise. Free tier. Pass `project` (and optionally `topic`) to close the loop: the draft is recorded in the Ledger and matching gaps are marked in_progress so they stop resurfacing.',
     inputSchema: {
       draft_md: z.string().describe('Full markdown of the draft, including YAML frontmatter if present. The scorer extracts headings, word count, schema_type from frontmatter, etc.'),
+      project: z.string().optional().describe('If set, the scored draft is written back to this project\'s Intelligence Ledger (records a draft_created insight + marks matching gaps in_progress). Omit for a pure, stateless score.'),
+      topic: z.string().optional().describe('The topic/keyword this draft targets. Used to match gaps for the in_progress write-back. If omitted, recovered from the draft\'s frontmatter title or first H1.'),
     },
   },
-  async ({ draft_md }) => {
-    if (!isPro()) return paidGate('prescore_draft');
+  async ({ draft_md, project, topic }) => {
     try {
       const score = prescore(draft_md);
       const out = {
@@ -520,6 +604,33 @@ server.registerTool(
           ? 'Draft scores well. Safe to publish.'
           : 'Below 60 — consider strengthening: add FAQ schema for Q&A proximity, increase entity authority via named experts/citations, shorten paragraphs for answer density, add structured claims (numbers/dates).',
       };
+      // F1 (v1.5.42): loop write-back — only when a project is supplied, and
+      // best-effort so a Ledger hiccup never fails the score.
+      if (project && loadProjectConfig(project)) {
+        try {
+          const db = getDb();
+          const effectiveTopic = topic || extractDraftTopic(draft_md);
+          recordDraftCreated(db, project, {
+            topic: effectiveTopic,
+            score: score.score,
+            tier: score.tier,
+            wordCount: score.wordCount,
+          });
+          const marked = markGapsInProgress(db, project, effectiveTopic);
+          out.ledger = {
+            recorded: true,
+            topic: effectiveTopic || '(auto)',
+            gaps_marked_in_progress: marked,
+            note: marked > 0
+              ? `${marked} matching gap(s) marked in_progress — they stop resurfacing until a re-audit re-scores the published page.`
+              : 'Draft recorded; no active gaps matched the topic.',
+          };
+        } catch (e) {
+          out.ledger = { recorded: false, error: e.message };
+        }
+      }
       return {
         content: [{ type: 'text', text: JSON.stringify(out, null, 2) }],
         structuredContent: out,
@@ -530,11 +641,11 @@ server.registerTool(
   }
 );
-// ── Tool: draft_blog_prompt (PAID) ────────────────────────────────────────
+// ── Tool: draft_blog_prompt (FREE) ────────────────────────────────────────
 server.registerTool(
   'draft_blog_prompt',
   {
-    description: 'Generate an AEO-aware blog draft prompt seeded with full project context — keyword gaps, citability gaps, top entities, brand voice notes, competitor heading patterns. The agent\'s own LLM writes the draft using this prompt. Pair with prescore_draft for a write→score→revise loop. Paid tier.',
+    description: 'Generate an AEO-aware blog draft prompt seeded with full project context — keyword gaps, citability gaps, top entities, brand voice notes, competitor heading patterns. The agent\'s own LLM writes the draft using this prompt. Pair with prescore_draft for a write→score→revise loop. Free tier.',
     inputSchema: {
       project: z.string(),
       topic: z.string().optional().describe('Specific topic to draft about. If omitted, the prompt asks the LLM to pick the highest-leverage topic from the gap data.'),
@@ -543,7 +654,6 @@ server.registerTool(
     },
   },
   async ({ project, topic, lang = 'en', content_type = 'blog' }) => {
-    if (!isPro()) return paidGate('draft_blog_prompt');
     const config = loadProjectConfig(project);
     if (!config) {
       return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
@@ -571,9 +681,54 @@ server.registerTool(
   }
 );
+// ── Tool: run_content_loop (free — the one-call content loop) ─────────────
+// Walks gap → draft → prescore → queue. In MCP the agent's own LLM is the
+// writer, so this runs in HAND-BACK mode: it ranks the gaps, picks the highest-
+// leverage one(s), and returns a seeded prompt per gap. The agent writes the
+// draft, then calls prescore_draft(project, topic) to score + close the loop.
+server.registerTool(
+  'run_content_loop',
+  {
+    description: [ // agent-facing help text; the entries are joined with newlines below
+      'Run the content loop for a project in one call: ranks the open gaps in the Intelligence Ledger by leverage (priority × source × AI-intent), picks the highest, and returns an AEO-aware draft prompt seeded with full context.',
+      '',
+      'Hand-back by design — your own LLM writes the draft from the returned prompt, then you call prescore_draft(project, topic) to AEO-score it and close the loop (records the draft, marks the gap in_progress). Use dry_run to just see which gap it would target. Free tier.',
+    ].join('\n'),
+    inputSchema: { // only `project` is required; every other field is declared .optional()
+      project: z.string(),
+      topic: z.string().optional().describe('Focus a specific topic instead of auto-picking the top gap.'),
+      count: z.number().int().positive().optional().describe('Return prompts for the top N gaps (default 1).'),
+      lang: z.enum(['en', 'fi']).optional(),
+      content_type: z.enum(['blog', 'article', 'guide', 'docs', 'social']).optional(),
+      dry_run: z.boolean().optional().describe('Only rank + select the gap(s); do not build prompts.'),
+    },
+  },
+  async ({ project, topic, count, lang = 'en', content_type = 'blog', dry_run }) => { // handler; lang and content_type fall back to 'en' / 'blog' when omitted
+    const config = loadProjectConfig(project); // checked first, before the database is opened
+    if (!config) { // unknown project: answer with an isError result instead of throwing
+      return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
+    }
+    try { // anything thrown below is reported as an isError tool result by the catch at the bottom
+      const db = getDb();
+      const result = await runContentLoop(db, project, { // NOTE(review): the shape of `result` is defined by the orchestrator module, not visible here
+        config, topic: topic || null, count: count || 1, lang, contentType: content_type, // omitted or empty topic → null; omitted count → 1
+        dryRun: !!dry_run, generate: null, // hand-back: the agent writes
+      });
+      return { // the same object is sent twice: pretty-printed JSON text and structuredContent
+        content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
+        structuredContent: result,
+      };
+    } catch (err) { // covers getDb() and runContentLoop failures alike
+      return { content: [{ type: 'text', text: `seo-intel run_content_loop error: ${err.message}` }], isError: true };
+    }
+  }
+);
 // ── Tool: export_intel (firehose; free tables + paid tables) ──────────────
-const FREE_EXPORT_TABLES = ['pages', 'keywords', 'headings', 'links', 'technical', 'sitemap_urls'];
-const PAID_EXPORT_TABLES = ['extractions', 'analyses', 'page_schemas', 'citability_scores', 'insights'];
+// v1.5.41: own-site derived data (extractions, schemas, citability, the
+// ledger) is free — only the competitor gap analysis (`analyses`) is paid.
+const FREE_EXPORT_TABLES = ['pages', 'keywords', 'headings', 'links', 'technical', 'sitemap_urls', 'extractions', 'page_schemas', 'citability_scores', 'insights'];
+const PAID_EXPORT_TABLES = ['analyses'];
 const ALL_EXPORT_TABLES = [...FREE_EXPORT_TABLES, ...PAID_EXPORT_TABLES];
 const EXPORT_TABLE_QUERIES = {
@@ -597,7 +752,7 @@ const MAX_MAX_ROWS_PER_TABLE = 50000;
 function buildExportNotice({ tokens, bytes, free, paidRequested, paidExcluded, anyTruncated, maxRowsPerTable }) {
   const tooBig = tokens > 50000;
   const upgradeBlurb = free
-    ? `\n\n📦 Tables NOT in this response (require SEO Intel Solo, €19.99/mo — vs Ahrefs ~$129/mo): ${PAID_EXPORT_TABLES.join(', ')}.\n   These are the AI-derived layers: per-page entity/intent/schema extraction, full analysis history, structured @type inventory, citability scores, and the Intelligence Ledger.\n   For pre-parsed digests instead of raw rows, the Solo tools return ready-to-use analysis: run_citability_audit, get_competitor_positioning, prescore_draft, draft_blog_prompt.`
+    ? `\n\n📦 Table NOT in this response (requires SEO Intel Solo, €19.99/mo — vs Ahrefs ~$129/mo): ${PAID_EXPORT_TABLES.join(', ')}.\n   That's the competitor gap-analysis history (keyword_gaps, content_gaps, positioning, quick_wins). Everything about YOUR OWN site — extractions, schemas, citability scores, and the Intelligence Ledger — is free.\n   Free pre-parsed digests: get_intel(for=audit|blog), run_citability_audit, prescore_draft, draft_blog_prompt. Solo adds competitor synthesis: get_competitor_positioning + get_intel(for=competitor).`
     : `\n\nYou have Solo. Paid tables in this export: ${(paidRequested || []).join(', ') || '(none requested)'}.`;
   const sizeLine = tooBig
@@ -630,7 +785,7 @@ server.registerTool(
   'export_intel',
   {
     description: [
-      'Bulk export of raw structured intelligence — pages, keywords, headings, links, technical, sitemap URLs (free), plus extractions, analyses, schemas, citability scores, and insights (Solo). Mirrors `seo-intel export --full <project>` as a single MCP call.',
+      'Bulk export of raw structured intelligence — pages, keywords, headings, links, technical, sitemap URLs, extractions, schemas, citability scores, and the Intelligence Ledger (all free), plus the competitor gap-analysis history (Solo). Mirrors `seo-intel export --full <project>` as a single MCP call.',
       '',
       '⚠️ FIREHOSE WARNING: this is raw rows, not summaries. For carbium-sized projects it can be 5–10 MB / 200k+ tokens. The response includes a `notice` field telling the agent how to handle it (pipe to file, use other tools, or upgrade). Agents SHOULD NOT paste the response wholesale into their context — read the `notice` first, then either query selectively or save to a file.',
       '',
@@ -834,7 +989,7 @@ async function main() {
   const transport = new StdioServerTransport();
   await server.connect(transport);
   // stderr is fine; the host typically surfaces this in its MCP logs panel.
-  console.error(`[seo-intel-mcp] v${VERSION} ready on stdio. 15 tools — free: list_projects (with nag), list_problems, mark_problem_status, get_intel(raw), get_pages, list_keywords, get_headings, run_crawl, get_crawl_status, ingest_insight, export_intel (free-tier subset); paid: get_intel(audit/blog/competitor), run_citability_audit, get_competitor_positioning, prescore_draft, draft_blog_prompt, export_intel (paid tables), and list_problems unlocks paid categories.`);
+  console.error(`[seo-intel-mcp] v${VERSION} ready on stdio. 17 tools — free: crawl_site (ad-hoc, any URL, no config), run_content_loop (gap→draft→close), list_projects, list_problems, mark_problem_status, get_intel(raw/audit/blog), get_pages, list_keywords, get_headings, run_crawl, get_crawl_status, ingest_insight, run_citability_audit, prescore_draft, draft_blog_prompt, export_intel (own-site tables); Solo (competitor synthesis): get_competitor_positioning, get_intel(competitor), export_intel (analyses table).`);
 }
 main().catch(err => {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "seo-intel",
-  "version": "1.5.39",
+  "version": "1.5.45",
   "description": "Local Ahrefs-style SEO competitor intelligence. Crawl → SQLite → cloud analysis.",
   "type": "module",
   "license": "SEE LICENSE IN LICENSE",