seo-intel 1.5.40 → 1.5.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +64 -0
- package/analyses/blog-draft/prescorer.js +17 -0
- package/analyses/loop/orchestrator.js +179 -0
- package/cli.js +162 -6
- package/crawler/html-extract.js +127 -0
- package/crawler/light.js +169 -0
- package/db/db.js +66 -0
- package/lib/gate.js +33 -1
- package/lib/intel.js +9 -3
- package/mcp/server.js +172 -17
- package/package.json +1 -1
- package/reports/generate-html.js +42 -404
package/crawler/light.js
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Light crawler — fetch-based, zero-browser, zero-config, zero-signup.
|
|
3
|
+
*
|
|
4
|
+
* The "crawl for all Claude users" path: point it at a URL and it BFS-crawls
|
|
5
|
+
* same-origin pages with plain HTTP fetch (no Playwright, no browser download),
|
|
6
|
+
* returns structured SEO/AEO data entirely in memory. Nothing is persisted,
|
|
7
|
+
* nothing leaves the machine, no account required.
|
|
8
|
+
*
|
|
9
|
+
* Deliberately NOT a "massive crawl environment":
|
|
10
|
+
* - small page budget (default 10, hard cap 50)
|
|
11
|
+
* - same-origin only by default
|
|
12
|
+
* - honours robots.txt + crawl-delay (no tricks)
|
|
13
|
+
* - no JS rendering (use the full Playwright crawler for JS-heavy sites)
|
|
14
|
+
*
|
|
15
|
+
* For deep, persistent, JS-rendered crawls of a configured project, use the
|
|
16
|
+
* heavyweight crawler (`crawler/index.js` via `seo-intel crawl`).
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import fetch from 'node-fetch';
|
|
20
|
+
import { checkRobots } from './robots.js';
|
|
21
|
+
import { extractPageData } from './html-extract.js';
|
|
22
|
+
import { scorePage } from '../analyses/aeo/scorer.js';
|
|
23
|
+
|
|
24
|
+
// Absolute upper bound on pages per crawl; lightCrawl clamps opts.maxPages to this.
const HARD_CAP = 50;
|
|
25
|
+
// User-Agent header value sent with each page fetch made by lightCrawl.
const DEFAULT_UA = 'SEOIntelBot (+https://ukkometa.fi/seo-intel; light-crawl)';
|
|
26
|
+
|
|
27
|
+
/**
 * Pause for `ms` milliseconds.
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>} settles (with no value) once the timer fires
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
28
|
+
|
|
29
|
+
/**
 * Turn user input into an absolute, serialized http(s) URL.
 *
 * Surrounding whitespace is trimmed and a missing scheme defaults to
 * `https://`, so "example.com" and "localhost:3000" both work.
 *
 * @param {string} url raw start URL as typed by the user
 * @returns {string} the URL as serialized by the WHATWG `URL` parser
 * @throws {TypeError} if the input names a non-HTTP scheme or cannot be parsed
 */
function normalizeStart(url) {
  const raw = url.trim();

  // An explicit "scheme://" that is not http(s) used to be prefixed with
  // "https://" and parsed as a bogus host (e.g. "ftp://x" → host "ftp").
  // Reject it so the caller can report the URL as invalid instead.
  const scheme = /^([a-z][a-z0-9+.-]*):\/\//i.exec(raw);
  if (scheme && !/^https?$/i.test(scheme[1])) {
    throw new TypeError(`Unsupported URL scheme: ${scheme[1]}`);
  }

  const withScheme = scheme ? raw : `https://${raw}`;
  return new URL(withScheme).toString();
}
|
|
34
|
+
|
|
35
|
+
// Same-site key: hostname minus a leading "www." (and protocol-agnostic), so
|
|
36
|
+
// http↔https and www↔non-www redirects don't break same-origin link following.
|
|
37
|
+
/**
 * Reduce a URL to its comparison key: the lower-cased hostname with one
 * leading "www." removed. Scheme, port and path play no part.
 *
 * @param {string} u absolute URL
 * @returns {string|null} the key, or null when `u` does not parse as a URL
 */
function siteKey(u) {
  let host;
  try {
    ({ hostname: host } = new URL(u));
  } catch {
    return null;
  }
  const bare = /^www\./i.test(host) ? host.slice(4) : host;
  return bare.toLowerCase();
}
|
|
40
|
+
|
|
41
|
+
/**
 * Breadth-first crawl of a site using plain HTTP fetches; everything is kept
 * in memory and returned to the caller.
 *
 * Pages are fetched one at a time. A URL is skipped (and listed in `skipped`)
 * when robots.txt disallows it, the response is not OK, the content type is
 * not HTML, or the fetch fails. Redirects are followed and pages are deduped
 * on their final URL. URL fragments are ignored for queueing and deduping,
 * because "#section" variants all resolve to the same document.
 *
 * @param {string} startUrl
 * @param {object} [opts]
 * @param {number} [opts.maxPages=10] pages to fetch (floored, clamped to 1..HARD_CAP; NaN falls back to 10)
 * @param {boolean} [opts.sameOrigin=true] only follow links on the start site (www/non-www and http/https count as the same site)
 * @param {boolean} [opts.includeCitability=false] run the AEO scorer per page
 * @param {boolean} [opts.respectRobots=true] honour robots.txt + crawl-delay
 * @param {number} [opts.timeoutMs=10000] per-request timeout (0 or less disables the abort deadline)
 * @param {number} [opts.maxDelayMs=3000] cap on politeness delay between requests
 * @param {(msg:string)=>void} [opts.onProgress] called once per crawled page
 * @returns {Promise<object>} { start, origin, pages, skipped, stats }
 * @throws {Error} `Invalid URL: …` when `startUrl` cannot be normalized
 */
export async function lightCrawl(startUrl, opts = {}) {
  const {
    maxPages = 10,
    sameOrigin = true,
    includeCitability = false,
    respectRobots = true,
    timeoutMs = 10000,
    maxDelayMs = 3000,
    onProgress,
  } = opts;

  // Coerce the budget to a whole number in [1, HARD_CAP]. A NaN budget would
  // make `pages.length < budget` permanently false and crawl nothing; a
  // fractional one would fetch one page more than requested.
  const requested = Number(maxPages);
  const wanted = Number.isNaN(requested) ? 10 : Math.floor(requested);
  const budget = Math.max(1, Math.min(wanted, HARD_CAP));

  let start;
  try {
    start = normalizeStart(startUrl);
  } catch {
    throw new Error(`Invalid URL: ${startUrl}`);
  }
  const origin = new URL(start).origin;
  const siteRoot = siteKey(start);

  // Fragments never reach the server, so they must not create distinct queue
  // entries or let "file.pdf#page=2" slip past the asset filter.
  const dropFragment = (u) => {
    const hashAt = u.indexOf('#');
    return hashAt === -1 ? u : u.slice(0, hashAt);
  };
  const assetPattern = /\.(png|jpe?g|gif|svg|webp|ico|css|js|pdf|zip|mp4|woff2?|ttf)(\?|$)/i;

  const queue = [start];
  const queued = new Set([dropFragment(start)]);
  const visited = new Set(); // FINAL (post-redirect) URLs actually processed
  const pages = [];
  const skipped = [];
  const t0 = Date.now();

  // Politeness delay owed before the NEXT request. It is tracked across
  // iterations so that it still applies after a request that ended in a skip
  // (HTTP error, non-HTML, fetch error, duplicate).
  let pendingDelayMs = 0;

  while (queue.length && pages.length < budget) {
    const url = queue.shift();

    let crawlDelayMs = 0;
    if (respectRobots) {
      let robot;
      try {
        robot = await checkRobots(url);
      } catch {
        // Best-effort: a robots lookup failure must not stop the crawl.
        robot = { allowed: true, crawlDelayMs: 0 };
      }
      if (!robot.allowed) {
        skipped.push({ url, reason: 'robots_disallow' });
        continue;
      }
      crawlDelayMs = robot.crawlDelayMs || 0;
    }

    if (pendingDelayMs > 0) await sleep(pendingDelayMs);
    pendingDelayMs = Math.min(crawlDelayMs, maxDelayMs);

    let finalUrl = url;
    let status = 0;
    let html = '';
    try {
      const res = await fetch(url, {
        // `timeout` is a node-fetch v2 option that v3 ignores; the abort
        // signal enforces the same deadline on either version.
        timeout: timeoutMs,
        ...(timeoutMs > 0 ? { signal: AbortSignal.timeout(timeoutMs) } : {}),
        redirect: 'follow',
        headers: { 'User-Agent': DEFAULT_UA, Accept: 'text/html,application/xhtml+xml' },
      });
      status = res.status;
      finalUrl = res.url || url;
      const ct = (res.headers.get('content-type') || '').toLowerCase();
      if (res.ok && ct.includes('html')) {
        html = await res.text();
      } else {
        skipped.push({ url, reason: res.ok ? `non_html (${ct || 'unknown'})` : `http_${status}`, status });
        continue;
      }
    } catch (e) {
      skipped.push({ url, reason: `fetch_error: ${e.message}` });
      continue;
    }

    // Dedupe on the FINAL url — a redirect may collapse onto a page we already
    // crawled (e.g. non-www start → www, then the page's own www self-link).
    const finalKey = dropFragment(finalUrl);
    if (visited.has(finalKey)) continue;
    visited.add(finalKey);
    queued.add(finalKey);

    const data = extractPageData(html, finalUrl);
    data.status_code = status;

    if (includeCitability) {
      try {
        const cite = scorePage(
          { url: data.url, title: data.title, body_text: data.body_text, word_count: data.word_count, published_date: data.published_date, modified_date: data.modified_date },
          data.headings, [], data.schema_types, [], null
        );
        data.citability = { score: cite.score, tier: cite.tier, breakdown: cite.breakdown, ai_intents: cite.aiIntents };
      } catch (e) {
        // Scoring is optional; record the failure on the page instead of aborting.
        data.citability = { error: e.message };
      }
    }

    pages.push(data);
    if (onProgress) onProgress(`[${pages.length}/${budget}] ${finalUrl} (${status}, ${data.word_count}w)`);

    // Enqueue internal links for BFS
    if (pages.length < budget) {
      for (const link of data.links) {
        if (!link.href || !/^https?:/i.test(link.href)) continue;
        const href = dropFragment(String(link.href));
        if (queued.has(href)) continue;
        if (sameOrigin && siteKey(href) !== siteRoot) continue;
        // skip obvious non-page assets
        if (assetPattern.test(href)) continue;
        queued.add(href);
        queue.push(href);
      }
    }
  }

  const indexable = pages.filter(p => p.is_indexable).length;
  const withSchema = pages.filter(p => p.schema_types.length).length;
  const missingTitle = pages.filter(p => !p.title).length;
  const missingMeta = pages.filter(p => !p.meta_desc).length;

  return {
    start,
    origin,
    pages,
    skipped,
    stats: {
      crawled: pages.length,
      skipped: skipped.length,
      queued_unvisited: queue.length,
      indexable,
      with_schema: withSchema,
      missing_title: missingTitle,
      missing_meta_desc: missingMeta,
      elapsed_ms: Date.now() - t0,
    },
  };
}
|
package/db/db.js
CHANGED
|
@@ -328,6 +328,72 @@ export function updateInsightStatus(db, id, status) {
|
|
|
328
328
|
db.prepare('UPDATE insights SET status = ? WHERE id = ?').run(status, id);
|
|
329
329
|
}
|
|
330
330
|
|
|
331
|
+
// ── Agentic loop write-back (F1, v1.5.42) ───────────────────────────────────
|
|
332
|
+
//
|
|
333
|
+
// Closes the loop's memory gap: when a draft is actually produced, the Ledger
|
|
334
|
+
// should remember it. Two moves:
|
|
335
|
+
// 1. recordDraftCreated — persist a `draft_created` insight (idempotent per
|
|
336
|
+
// topic/type/lang) so "I drafted X" is durable and visible.
|
|
337
|
+
// 2. markGapsInProgress — flip matching ACTIVE gap insights to 'in_progress'
|
|
338
|
+
// so the same gap stops resurfacing in the next blog-draft pass.
|
|
339
|
+
// Both are best-effort and must never break draft generation.
|
|
340
|
+
|
|
341
|
+
/**
 * Record that a draft was created targeting this project's Ledger.
 *
 * Upserts one `draft_created` insight keyed by (project, type, fingerprint).
 * Re-drafting the same (topic, content_type, lang) updates `last_seen` and
 * `data` on the existing row rather than inserting a second one.
 *
 * The fingerprint keeps Unicode letters and digits, so non-ASCII topics
 * (e.g. Finnish or CJK titles) stay distinguishable; every other run of
 * characters becomes a single "-". For lowercase ASCII input the result is
 * the same as with the previous ASCII-only rule.
 *
 * @param {object} db database handle exposing `prepare(sql).run(...)`
 * @param {string} project project name the insight belongs to
 * @param {object} [draft]
 * @param {string} [draft.topic] draft topic; falls back to "auto" when empty
 * @param {number|null} [draft.score=null]
 * @param {string|null} [draft.tier=null]
 * @param {number|null} [draft.wordCount=null]
 * @param {string} [draft.lang='en']
 * @param {string} [draft.contentType='blog']
 * @param {string|null} [draft.savedPath=null]
 * @returns {string} the fingerprint used
 */
export function recordDraftCreated(db, project, { topic, score = null, tier = null, wordCount = null, lang = 'en', contentType = 'blog', savedPath = null } = {}) {
  const ts = Date.now();
  const normTopic = (topic || 'auto').toLowerCase().trim().slice(0, 120);

  // Lower-case the whole key (contentType/lang included) and match with
  // Unicode property escapes: an ASCII-only class collapsed every non-ASCII
  // topic to "-", letting unrelated drafts overwrite each other on upsert.
  const fp = `draft:${contentType}:${lang}:${normTopic}`
    .toLowerCase()
    .replace(/[^\p{L}\p{N}:_-]+/gu, '-');

  const data = JSON.stringify({
    topic: topic || '(auto)', score, tier, word_count: wordCount,
    lang, content_type: contentType, saved_path: savedPath, created_at: ts,
  });

  db.prepare(`
    INSERT INTO insights (project, type, status, fingerprint, first_seen, last_seen, source_analysis_id, data)
    VALUES (?, 'draft_created', 'active', ?, ?, ?, NULL, ?)
    ON CONFLICT(project, type, fingerprint) DO UPDATE SET
      last_seen = excluded.last_seen,
      data = excluded.data
  `).run(project, fp, ts, ts, data);

  return fp;
}
|
|
363
|
+
|
|
364
|
+
/**
 * Move ACTIVE gap insights that match `topic` to status 'in_progress'.
 *
 * Only the gap types listed in GAP_TYPES below are considered. A row matches
 * when its key term (the first non-empty of keyword, phrase, topic,
 * suggested_title, title, url) contains the topic or is contained in it, or
 * when the topic occurs anywhere in those fields joined together. All
 * comparisons are lower-cased substring checks. Rows whose `data` cannot be
 * read are skipped.
 *
 * @param {object} db database handle exposing `prepare(sql).all/run`
 * @param {string} project project name to scope the lookup
 * @param {string} topic topic/keyword the draft targets
 * @returns {number} count of insights marked
 */
export function markGapsInProgress(db, project, topic) {
  if (!topic || !topic.trim()) return 0;

  const GAP_TYPES = ['keyword_gap', 'long_tail', 'content_gap', 'citability_gap', 'keyword_inventor'];
  const TERM_FIELDS = ['keyword', 'phrase', 'topic', 'suggested_title', 'title', 'url'];
  const needle = topic.toLowerCase().trim();

  const placeholders = GAP_TYPES.map(() => '?').join(',');
  const candidates = db.prepare(
    `SELECT id, data FROM insights WHERE project = ? AND status = 'active' AND type IN (${placeholders})`
  ).all(project, ...GAP_TYPES);

  const markStmt = db.prepare(`UPDATE insights SET status = 'in_progress', last_seen = ? WHERE id = ?`);
  const now = Date.now();

  let marked = 0;
  for (const row of candidates) {
    let keyTerm;
    let fullText;
    try {
      const parsed = JSON.parse(row.data);
      const present = TERM_FIELDS.map((field) => parsed[field]).filter(Boolean);
      const primary = present.length > 0 ? present[0] : '';
      keyTerm = primary.toLowerCase().trim();
      fullText = present.join(' ').toLowerCase();
    } catch {
      // Unparseable or oddly-shaped payload: leave this insight untouched.
      continue;
    }

    const keyMatches = keyTerm !== '' && (needle.includes(keyTerm) || keyTerm.includes(needle));
    const textMatches = fullText !== '' && fullText.includes(needle);
    if (!keyMatches && !textMatches) continue;

    markStmt.run(now, row.id);
    marked += 1;
  }
  return marked;
}
|
|
396
|
+
|
|
331
397
|
export function upsertDomain(db, { domain, project, role }) {
|
|
332
398
|
const now = Date.now();
|
|
333
399
|
return db.prepare(`
|
package/lib/gate.js
CHANGED
|
@@ -65,6 +65,36 @@ const FEATURE_NAMES = {
|
|
|
65
65
|
'intel-competitor': 'Intel Competitor Digest (AI-agent-ready)',
|
|
66
66
|
};
|
|
67
67
|
|
|
68
|
+
// ── Free features (v1.5.41 monetization line) ───────────────────────────────
|
|
69
|
+
//
|
|
70
|
+
// Analysis of YOUR OWN site is free: a capable agent commoditizes one-shot
|
|
71
|
+
// analysis anyway, so we give it away to make the free tier a genuine
|
|
72
|
+
// "complete SEO brain" — local, private, zero flagship tokens on grunt work.
|
|
73
|
+
//
|
|
74
|
+
// The paywall sits on what an agent structurally CANNOT do for itself:
|
|
75
|
+
// • Competitors — analyze, shallow, decay, headings-audit, entities,
|
|
76
|
+
// friction, competitive, gap-intel, intel-competitor
|
|
77
|
+
// • Automation — run (scheduler)
|
|
78
|
+
// • History — brief ("what changed"), velocity (publish-rate over time)
|
|
79
|
+
//
|
|
80
|
+
// Anything listed here passes requirePro() regardless of license tier.
|
|
81
|
+
const FREE_FEATURES = new Set([
  // local Ollama labor — the grunt work that powers the free brain
  'extract',
  // AI citability scoring (pure function, own site)
  'aeo',
  // keyword intelligence (own site)
  'keywords',
  // programmatic template detection (own site)
  'templates',
  // orphan entity detection (own site)
  'orphans',
  // JS rendering delta (own site, technical)
  'js-delta',
  // AEO blog draft from the Ledger (own site, content)
  'blog-draft',
  // HTML dashboard
  'html',
  // HTML dashboard (all projects)
  'html-all',
  // Search Console intelligence (own site)
  'gsc-insights',
  // agent-ready audit digest (own site)
  'intel-audit',
  // agent-ready blog digest (own site)
  'intel-blog',
  // one-shot single-domain audit — the first-touch "try it" command
  'scan',
  // content-loop orchestrator (own-site gaps → draft → queue)
  'loop',
]);
|
|
97
|
+
|
|
68
98
|
// ── CLI Gate — blocks command and shows upgrade message ──────────────────────
|
|
69
99
|
|
|
70
100
|
/**
|
|
@@ -75,6 +105,7 @@ const FEATURE_NAMES = {
|
|
|
75
105
|
* @returns {boolean}
|
|
76
106
|
*/
|
|
77
107
|
export function requirePro(feature) {
|
|
108
|
+
if (FREE_FEATURES.has(feature)) return true;
|
|
78
109
|
if (isPro()) return true;
|
|
79
110
|
|
|
80
111
|
const displayName = FEATURE_NAMES[feature] || feature;
|
|
@@ -178,7 +209,8 @@ export function printLicenseStatus() {
|
|
|
178
209
|
if (license.stale) console.log(`\x1b[33m ⚠ License cache stale — will re-validate on next network access${RESET}`);
|
|
179
210
|
} else {
|
|
180
211
|
console.log(`${DIM} SEO Intel Free${RESET}`);
|
|
181
|
-
console.log(`${DIM} Unlimited crawl ·
|
|
212
|
+
console.log(`${DIM} Unlimited crawl · Full site analysis · Dashboard · Local & private${RESET}`);
|
|
213
|
+
console.log(`${DIM} Solo adds: competitors · scheduled crawls · history & trends${RESET}`);
|
|
182
214
|
if (license.invalidKey) {
|
|
183
215
|
console.log(`\x1b[33m ⚠ ${license.reason}${RESET}`);
|
|
184
216
|
}
|
package/lib/intel.js
CHANGED
|
@@ -8,10 +8,14 @@
|
|
|
8
8
|
*
|
|
9
9
|
* Slices:
|
|
10
10
|
* raw (free) — page/keyword/heading inventory, no analysis
|
|
11
|
-
* audit (
|
|
12
|
-
* blog (
|
|
11
|
+
* audit (free) — citability + technical + active insights (your own site)
|
|
12
|
+
* blog (free) — gaps + tone hints for drafting (your own site)
|
|
13
13
|
* competitor (paid) — competitor summary + schema landscape
|
|
14
14
|
*
|
|
15
|
+
* Monetization line (v1.5.41): analysis of YOUR OWN site is free — a smart
|
|
16
|
+
* agent commoditizes one-shot analysis anyway. The paywall sits on the things
|
|
17
|
+
* an agent structurally can't do for itself: competitors, automation, history.
|
|
18
|
+
*
|
|
15
19
|
* Output is a stable structured object — agents should be able to chain calls
|
|
16
20
|
* without prompt gymnastics. Keep the schema additive across versions.
|
|
17
21
|
*/
|
|
@@ -26,7 +30,9 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
|
26
30
|
const VERSION = JSON.parse(readFileSync(join(__dirname, '..', 'package.json'), 'utf8')).version;
|
|
27
31
|
|
|
28
32
|
export const INTEL_SLICES = ['raw', 'audit', 'blog', 'competitor'];
|
|
29
|
-
|
|
33
|
+
// Own-site slices are free; only the competitor slice (data the agent can't
|
|
34
|
+
// gather on its own) requires Solo.
|
|
35
|
+
export const FREE_SLICES = ['raw', 'audit', 'blog'];
|
|
30
36
|
|
|
31
37
|
/**
|
|
32
38
|
* @param {import('node:sqlite').DatabaseSync} db
|
package/mcp/server.js
CHANGED
|
@@ -25,14 +25,16 @@ import { spawn } from 'child_process';
|
|
|
25
25
|
import { dirname, join } from 'path';
|
|
26
26
|
import { fileURLToPath } from 'url';
|
|
27
27
|
|
|
28
|
-
import { getDb, insertAgentInsight, AGENT_INSIGHT_TYPES, getActiveInsights, getCompetitorSummary } from '../db/db.js';
|
|
28
|
+
import { getDb, insertAgentInsight, AGENT_INSIGHT_TYPES, getActiveInsights, getCompetitorSummary, recordDraftCreated, markGapsInProgress } from '../db/db.js';
|
|
29
29
|
import { getIntel, INTEL_SLICES, FREE_SLICES } from '../lib/intel.js';
|
|
30
30
|
import { isPro } from '../lib/license.js';
|
|
31
31
|
import { readProgress } from '../lib/progress.js';
|
|
32
32
|
import { getProblems, getProblemCounts, markProblemStatus, getActiveStatusMap, PROBLEM_CATEGORIES, PROBLEM_STATUSES } from '../lib/problems.js';
|
|
33
33
|
|
|
34
34
|
import { runAeoAnalysis, persistAeoScores, upsertCitabilityInsights } from '../analyses/aeo/index.js';
|
|
35
|
-
import { prescore } from '../analyses/blog-draft/prescorer.js';
|
|
35
|
+
import { prescore, extractDraftTopic } from '../analyses/blog-draft/prescorer.js';
|
|
36
|
+
import { lightCrawl } from '../crawler/light.js';
|
|
37
|
+
import { runContentLoop } from '../analyses/loop/orchestrator.js';
|
|
36
38
|
import { gatherBlogDraftContext, buildBlogDraftPrompt } from '../analyses/blog-draft/index.js';
|
|
37
39
|
|
|
38
40
|
// ── Helpers ────────────────────────────────────────────────────────────────
|
|
@@ -357,6 +359,88 @@ server.registerTool(
|
|
|
357
359
|
}
|
|
358
360
|
);
|
|
359
361
|
|
|
362
|
+
// ── Tool: crawl_site (free — zero-config, zero-signup, local, lightweight) ──
|
|
363
|
+
// "Crawl for all Claude users": point it at a URL and it BFS-crawls same-origin
|
|
364
|
+
// pages with plain fetch (no browser, no project config, nothing persisted,
|
|
365
|
+
// nothing leaves the machine). For deep/JS-rendered/persistent crawls, the user
|
|
366
|
+
// installs seo-intel and runs `seo-intel crawl`.
|
|
367
|
+
server.registerTool(
  'crawl_site',
  {
    description: [
      'Crawl a website ad-hoc and return structured SEO/AEO data — no project setup, no account, no API key, nothing saved. Point it at any URL.',
      '',
      'Lightweight by design: plain HTTP fetch (no browser/JS rendering), same-origin BFS, honours robots.txt + crawl-delay, small page budget (default 10, hard cap 50). Returns title, meta, headings, links, JSON-LD schema types, word count, indexability — optionally a per-page AI-citability (AEO) score.',
      '',
      'Limits: JS-rendered/SPA pages under-report content (use the full `seo-intel crawl` with Playwright for those). Results are ephemeral — for persistent history, the Intelligence Ledger, and competitor analysis, install seo-intel (still local, own-site free). Free tier.',
    ].join('\n'),
    inputSchema: {
      url: z.string().describe('Start URL (scheme optional — "example.com" works). The crawl follows same-origin links from here.'),
      max_pages: z.number().int().positive().optional().describe('Pages to fetch (default 10, hard cap 50).'),
      include_citability: z.boolean().optional().describe('Run the AEO citability scorer per page (default false). Note: light mode does no entity extraction, so entity-authority is under-counted — run `seo-intel aeo` for the full score.'),
      same_origin: z.boolean().optional().describe('Only follow links on the start site (default true). www/non-www and http/https are treated as the same site.'),
    },
  },
  // Handler: run the light crawler, then shrink its result to a token-friendly
  // payload (no body text, link counts instead of full link lists).
  async ({ url, max_pages, include_citability, same_origin }) => {
    // Per-page summary; `citability` is appended last and only when present.
    const summarizePage = (page) => {
      const summary = {
        url: page.url,
        status_code: page.status_code,
        title: page.title,
        meta_desc: page.meta_desc,
        canonical: page.canonical || null,
        is_indexable: page.is_indexable,
        word_count: page.word_count,
        headings: page.headings.slice(0, 40),
        schema_types: page.schema_types,
        published_date: page.published_date,
        modified_date: page.modified_date,
        internal_links: page.links.reduce((total, link) => total + (link.internal ? 1 : 0), 0),
        external_links: page.links.reduce((total, link) => total + (link.internal ? 0 : 1), 0),
      };
      if (page.citability) summary.citability = page.citability;
      return summary;
    };

    // Up to 50 unique internal URLs that were linked but not crawled, in
    // first-seen order (a Set keeps insertion order and drops repeats).
    const collectDiscovered = (crawledPages) => {
      const crawledUrls = new Set(crawledPages.map((page) => page.url));
      const found = new Set();
      scan: for (const page of crawledPages) {
        for (const link of page.links) {
          if (!link.internal || crawledUrls.has(link.href)) continue;
          found.add(link.href);
          if (found.size >= 50) break scan;
        }
      }
      return [...found];
    };

    try {
      const crawl = await lightCrawl(url, {
        maxPages: max_pages ?? 10,
        includeCitability: include_citability ?? false,
        sameOrigin: same_origin ?? true,
      });

      const out = {
        start: crawl.start,
        origin: crawl.origin,
        stats: crawl.stats,
        pages: crawl.pages.map(summarizePage),
        discovered_internal_urls: collectDiscovered(crawl.pages),
        skipped: crawl.skipped,
        notice: 'Ephemeral + local — nothing was saved and nothing left this machine. Light mode does not render JavaScript, so SPA/JS-built pages under-report content; use `seo-intel crawl` (Playwright) for those. For persistent history, the Intelligence Ledger, AI-citability over time, and competitor analysis, install seo-intel — own-site stays free.',
      };

      return {
        content: [{ type: 'text', text: JSON.stringify(out, null, 2) }],
        structuredContent: out,
      };
    } catch (err) {
      return { content: [{ type: 'text', text: `seo-intel crawl_site error: ${err.message}` }], isError: true };
    }
  }
);
|
|
443
|
+
|
|
360
444
|
// ── Tool: ingest_insight (free — write-back closes the loop) ──────────────
|
|
361
445
|
server.registerTool(
|
|
362
446
|
'ingest_insight',
|
|
@@ -414,18 +498,17 @@ server.registerTool(
|
|
|
414
498
|
}
|
|
415
499
|
);
|
|
416
500
|
|
|
417
|
-
// ── Tool: run_citability_audit (
|
|
501
|
+
// ── Tool: run_citability_audit (FREE) ─────────────────────────────────────
|
|
418
502
|
server.registerTool(
|
|
419
503
|
'run_citability_audit',
|
|
420
504
|
{
|
|
421
|
-
description: 'Run AEO citability scoring across all crawled pages (6 signals: entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage). Persists scores to citability_scores and upserts citability_gap insights into the ledger. Pure function — fast, no LLM calls.
|
|
505
|
+
description: 'Run AEO citability scoring across all crawled pages (6 signals: entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage). Persists scores to citability_scores and upserts citability_gap insights into the ledger. Pure function — fast, no LLM calls. Free tier — analysis of your own site is free.',
|
|
422
506
|
inputSchema: {
|
|
423
507
|
project: z.string(),
|
|
424
508
|
include_competitors: z.boolean().optional().describe('Score competitor pages too (default true)'),
|
|
425
509
|
},
|
|
426
510
|
},
|
|
427
511
|
async ({ project, include_competitors = true }) => {
|
|
428
|
-
if (!isPro()) return paidGate('run_citability_audit');
|
|
429
512
|
if (!loadProjectConfig(project)) {
|
|
430
513
|
return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
|
|
431
514
|
}
|
|
@@ -497,17 +580,18 @@ server.registerTool(
|
|
|
497
580
|
}
|
|
498
581
|
);
|
|
499
582
|
|
|
500
|
-
// ── Tool: prescore_draft (
|
|
583
|
+
// ── Tool: prescore_draft (FREE) ───────────────────────────────────────────
|
|
501
584
|
server.registerTool(
|
|
502
585
|
'prescore_draft',
|
|
503
586
|
{
|
|
504
|
-
description: 'Run the AEO scorer on a markdown draft before publishing. Returns the same 6-signal breakdown the dashboard uses (entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage) plus the overall 0-100 score and tier (excellent / good / fair / poor). Use this as a pre-publish gate when drafting via draft_blog_prompt — score < 60 means revise.
|
|
587
|
+
description: 'Run the AEO scorer on a markdown draft before publishing. Returns the same 6-signal breakdown the dashboard uses (entity authority, structured claims, answer density, Q&A proximity, freshness, schema coverage) plus the overall 0-100 score and tier (excellent / good / fair / poor). Use this as a pre-publish gate when drafting via draft_blog_prompt — score < 60 means revise. Free tier. Pass `project` (and optionally `topic`) to close the loop: the draft is recorded in the Ledger and matching gaps are marked in_progress so they stop resurfacing.',
|
|
505
588
|
inputSchema: {
|
|
506
589
|
draft_md: z.string().describe('Full markdown of the draft, including YAML frontmatter if present. The scorer extracts headings, word count, schema_type from frontmatter, etc.'),
|
|
590
|
+
project: z.string().optional().describe('If set, the scored draft is written back to this project\'s Intelligence Ledger (records a draft_created insight + marks matching gaps in_progress). Omit for a pure, stateless score.'),
|
|
591
|
+
topic: z.string().optional().describe('The topic/keyword this draft targets. Used to match gaps for the in_progress write-back. If omitted, recovered from the draft\'s frontmatter title or first H1.'),
|
|
507
592
|
},
|
|
508
593
|
},
|
|
509
|
-
async ({ draft_md }) => {
|
|
510
|
-
if (!isPro()) return paidGate('prescore_draft');
|
|
594
|
+
async ({ draft_md, project, topic }) => {
|
|
511
595
|
try {
|
|
512
596
|
const score = prescore(draft_md);
|
|
513
597
|
const out = {
|
|
@@ -520,6 +604,33 @@ server.registerTool(
|
|
|
520
604
|
? 'Draft scores well. Safe to publish.'
|
|
521
605
|
: 'Below 60 — consider strengthening: add FAQ schema for Q&A proximity, increase entity authority via named experts/citations, shorten paragraphs for answer density, add structured claims (numbers/dates).',
|
|
522
606
|
};
|
|
607
|
+
|
|
608
|
+
// F1 (v1.5.42): loop write-back — only when a project is supplied, and
|
|
609
|
+
// best-effort so a Ledger hiccup never fails the score.
|
|
610
|
+
if (project && loadProjectConfig(project)) {
|
|
611
|
+
try {
|
|
612
|
+
const db = getDb();
|
|
613
|
+
const effectiveTopic = topic || extractDraftTopic(draft_md);
|
|
614
|
+
recordDraftCreated(db, project, {
|
|
615
|
+
topic: effectiveTopic,
|
|
616
|
+
score: score.score,
|
|
617
|
+
tier: score.tier,
|
|
618
|
+
wordCount: score.wordCount,
|
|
619
|
+
});
|
|
620
|
+
const marked = markGapsInProgress(db, project, effectiveTopic);
|
|
621
|
+
out.ledger = {
|
|
622
|
+
recorded: true,
|
|
623
|
+
topic: effectiveTopic || '(auto)',
|
|
624
|
+
gaps_marked_in_progress: marked,
|
|
625
|
+
note: marked > 0
|
|
626
|
+
? `${marked} matching gap(s) marked in_progress — they stop resurfacing until a re-audit re-scores the published page.`
|
|
627
|
+
: 'Draft recorded; no active gaps matched the topic.',
|
|
628
|
+
};
|
|
629
|
+
} catch (e) {
|
|
630
|
+
out.ledger = { recorded: false, error: e.message };
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
|
|
523
634
|
return {
|
|
524
635
|
content: [{ type: 'text', text: JSON.stringify(out, null, 2) }],
|
|
525
636
|
structuredContent: out,
|
|
@@ -530,11 +641,11 @@ server.registerTool(
|
|
|
530
641
|
}
|
|
531
642
|
);
|
|
532
643
|
|
|
533
|
-
// ── Tool: draft_blog_prompt (
|
|
644
|
+
// ── Tool: draft_blog_prompt (FREE) ────────────────────────────────────────
|
|
534
645
|
server.registerTool(
|
|
535
646
|
'draft_blog_prompt',
|
|
536
647
|
{
|
|
537
|
-
description: 'Generate an AEO-aware blog draft prompt seeded with full project context — keyword gaps, citability gaps, top entities, brand voice notes, competitor heading patterns. The agent\'s own LLM writes the draft using this prompt. Pair with prescore_draft for a write→score→revise loop.
|
|
648
|
+
description: 'Generate an AEO-aware blog draft prompt seeded with full project context — keyword gaps, citability gaps, top entities, brand voice notes, competitor heading patterns. The agent\'s own LLM writes the draft using this prompt. Pair with prescore_draft for a write→score→revise loop. Free tier.',
|
|
538
649
|
inputSchema: {
|
|
539
650
|
project: z.string(),
|
|
540
651
|
topic: z.string().optional().describe('Specific topic to draft about. If omitted, the prompt asks the LLM to pick the highest-leverage topic from the gap data.'),
|
|
@@ -543,7 +654,6 @@ server.registerTool(
|
|
|
543
654
|
},
|
|
544
655
|
},
|
|
545
656
|
async ({ project, topic, lang = 'en', content_type = 'blog' }) => {
|
|
546
|
-
if (!isPro()) return paidGate('draft_blog_prompt');
|
|
547
657
|
const config = loadProjectConfig(project);
|
|
548
658
|
if (!config) {
|
|
549
659
|
return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
|
|
@@ -571,9 +681,54 @@ server.registerTool(
|
|
|
571
681
|
}
|
|
572
682
|
);
|
|
573
683
|
|
|
684
|
+
// ── Tool: run_content_loop (free — the one-call content loop) ─────────────
|
|
685
|
+
// Walks gap → draft → prescore → queue. In MCP the agent's own LLM is the
|
|
686
|
+
// writer, so this runs in HAND-BACK mode: it ranks the gaps, picks the highest-
|
|
687
|
+
// leverage one(s), and returns a seeded prompt per gap. The agent writes the
|
|
688
|
+
// draft, then calls prescore_draft(project, topic) to score + close the loop.
|
|
689
|
+
server.registerTool(
|
|
690
|
+
'run_content_loop',
|
|
691
|
+
{
|
|
692
|
+
description: [
|
|
693
|
+
'Run the content loop for a project in one call: ranks the open gaps in the Intelligence Ledger by leverage (priority × source × AI-intent), picks the highest, and returns an AEO-aware draft prompt seeded with full context.',
|
|
694
|
+
'',
|
|
695
|
+
'Hand-back by design — your own LLM writes the draft from the returned prompt, then you call prescore_draft(project, topic) to AEO-score it and close the loop (records the draft, marks the gap in_progress). Use dry_run to just see which gap it would target. Free tier.',
|
|
696
|
+
].join('\n'),
|
|
697
|
+
inputSchema: {
|
|
698
|
+
project: z.string(),
|
|
699
|
+
topic: z.string().optional().describe('Focus a specific topic instead of auto-picking the top gap.'),
|
|
700
|
+
count: z.number().int().positive().optional().describe('Return prompts for the top N gaps (default 1).'),
|
|
701
|
+
lang: z.enum(['en', 'fi']).optional(),
|
|
702
|
+
content_type: z.enum(['blog', 'article', 'guide', 'docs', 'social']).optional(),
|
|
703
|
+
dry_run: z.boolean().optional().describe('Only rank + select the gap(s); do not build prompts.'),
|
|
704
|
+
},
|
|
705
|
+
},
|
|
706
|
+
async ({ project, topic, count, lang = 'en', content_type = 'blog', dry_run }) => {
|
|
707
|
+
const config = loadProjectConfig(project);
|
|
708
|
+
if (!config) {
|
|
709
|
+
return { content: [{ type: 'text', text: `Project "${project}" not found. Use list_projects to discover.` }], isError: true };
|
|
710
|
+
}
|
|
711
|
+
try {
|
|
712
|
+
const db = getDb();
|
|
713
|
+
const result = await runContentLoop(db, project, {
|
|
714
|
+
config, topic: topic || null, count: count || 1, lang, contentType: content_type,
|
|
715
|
+
dryRun: !!dry_run, generate: null, // hand-back: the agent writes
|
|
716
|
+
});
|
|
717
|
+
return {
|
|
718
|
+
content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
|
|
719
|
+
structuredContent: result,
|
|
720
|
+
};
|
|
721
|
+
} catch (err) {
|
|
722
|
+
return { content: [{ type: 'text', text: `seo-intel run_content_loop error: ${err.message}` }], isError: true };
|
|
723
|
+
}
|
|
724
|
+
}
|
|
725
|
+
);
|
|
726
|
+
|
|
574
727
|
// ── Tool: export_intel (firehose; free tables + paid tables) ──────────────
|
|
575
|
-
|
|
576
|
-
|
|
728
|
+
// v1.5.41: own-site derived data (extractions, schemas, citability, the
|
|
729
|
+
// ledger) is free — only the competitor gap analysis (`analyses`) is paid.
|
|
730
|
+
const FREE_EXPORT_TABLES = ['pages', 'keywords', 'headings', 'links', 'technical', 'sitemap_urls', 'extractions', 'page_schemas', 'citability_scores', 'insights'];
|
|
731
|
+
const PAID_EXPORT_TABLES = ['analyses'];
|
|
577
732
|
const ALL_EXPORT_TABLES = [...FREE_EXPORT_TABLES, ...PAID_EXPORT_TABLES];
|
|
578
733
|
|
|
579
734
|
const EXPORT_TABLE_QUERIES = {
|
|
@@ -597,7 +752,7 @@ const MAX_MAX_ROWS_PER_TABLE = 50000;
|
|
|
597
752
|
function buildExportNotice({ tokens, bytes, free, paidRequested, paidExcluded, anyTruncated, maxRowsPerTable }) {
|
|
598
753
|
const tooBig = tokens > 50000;
|
|
599
754
|
const upgradeBlurb = free
|
|
600
|
-
? `\n\n📦
|
|
755
|
+
? `\n\n📦 Table NOT in this response (requires SEO Intel Solo, €19.99/mo — vs Ahrefs ~$129/mo): ${PAID_EXPORT_TABLES.join(', ')}.\n That's the competitor gap-analysis history (keyword_gaps, content_gaps, positioning, quick_wins). Everything about YOUR OWN site — extractions, schemas, citability scores, and the Intelligence Ledger — is free.\n Free pre-parsed digests: get_intel(for=audit|blog), run_citability_audit, prescore_draft, draft_blog_prompt. Solo adds competitor synthesis: get_competitor_positioning + get_intel(for=competitor).`
|
|
601
756
|
: `\n\nYou have Solo. Paid tables in this export: ${(paidRequested || []).join(', ') || '(none requested)'}.`;
|
|
602
757
|
|
|
603
758
|
const sizeLine = tooBig
|
|
@@ -630,7 +785,7 @@ server.registerTool(
|
|
|
630
785
|
'export_intel',
|
|
631
786
|
{
|
|
632
787
|
description: [
|
|
633
|
-
'Bulk export of raw structured intelligence — pages, keywords, headings, links, technical, sitemap URLs
|
|
788
|
+
'Bulk export of raw structured intelligence — pages, keywords, headings, links, technical, sitemap URLs, extractions, schemas, citability scores, and the Intelligence Ledger (all free), plus the competitor gap-analysis history (Solo). Mirrors `seo-intel export --full <project>` as a single MCP call.',
|
|
634
789
|
'',
|
|
635
790
|
'⚠️ FIREHOSE WARNING: this is raw rows, not summaries. For carbium-sized projects it can be 5–10 MB / 200k+ tokens. The response includes a `notice` field telling the agent how to handle it (pipe to file, use other tools, or upgrade). Agents SHOULD NOT paste the response wholesale into their context — read the `notice` first, then either query selectively or save to a file.',
|
|
636
791
|
'',
|
|
@@ -834,7 +989,7 @@ async function main() {
|
|
|
834
989
|
const transport = new StdioServerTransport();
|
|
835
990
|
await server.connect(transport);
|
|
836
991
|
// stderr is fine; the host typically surfaces this in its MCP logs panel.
|
|
837
|
-
console.error(`[seo-intel-mcp] v${VERSION} ready on stdio.
|
|
992
|
+
console.error(`[seo-intel-mcp] v${VERSION} ready on stdio. 17 tools — free: crawl_site (ad-hoc, any URL, no config), run_content_loop (gap→draft→close), list_projects, list_problems, mark_problem_status, get_intel(raw/audit/blog), get_pages, list_keywords, get_headings, run_crawl, get_crawl_status, ingest_insight, run_citability_audit, prescore_draft, draft_blog_prompt, export_intel (own-site tables); Solo (competitor synthesis): get_competitor_positioning, get_intel(competitor), export_intel (analyses table).`);
|
|
838
993
|
}
|
|
839
994
|
|
|
840
995
|
main().catch(err => {
|