seo-intel 1.5.39 → 1.5.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/analyses/blog-draft/prescorer.js +17 -0
- package/analyses/loop/orchestrator.js +179 -0
- package/cli.js +197 -6
- package/crawler/html-extract.js +127 -0
- package/crawler/light.js +169 -0
- package/db/db.js +66 -0
- package/lib/cron.js +108 -0
- package/lib/gate.js +33 -1
- package/lib/intel.js +9 -3
- package/mcp/server.js +172 -17
- package/package.json +1 -1
- package/reports/generate-html.js +42 -404
- package/setup/web-routes.js +39 -0
- package/setup/wizard.html +73 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight HTML extractor — pure string/regex parsing. No DOM, no browser.
|
|
3
|
+
*
|
|
4
|
+
* Powers the fetch-based light crawler (crawler/light.js) so ANY Claude user can
|
|
5
|
+
* crawl + analyze a site with zero browser environment installed. Consistent
|
|
6
|
+
* with schema-parser.js's regex approach ("no DOM parser needed").
|
|
7
|
+
*
|
|
8
|
+
* Trade-off: not as bulletproof as a full DOM parse on adversarial markup, but
|
|
9
|
+
* more than good enough for SEO/AEO metadata (title, meta, headings, links,
|
|
10
|
+
* JSON-LD, dates). The full Playwright crawler stays the heavyweight option.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { stripHtml } from './sanitize.js';
|
|
14
|
+
import { parseJsonLd } from './schema-parser.js';
|
|
15
|
+
|
|
16
|
+
// Named entities this extractor understands, mapped to their literal characters.
const NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ' };

/**
 * Decode the small set of HTML entities that matter for SEO metadata.
 *
 * Handles &amp; &lt; &gt; &quot; &apos; &nbsp; plus decimal (&#39;) and
 * hexadecimal (&#x27;) numeric references, then trims the result.
 *
 * Decoding happens in ONE pass so already-decoded output is never decoded a
 * second time: "&amp;lt;" becomes the text "&lt;", not "<".
 *
 * @param {string} [s] raw text; any falsy value is treated as ''
 * @returns {string} decoded, trimmed text
 */
function decodeEntities(s) {
  return (s || '')
    .replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g, (whole, dec, hex, name) => {
      if (name) return NAMED_ENTITIES[name];
      const codePoint = dec !== undefined ? Number(dec) : parseInt(hex, 16);
      // Out-of-range code points make fromCodePoint throw; degrade to a space.
      try { return String.fromCodePoint(codePoint); } catch { return ' '; }
    })
    .trim();
}
|
|
25
|
+
|
|
26
|
+
// Reduce an HTML fragment to one line of plain text: strip tags, squeeze every
// whitespace run down to a single space, then decode entities (which also trims).
const collapse = (s) => {
  const withoutTags = stripHtml(s || '');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return decodeEntities(singleSpaced);
};
|
|
27
|
+
|
|
28
|
+
/**
 * Read the text of the first <title> element.
 *
 * @param {string} html full document markup
 * @returns {string} whitespace-collapsed, entity-decoded title, or '' if absent
 */
export function extractTitle(html) {
  const found = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  if (!found) return '';
  const oneLine = found[1].replace(/\s+/g, ' ');
  return decodeEntities(oneLine);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Find a <meta> tag by attribute (name|property) = value, then read its content.
 *
 * - `attr` and `value` are escaped before being placed into the pattern, so
 *   characters such as "." or ":" are matched literally.
 * - The content value is delimited by its OWN opening quote, so an apostrophe
 *   inside a double-quoted value (content="It's ...") no longer truncates it.
 * - Only a real `content` attribute is read; `data-content` and similar are ignored.
 *
 * @param {string} html  document markup
 * @param {string} attr  attribute to match on, e.g. 'name' or 'property'
 * @param {string} value required attribute value, e.g. 'description'
 * @returns {string} entity-decoded content, or '' when tag/attribute is missing
 */
function metaContent(html, attr, value) {
  const escapeRe = (text) => String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const tagRe = new RegExp(`<meta\\b[^>]*\\b${escapeRe(attr)}\\s*=\\s*["']${escapeRe(value)}["'][^>]*>`, 'i');
  const tag = html.match(tagRe);
  if (!tag) return '';
  // \1 forces the closing quote to be the same character as the opening one.
  const c = tag[0].match(/(?<![\w-])content\s*=\s*(["'])([\s\S]*?)\1/i);
  return c ? decodeEntities(c[2]) : '';
}
|
|
41
|
+
|
|
42
|
+
/**
 * Page description: the standard meta description when it is non-empty,
 * otherwise whatever og:description holds.
 *
 * @param {string} html document markup
 * @returns {string} description text, or '' when neither tag yields one
 */
export function extractMetaDescription(html) {
  const standard = metaContent(html, 'name', 'description');
  if (standard) return standard;
  return metaContent(html, 'property', 'og:description');
}
|
|
45
|
+
|
|
46
|
+
/**
 * Lower-cased value of <meta name="robots">, or '' when the tag is absent.
 *
 * @param {string} html document markup
 * @returns {string}
 */
export function extractMetaRobots(html) {
  const directives = metaContent(html, 'name', 'robots');
  return directives.toLowerCase();
}
|
|
49
|
+
|
|
50
|
+
/**
 * Resolve the page's <link rel="canonical"> to an absolute URL.
 *
 * The href is entity-decoded before resolution (attribute values routinely
 * carry "&amp;" in query strings), and only a real `href` attribute is read —
 * `data-href` and similar are ignored.
 *
 * @param {string} html    document markup
 * @param {string} baseUrl URL the document was fetched from; base for relative hrefs
 * @returns {string} absolute canonical URL; the raw href if it cannot be
 *                   resolved against baseUrl; '' when there is no canonical link
 */
export function extractCanonical(html, baseUrl) {
  const tag = html.match(/<link\b[^>]*\brel\s*=\s*["']canonical["'][^>]*>/i);
  if (!tag) return '';
  // \1 makes the closing quote match the opening one.
  const hrefMatch = tag[0].match(/(?<![\w-])href\s*=\s*(["'])([\s\S]*?)\1/i);
  const href = hrefMatch ? decodeEntities(hrefMatch[2]) : '';
  if (!href) return '';
  try { return new URL(href, baseUrl).toString(); } catch { return href; }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Collect <h1>–<h6> headings in document order.
 *
 * Headings whose text is empty after tag stripping are dropped. Each text is
 * capped at 300 characters and the list at 300 entries.
 *
 * @param {string} html document markup
 * @returns {Array<{level: number, text: string}>}
 */
export function extractHeadings(html) {
  const headingRe = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
  const headings = [];
  for (let hit = headingRe.exec(html); hit !== null; hit = headingRe.exec(html)) {
    const label = collapse(hit[2]);
    if (label) {
      headings.push({ level: Number(hit[1]), text: label.slice(0, 300) });
    }
    if (headings.length >= 300) break;
  }
  return headings;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Collect unique, absolute link targets from <a href="…"> elements.
 *
 * - hrefs are entity-decoded before resolution ("&amp;" in a query string
 *   must become "&", otherwise the crawler requests a different URL).
 * - Only a real `href` attribute is read (not `data-href`), and the value is
 *   delimited by its own opening quote.
 * - Fragment-only, mailto:, tel:, javascript: and data: links are skipped.
 * - Fragments are removed and results de-duplicated on the resulting URL.
 * - At most 1000 links are returned; anchor text is capped at 120 characters.
 *
 * @param {string} html    document markup
 * @param {string} baseUrl URL the document was fetched from
 * @returns {Array<{href: string, text: string, internal: boolean}>}
 *          `internal` is true when the link's hostname equals baseUrl's hostname
 */
export function extractLinks(html, baseUrl) {
  const out = [];
  const seen = new Set();
  let base;
  try { base = new URL(baseUrl); } catch { base = null; }
  // Group 1 = double-quoted href, group 2 = single-quoted href, group 3 = anchor body.
  const re = /<a\b[^>]*?(?<![\w-])href\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi;
  let m;
  while ((m = re.exec(html)) !== null) {
    const rawHref = m[1] !== undefined ? m[1] : m[2];
    const href = decodeEntities(rawHref); // decodes and trims
    if (!href) continue;
    if (/^(#|mailto:|tel:|javascript:|data:)/i.test(href)) continue;
    let abs;
    try { abs = base ? new URL(href, base).toString() : href; } catch { continue; }
    abs = abs.split('#')[0];
    if (seen.has(abs)) continue;
    seen.add(abs);
    let internal = false;
    try { internal = !!base && new URL(abs).hostname === base.hostname; } catch { /* keep false */ }
    out.push({ href: abs, text: collapse(m[3]).slice(0, 120), internal });
    if (out.length >= 1000) break;
  }
  return out;
}
|
|
92
|
+
|
|
93
|
+
/**
 * Parse one fetched HTML document into the structured shape the rest of
 * SEO Intel speaks (mirrors the Playwright crawler's per-page object).
 *
 * `is_indexable` is false when the robots meta tag contains `noindex` OR
 * `none` (`none` is shorthand for `noindex, nofollow`).
 *
 * @param {string} html
 * @param {string} url - the (final) URL this HTML was fetched from
 * @returns {object} page record: url, title, meta_desc, canonical, robots,
 *   is_indexable, headings, links, schema_types, schemas, word_count,
 *   body_text (first 20000 chars), published_date, modified_date
 */
export function extractPageData(html, url) {
  // Each schema entry is read for `type`, `datePublished` and `dateModified`.
  const schemas = parseJsonLd(html) || [];
  const schemaTypes = [...new Set(schemas.map(s => s.type).filter(Boolean))];

  // First schema that carries each date wins.
  let published = null;
  let modified = null;
  for (const s of schemas) {
    if (!published && s.datePublished) published = s.datePublished;
    if (!modified && s.dateModified) modified = s.dateModified;
  }

  const bodyText = stripHtml(html);
  const wordCount = bodyText ? bodyText.split(/\s+/).filter(Boolean).length : 0;
  const robots = extractMetaRobots(html); // already lower-cased

  return {
    url,
    title: extractTitle(html),
    meta_desc: extractMetaDescription(html),
    canonical: extractCanonical(html, url),
    robots,
    is_indexable: !/\b(?:noindex|none)\b/.test(robots),
    headings: extractHeadings(html),
    links: extractLinks(html, url),
    schema_types: schemaTypes,
    schemas,
    word_count: wordCount,
    body_text: bodyText.slice(0, 20000),
    published_date: published,
    modified_date: modified,
  };
}
|
package/crawler/light.js
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Light crawler — fetch-based, zero-browser, zero-config, zero-signup.
|
|
3
|
+
*
|
|
4
|
+
* The "crawl for all Claude users" path: point it at a URL and it BFS-crawls
|
|
5
|
+
* same-origin pages with plain HTTP fetch (no Playwright, no browser download),
|
|
6
|
+
* returns structured SEO/AEO data entirely in memory. Nothing is persisted,
|
|
7
|
+
* nothing leaves the machine, no account required.
|
|
8
|
+
*
|
|
9
|
+
* Deliberately NOT a "massive crawl environment":
|
|
10
|
+
* - small page budget (default 10, hard cap 50)
|
|
11
|
+
* - same-origin only by default
|
|
12
|
+
* - honours robots.txt + crawl-delay (no tricks)
|
|
13
|
+
* - no JS rendering (use the full Playwright crawler for JS-heavy sites)
|
|
14
|
+
*
|
|
15
|
+
* For deep, persistent, JS-rendered crawls of a configured project, use the
|
|
16
|
+
* heavyweight crawler (`crawler/index.js` via `seo-intel crawl`).
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import fetch from 'node-fetch';
|
|
20
|
+
import { checkRobots } from './robots.js';
|
|
21
|
+
import { extractPageData } from './html-extract.js';
|
|
22
|
+
import { scorePage } from '../analyses/aeo/scorer.js';
|
|
23
|
+
|
|
24
|
+
// Absolute ceiling on pages per light crawl, whatever the caller requests.
const HARD_CAP = 50;
// User-Agent header sent with every light-crawl request.
const DEFAULT_UA = 'SEOIntelBot (+https://ukkometa.fi/seo-intel; light-crawl)';
|
|
26
|
+
|
|
27
|
+
// Promise that resolves after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
28
|
+
|
|
29
|
+
/**
 * Turn user input into a canonical absolute URL string.
 * Input without an http(s) scheme is assumed to be https.
 *
 * @param {string} url e.g. "example.com" or "http://example.com/a"
 * @returns {string} normalised URL
 * @throws {TypeError} when the result is not a parseable URL
 */
function normalizeStart(url) {
  const trimmed = url.trim();
  const hasScheme = /^https?:\/\//i.test(trimmed);
  const candidate = hasScheme ? trimmed : 'https://' + trimmed;
  return new URL(candidate).toString();
}
|
|
34
|
+
|
|
35
|
+
/**
 * Same-site key: the URL's hostname, lower-cased, without a leading "www.".
 * The protocol is ignored, so http↔https and www↔non-www variants of a site
 * compare equal and redirects between them don't break link following.
 *
 * @param {string} u absolute URL
 * @returns {string|null} site key, or null when `u` is not a valid URL
 */
function siteKey(u) {
  let host;
  try {
    host = new URL(u).hostname;
  } catch {
    return null;
  }
  return host.replace(/^www\./i, '').toLowerCase();
}
|
|
40
|
+
|
|
41
|
+
/**
 * Breadth-first, fetch-only crawl starting at `startUrl`. Everything stays in
 * memory; the result is returned, nothing is persisted.
 *
 * @param {string} startUrl
 * @param {object} [opts]
 * @param {number} [opts.maxPages=10]    pages to fetch (floored, clamped to 1..HARD_CAP;
 *                                       a non-numeric value falls back to 10)
 * @param {boolean} [opts.sameOrigin=true]  only follow links on the start site
 *                                       (hostname compared without "www.", any protocol);
 *                                       links that redirect off-site are skipped
 * @param {boolean} [opts.includeCitability=false]  run the AEO scorer per page
 * @param {boolean} [opts.respectRobots=true]  honour robots.txt + crawl-delay
 * @param {number} [opts.timeoutMs=10000]   per-request timeout
 * @param {number} [opts.maxDelayMs=3000]   cap on politeness delay between requests
 * @param {(msg:string)=>void} [opts.onProgress]
 * @returns {Promise<object>} { start, origin, pages, skipped, stats }
 * @throws {Error} "Invalid URL: …" when startUrl cannot be parsed
 */
export async function lightCrawl(startUrl, opts = {}) {
  const {
    maxPages = 10,
    sameOrigin = true,
    includeCitability = false,
    respectRobots = true,
    timeoutMs = 10000,
    maxDelayMs = 3000,
    onProgress,
  } = opts;

  // A NaN budget would make `pages.length < budget` false forever and the
  // crawl would silently return nothing — fall back to the default instead.
  const requested = Number(maxPages);
  const budget = Number.isFinite(requested)
    ? Math.max(1, Math.min(Math.floor(requested), HARD_CAP))
    : Math.min(10, HARD_CAP);

  let start;
  try { start = normalizeStart(startUrl); } catch { throw new Error(`Invalid URL: ${startUrl}`); }
  const origin = new URL(start).origin;

  const siteRoot = siteKey(start);
  const queue = [start];
  const queued = new Set([start]);
  const visited = new Set(); // FINAL (post-redirect) URLs actually processed
  const pages = [];
  const skipped = [];
  const t0 = Date.now();

  // Crawl-delay owed before the NEXT request. It is set whenever a request is
  // made, so error / non-HTML / duplicate responses are throttled too, and no
  // time is wasted sleeping after the final request.
  let pendingDelayMs = 0;

  while (queue.length && pages.length < budget) {
    const url = queue.shift();

    let crawlDelayMs = 0;
    if (respectRobots) {
      let robot;
      try { robot = await checkRobots(url); } catch { robot = { allowed: true, crawlDelayMs: 0 }; }
      if (!robot.allowed) { skipped.push({ url, reason: 'robots_disallow' }); continue; }
      crawlDelayMs = robot.crawlDelayMs || 0;
    }

    if (pendingDelayMs) await sleep(pendingDelayMs);
    pendingDelayMs = Math.min(crawlDelayMs, maxDelayMs);

    let res, finalUrl = url, status = 0, html = '';
    try {
      // NOTE(review): `timeout` is a node-fetch v2 option — confirm the installed
      // major version still honours it (v3 expects an AbortSignal instead).
      res = await fetch(url, { timeout: timeoutMs, redirect: 'follow', headers: { 'User-Agent': DEFAULT_UA, Accept: 'text/html,application/xhtml+xml' } });
      status = res.status;
      finalUrl = res.url || url;

      // A followed link that redirects to another site is not part of this
      // site. The start URL is exempt so a start-page redirect is still read.
      if (sameOrigin && url !== start && siteKey(finalUrl) !== siteRoot) {
        skipped.push({ url, reason: 'offsite_redirect', status, final_url: finalUrl });
        continue;
      }

      const ct = (res.headers.get('content-type') || '').toLowerCase();
      if (res.ok && ct.includes('html')) {
        html = await res.text();
      } else {
        skipped.push({ url, reason: res.ok ? `non_html (${ct || 'unknown'})` : `http_${status}`, status });
        continue;
      }
    } catch (e) {
      skipped.push({ url, reason: `fetch_error: ${e.message}` });
      continue;
    }

    // Dedupe on the FINAL url — a redirect may collapse onto a page we already
    // crawled (e.g. non-www start → www, then the page's own www self-link).
    if (visited.has(finalUrl)) continue;
    visited.add(finalUrl);
    queued.add(finalUrl);

    const data = extractPageData(html, finalUrl);
    data.status_code = status;

    if (includeCitability) {
      try {
        const cite = scorePage(
          { url: data.url, title: data.title, body_text: data.body_text, word_count: data.word_count, published_date: data.published_date, modified_date: data.modified_date },
          data.headings, [], data.schema_types, [], null
        );
        data.citability = { score: cite.score, tier: cite.tier, breakdown: cite.breakdown, ai_intents: cite.aiIntents };
      } catch (e) {
        // Scoring is optional; record the failure on the page instead of aborting.
        data.citability = { error: e.message };
      }
    }

    pages.push(data);
    if (onProgress) onProgress(`[${pages.length}/${budget}] ${finalUrl} (${status}, ${data.word_count}w)`);

    // Enqueue internal links for BFS
    if (pages.length < budget) {
      for (const link of data.links) {
        if (!link.href || !/^https?:/i.test(link.href)) continue;
        if (queued.has(link.href)) continue;
        if (sameOrigin && siteKey(link.href) !== siteRoot) continue;
        // skip obvious non-page assets
        if (/\.(png|jpe?g|gif|svg|webp|ico|css|js|pdf|zip|mp4|woff2?|ttf)(\?|$)/i.test(link.href)) continue;
        queued.add(link.href);
        queue.push(link.href);
      }
    }
  }

  const indexable = pages.filter(p => p.is_indexable).length;
  const withSchema = pages.filter(p => p.schema_types.length).length;
  const missingTitle = pages.filter(p => !p.title).length;
  const missingMeta = pages.filter(p => !p.meta_desc).length;

  return {
    start,
    origin,
    pages,
    skipped,
    stats: {
      crawled: pages.length,
      skipped: skipped.length,
      queued_unvisited: Math.max(0, queue.length),
      indexable,
      with_schema: withSchema,
      missing_title: missingTitle,
      missing_meta_desc: missingMeta,
      elapsed_ms: Date.now() - t0,
    },
  };
}
|
package/db/db.js
CHANGED
|
@@ -328,6 +328,72 @@ export function updateInsightStatus(db, id, status) {
|
|
|
328
328
|
db.prepare('UPDATE insights SET status = ? WHERE id = ?').run(status, id);
|
|
329
329
|
}
|
|
330
330
|
|
|
331
|
+
// ── Agentic loop write-back (F1, v1.5.42) ───────────────────────────────────
|
|
332
|
+
//
|
|
333
|
+
// Closes the loop's memory gap: when a draft is actually produced, the Ledger
|
|
334
|
+
// should remember it. Two moves:
|
|
335
|
+
// 1. recordDraftCreated — persist a `draft_created` insight (idempotent per
|
|
336
|
+
// topic/type/lang) so "I drafted X" is durable and visible.
|
|
337
|
+
// 2. markGapsInProgress — flip matching ACTIVE gap insights to 'in_progress'
|
|
338
|
+
// so the same gap stops resurfacing in the next blog-draft pass.
|
|
339
|
+
// Both are best-effort and must never break draft generation.
|
|
340
|
+
|
|
341
|
+
/**
 * Record that a draft was created targeting this project's Ledger.
 * Idempotent: re-drafting the same (topic, content_type, lang) refreshes the
 * existing `draft_created` insight (last_seen + data) instead of adding a row.
 *
 * The fingerprint is lower-cased as a whole and keeps Unicode letters, marks
 * and digits. An ASCII-only filter would collapse every non-Latin topic to "-"
 * and make unrelated drafts overwrite each other. Fingerprints for lower-case
 * ASCII inputs are unchanged.
 *
 * @param {object} db       database handle exposing prepare(sql).run(...)
 * @param {string} project  project identifier
 * @param {object} [draft]
 * @param {string} [draft.topic]             draft topic; falsy means "auto"
 * @param {number|null} [draft.score=null]
 * @param {string|null} [draft.tier=null]
 * @param {number|null} [draft.wordCount=null]
 * @param {string} [draft.lang='en']
 * @param {string} [draft.contentType='blog']
 * @param {string|null} [draft.savedPath=null]
 * @returns {string} the fingerprint used
 */
export function recordDraftCreated(db, project, { topic, score = null, tier = null, wordCount = null, lang = 'en', contentType = 'blog', savedPath = null } = {}) {
  const ts = Date.now();
  // String() keeps a non-string topic from throwing on toLowerCase().
  const normTopic = String(topic || 'auto').toLowerCase().trim().slice(0, 120);
  const fp = `draft:${contentType}:${lang}:${normTopic}`
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}:_-]+/gu, '-');
  const data = JSON.stringify({
    topic: topic || '(auto)', score, tier, word_count: wordCount,
    lang, content_type: contentType, saved_path: savedPath, created_at: ts,
  });
  db.prepare(`
    INSERT INTO insights (project, type, status, fingerprint, first_seen, last_seen, source_analysis_id, data)
    VALUES (?, 'draft_created', 'active', ?, ?, ?, NULL, ?)
    ON CONFLICT(project, type, fingerprint) DO UPDATE SET
      last_seen = excluded.last_seen,
      data = excluded.data
  `).run(project, fp, ts, ts, data);
  return fp;
}
|
|
363
|
+
|
|
364
|
+
/**
 * Move ACTIVE gap insights that match `topic` to 'in_progress', so work that
 * has already been drafted stops being re-suggested.
 *
 * A row matches when its key term (first non-empty of keyword, phrase, topic,
 * suggested_title, title, url) contains the topic or is contained in it, or
 * when the topic occurs anywhere in those fields joined together. Comparison
 * is case-insensitive. Rows whose `data` cannot be parsed or read are skipped.
 * Only the gap types listed in GAP_TYPES are touched.
 *
 * NOTE(review): because containment is checked in both directions, a very
 * short key term (e.g. "ai") matches any topic containing those letters —
 * confirm that is acceptable.
 *
 * @param {object} db       database handle exposing prepare(sql).all/run
 * @param {string} project  project identifier
 * @param {string} topic    drafted topic; blank returns 0 without querying
 * @returns {number} count of insights marked
 */
export function markGapsInProgress(db, project, topic) {
  if (!topic || !topic.trim()) return 0;
  const needle = topic.toLowerCase().trim();

  const GAP_TYPES = ['keyword_gap', 'long_tail', 'content_gap', 'citability_gap', 'keyword_inventor'];
  const placeholders = GAP_TYPES.map(() => '?').join(',');
  const candidates = db.prepare(
    `SELECT id, data FROM insights WHERE project = ? AND status = 'active' AND type IN (${placeholders})`
  ).all(project, ...GAP_TYPES);

  const markStmt = db.prepare(`UPDATE insights SET status = 'in_progress', last_seen = ? WHERE id = ?`);
  const ts = Date.now();

  // True when the row's JSON payload refers to the drafted topic.
  const refersToTopic = (rawData) => {
    let keyTerm;
    let fullText;
    try {
      const d = JSON.parse(rawData);
      keyTerm = (d.keyword || d.phrase || d.topic || d.suggested_title || d.title || d.url || '').toLowerCase().trim();
      fullText = [d.keyword, d.phrase, d.topic, d.suggested_title, d.title, d.url]
        .filter(Boolean).join(' ').toLowerCase();
    } catch {
      return false;
    }
    if (keyTerm && (needle.includes(keyTerm) || keyTerm.includes(needle))) return true;
    return Boolean(fullText && fullText.includes(needle));
  };

  let marked = 0;
  for (const row of candidates) {
    if (!refersToTopic(row.data)) continue;
    markStmt.run(ts, row.id);
    marked += 1;
  }
  return marked;
}
|
|
396
|
+
|
|
331
397
|
export function upsertDomain(db, { domain, project, role }) {
|
|
332
398
|
const now = Date.now();
|
|
333
399
|
return db.prepare(`
|
package/lib/cron.js
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/cron.js — Install / remove the daily `seo-intel notify` cron entry.
|
|
3
|
+
*
|
|
4
|
+
* The "user forgets to check SEO" defense from v1.5.34's delivery brainstorm.
|
|
5
|
+
* Adds a single managed crontab line tagged with a marker comment so we can
|
|
6
|
+
* find and replace/remove our own entry without touching the user's other
|
|
7
|
+
* cron jobs.
|
|
8
|
+
*
|
|
9
|
+
* macOS + Linux: uses crontab(1). On macOS the first install will prompt the
|
|
10
|
+
* user to approve calendar/automation access via the system permission dialog
|
|
11
|
+
* — that's normal, nothing we can do about it.
|
|
12
|
+
*
|
|
13
|
+
* Windows: returns ok:false with a hint pointing at Task Scheduler. Out of
|
|
14
|
+
* scope for v1.5.40.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { spawnSync } from 'child_process';
|
|
18
|
+
import { fileURLToPath } from 'url';
|
|
19
|
+
import { dirname, join } from 'path';
|
|
20
|
+
|
|
21
|
+
// Directory of this module (ES modules have no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parent directory of lib/ — where cli.js lives.
const ROOT = join(__dirname, '..');
// Absolute path of the Node binary currently running this process.
const NODE_BIN = process.execPath;
// Trailing comment that tags the one crontab line this module owns.
const MARKER = '# managed-by-seo-intel';

export const DEFAULT_SCHEDULE = '0 9 * * *'; // 9am every day
|
|
27
|
+
|
|
28
|
+
/**
 * Read the current user's crontab via `crontab -l`.
 *
 * @returns {string} crontab text, or '' when the command exits non-zero
 *   (typically "no crontab for user yet").
 *
 * NOTE(review): every failure — including a missing `crontab` binary or a
 * permission error — is also reported as ''. Callers that write the crontab
 * back would then drop the user's existing entries; confirm this is acceptable.
 */
function readCrontab() {
  const { status, stdout } = spawnSync('crontab', ['-l'], { encoding: 'utf8' });
  return status === 0 ? (stdout || '') : '';
}
|
|
34
|
+
|
|
35
|
+
/**
 * Replace the current user's crontab by piping `content` to `crontab -`.
 *
 * @param {string} content full crontab text; normalised to end with exactly one newline
 * @throws {Error} when `crontab` cannot be started (the spawn error is
 *   reported, e.g. ENOENT) or exits unsuccessfully (its stderr is reported)
 */
function writeCrontab(content) {
  const text = (content || '').replace(/\n*$/, '\n'); // ensure single trailing newline
  const r = spawnSync('crontab', ['-'], { input: text, encoding: 'utf8' });
  // When the process never started, status and stderr are null and the real
  // cause lives on r.error — surface it instead of "unknown error".
  if (r.error) {
    throw new Error(`crontab write failed: ${r.error.message}`);
  }
  if (r.status !== 0) {
    const reason = (r.stderr || '').trim() || (r.signal ? `terminated by ${r.signal}` : 'unknown error');
    throw new Error(`crontab write failed: ${reason}`);
  }
}
|
|
42
|
+
|
|
43
|
+
// True when running on Windows, where crontab(1) is unavailable.
function isWindows() {
  const { platform } = process;
  return platform === 'win32';
}
|
|
44
|
+
|
|
45
|
+
/**
 * Report whether the managed `seo-intel notify` line is present in the crontab.
 * On Windows nothing is read and `installed` is always false.
 *
 * @returns {{ installed: boolean, line: string|null, schedule: string|null, platform: string }}
 *   `schedule` is the first five whitespace-separated fields of the managed line.
 */
export function getNotifyCronStatus() {
  if (isWindows()) return { installed: false, line: null, schedule: null, platform: 'win32' };

  const managed = readCrontab()
    .split('\n')
    .find((entry) => entry.includes(MARKER));
  if (managed === undefined) {
    return { installed: false, line: null, schedule: null, platform: process.platform };
  }

  // Schedule is the first 5 space-separated fields
  const fields = managed.trim().split(/\s+/);
  return {
    installed: true,
    line: managed,
    schedule: fields.slice(0, 5).join(' '),
    platform: process.platform,
  };
}
|
|
58
|
+
|
|
59
|
+
/**
 * Install (or replace) the managed cron line.
 *
 * The install directory and the Node binary path are single-quoted for the
 * shell, and any "%" is backslash-escaped because cron otherwise treats it as
 * a newline. This keeps the job working when a path contains spaces.
 *
 * NOTE(review): readCrontab() returns '' on any failure, so a failed read
 * followed by a successful write would replace the user's other entries.
 *
 * @param {object} [opts]
 * @param {string} [opts.schedule]     Cron schedule, default DEFAULT_SCHEDULE (9am daily)
 * @param {boolean} [opts.openOnFire]  Append `--open` flag so the dashboard opens when fired
 * @returns {{ ok: boolean, line?: string, schedule?: string, error?: string, hint?: string }}
 */
export function installNotifyCron({ schedule = DEFAULT_SCHEDULE, openOnFire = false } = {}) {
  if (isWindows()) {
    return {
      ok: false,
      error: 'Windows not supported — use Task Scheduler manually',
      hint: `Create a daily task running: ${NODE_BIN} "${join(ROOT, 'cli.js')}" notify`,
    };
  }
  // Sanity-check schedule (5 fields, no shell metachars)
  if (!/^[-*\/0-9, ]+$/.test(schedule) || schedule.trim().split(/\s+/).length !== 5) {
    return { ok: false, error: `Invalid cron schedule "${schedule}". Expected 5 fields, e.g. "0 9 * * *".` };
  }
  // POSIX single-quoting ('…' with embedded ' written as '\''), then escape
  // "%" for cron itself.
  const cronQuote = (p) => `'${String(p).replace(/'/g, "'\\''")}'`.replace(/%/g, '\\%');
  const cmd = `cd ${cronQuote(ROOT)} && ${cronQuote(NODE_BIN)} cli.js notify${openOnFire ? ' --open' : ''}`;
  const newLine = `${schedule} ${cmd} ${MARKER}`;
  const current = readCrontab();
  // Keep every line that is not ours; our previous entry (if any) is replaced.
  const kept = current.split('\n').filter(l => l && !l.includes(MARKER));
  kept.push(newLine);
  try {
    writeCrontab(kept.join('\n'));
    return { ok: true, line: newLine, schedule };
  } catch (e) {
    return { ok: false, error: e.message };
  }
}
|
|
91
|
+
|
|
92
|
+
/**
 * Remove the managed cron line (if any). Idempotent: when no managed line is
 * present the crontab is left untouched. On Windows this is a no-op.
 *
 * @returns {{ ok: boolean, removed: boolean, error?: string }}
 */
export function removeNotifyCron() {
  if (isWindows()) return { ok: true, removed: false }; // nothing to remove

  const entries = readCrontab().split('\n').filter(Boolean);
  const survivors = entries.filter((entry) => !entry.includes(MARKER));
  if (survivors.length === entries.length) {
    return { ok: true, removed: false };
  }

  try {
    writeCrontab(survivors.join('\n'));
  } catch (e) {
    return { ok: false, removed: false, error: e.message };
  }
  return { ok: true, removed: true };
}
|
package/lib/gate.js
CHANGED
|
@@ -65,6 +65,36 @@ const FEATURE_NAMES = {
|
|
|
65
65
|
'intel-competitor': 'Intel Competitor Digest (AI-agent-ready)',
|
|
66
66
|
};
|
|
67
67
|
|
|
68
|
+
// ── Free features (v1.5.41 monetization line) ───────────────────────────────
|
|
69
|
+
//
|
|
70
|
+
// Analysis of YOUR OWN site is free: a capable agent commoditizes one-shot
|
|
71
|
+
// analysis anyway, so we give it away to make the free tier a genuine
|
|
72
|
+
// "complete SEO brain" — local, private, zero flagship tokens on grunt work.
|
|
73
|
+
//
|
|
74
|
+
// The paywall sits on what an agent structurally CANNOT do for itself:
|
|
75
|
+
// • Competitors — analyze, shallow, decay, headings-audit, entities,
|
|
76
|
+
// friction, competitive, gap-intel, intel-competitor
|
|
77
|
+
// • Automation — run (scheduler)
|
|
78
|
+
// • History — brief ("what changed"), velocity (publish-rate over time)
|
|
79
|
+
//
|
|
80
|
+
// Anything listed here passes requirePro() regardless of license tier.
|
|
81
|
+
// Feature keys that are free on every license tier (own-site analysis).
const FREE_FEATURES = new Set([
  'extract',      // local Ollama labor — the grunt work that powers the free brain
  'aeo',          // AI citability scoring (pure function, own site)
  'keywords',     // keyword intelligence (own site)
  'templates',    // programmatic template detection (own site)
  'orphans',      // orphan entity detection (own site)
  'js-delta',     // JS rendering delta (own site, technical)
  'blog-draft',   // AEO blog draft from the Ledger (own site, content)
  'html',         // HTML dashboard
  'html-all',     // HTML dashboard (all projects)
  'gsc-insights', // Search Console intelligence (own site)
  'intel-audit',  // agent-ready audit digest (own site)
  'intel-blog',   // agent-ready blog digest (own site)
  'scan',         // one-shot single-domain audit — the first-touch "try it" command
  'loop',         // content-loop orchestrator (own-site gaps → draft → queue)
]);
|
|
97
|
+
|
|
68
98
|
// ── CLI Gate — blocks command and shows upgrade message ──────────────────────
|
|
69
99
|
|
|
70
100
|
/**
|
|
@@ -75,6 +105,7 @@ const FEATURE_NAMES = {
|
|
|
75
105
|
* @returns {boolean}
|
|
76
106
|
*/
|
|
77
107
|
export function requirePro(feature) {
|
|
108
|
+
if (FREE_FEATURES.has(feature)) return true;
|
|
78
109
|
if (isPro()) return true;
|
|
79
110
|
|
|
80
111
|
const displayName = FEATURE_NAMES[feature] || feature;
|
|
@@ -178,7 +209,8 @@ export function printLicenseStatus() {
|
|
|
178
209
|
if (license.stale) console.log(`\x1b[33m ⚠ License cache stale — will re-validate on next network access${RESET}`);
|
|
179
210
|
} else {
|
|
180
211
|
console.log(`${DIM} SEO Intel Free${RESET}`);
|
|
181
|
-
console.log(`${DIM} Unlimited crawl ·
|
|
212
|
+
console.log(`${DIM} Unlimited crawl · Full site analysis · Dashboard · Local & private${RESET}`);
|
|
213
|
+
console.log(`${DIM} Solo adds: competitors · scheduled crawls · history & trends${RESET}`);
|
|
182
214
|
if (license.invalidKey) {
|
|
183
215
|
console.log(`\x1b[33m ⚠ ${license.reason}${RESET}`);
|
|
184
216
|
}
|
package/lib/intel.js
CHANGED
|
@@ -8,10 +8,14 @@
|
|
|
8
8
|
*
|
|
9
9
|
* Slices:
|
|
10
10
|
* raw (free) — page/keyword/heading inventory, no analysis
|
|
11
|
-
* audit (
|
|
12
|
-
* blog (
|
|
11
|
+
* audit (free) — citability + technical + active insights (your own site)
|
|
12
|
+
* blog (free) — gaps + tone hints for drafting (your own site)
|
|
13
13
|
* competitor (paid) — competitor summary + schema landscape
|
|
14
14
|
*
|
|
15
|
+
* Monetization line (v1.5.41): analysis of YOUR OWN site is free — a smart
|
|
16
|
+
* agent commoditizes one-shot analysis anyway. The paywall sits on the things
|
|
17
|
+
* an agent structurally can't do for itself: competitors, automation, history.
|
|
18
|
+
*
|
|
15
19
|
* Output is a stable structured object — agents should be able to chain calls
|
|
16
20
|
* without prompt gymnastics. Keep the schema additive across versions.
|
|
17
21
|
*/
|
|
@@ -26,7 +30,9 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
|
26
30
|
const VERSION = JSON.parse(readFileSync(join(__dirname, '..', 'package.json'), 'utf8')).version;
|
|
27
31
|
|
|
28
32
|
export const INTEL_SLICES = ['raw', 'audit', 'blog', 'competitor'];
|
|
29
|
-
|
|
33
|
+
// Slices available without a paid license: everything about the user's own
// site. Only the competitor slice (data the agent can't gather on its own)
// requires Solo.
export const FREE_SLICES = ['raw', 'audit', 'blog'];
|
|
30
36
|
|
|
31
37
|
/**
|
|
32
38
|
* @param {import('node:sqlite').DatabaseSync} db
|