seo-intel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +41 -0
- package/LICENSE +75 -0
- package/README.md +243 -0
- package/Start SEO Intel.bat +9 -0
- package/Start SEO Intel.command +8 -0
- package/cli.js +3727 -0
- package/config/example.json +29 -0
- package/config/setup-wizard.js +522 -0
- package/crawler/index.js +566 -0
- package/crawler/robots.js +103 -0
- package/crawler/sanitize.js +124 -0
- package/crawler/schema-parser.js +168 -0
- package/crawler/sitemap.js +103 -0
- package/crawler/stealth.js +393 -0
- package/crawler/subdomain-discovery.js +341 -0
- package/db/db.js +213 -0
- package/db/schema.sql +120 -0
- package/exports/competitive.js +186 -0
- package/exports/heuristics.js +67 -0
- package/exports/queries.js +197 -0
- package/exports/suggestive.js +230 -0
- package/exports/technical.js +180 -0
- package/exports/templates.js +77 -0
- package/lib/gate.js +204 -0
- package/lib/license.js +369 -0
- package/lib/oauth.js +432 -0
- package/lib/updater.js +324 -0
- package/package.json +68 -0
- package/reports/generate-html.js +6194 -0
- package/reports/generate-site-graph.js +949 -0
- package/reports/gsc-loader.js +190 -0
- package/scheduler.js +142 -0
- package/seo-audit.js +619 -0
- package/seo-intel.png +0 -0
- package/server.js +602 -0
- package/setup/ROADMAP.md +109 -0
- package/setup/checks.js +483 -0
- package/setup/config-builder.js +227 -0
- package/setup/engine.js +65 -0
- package/setup/installers.js +197 -0
- package/setup/models.js +328 -0
- package/setup/openclaw-bridge.js +329 -0
- package/setup/validator.js +395 -0
- package/setup/web-routes.js +688 -0
- package/setup/wizard.html +2920 -0
- package/start-seo-intel.sh +8 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
/**
 * SEO Intel — Subdomain Discovery
 *
 * Finds subdomains for a root domain using multiple passive + active techniques.
 * No bruteforce — uses public data sources + crawl data + DNS checks.
 *
 * Methods (in order of speed/reliability):
 *   1. Certificate Transparency logs (crt.sh) — free, fast, comprehensive
 *   2. Crawl data mining — check links already in our DB for subdomains
 *   3. Common subdomain probe — check well-known subdomains (docs, api, app, etc.)
 *   4. DNS verification — confirm discovered subdomains actually resolve
 *
 * Usage:
 *   import { discoverSubdomains } from './subdomain-discovery.js';
 *   const results = await discoverSubdomains('example.com', { db });
 */

import { resolve as dnsResolve } from 'dns';
import { promisify } from 'util';
import { fetchSitemap } from './sitemap.js';

// Promisified dns.resolve — resolves a hostname to its records, rejects on NXDOMAIN.
const resolveDns = promisify(dnsResolve);

// Common subdomains to probe (prioritized by SEO relevance)
const COMMON_SUBDOMAINS = [
  'www', 'docs', 'blog', 'app', 'api', 'dl', 'cdn',
  'rpc', 'status', 'dashboard', 'portal', 'help', 'support',
  'dev', 'staging', 'beta', 'shop', 'store', 'mail',
  'admin', 'auth', 'accounts', 'community', 'forum',
  'learn', 'academy', 'wiki', 'kb', 'changelog',
];
|
|
32
|
+
|
|
33
|
+
// ── Certificate Transparency (crt.sh) ─────────────────────────────────────

/**
 * Query crt.sh for all subdomains seen in SSL certificates.
 * This is the most comprehensive passive method — catches subdomains
 * that were ever issued a cert, even if they're no longer active.
 *
 * @param {string} rootDomain - e.g. "example.com"
 * @returns {Promise<string[]>} unique hostnames under rootDomain; [] on any failure (best-effort)
 */
async function queryCrtSh(rootDomain) {
  const controller = new AbortController();
  // crt.sh can be very slow under load — cap the request at 15s.
  const timeout = setTimeout(() => controller.abort(), 15000);

  try {
    // %25 is a URL-encoded '%' — crt.sh wildcard matching any subdomain label.
    const url = `https://crt.sh/?q=%25.${encodeURIComponent(rootDomain)}&output=json`;
    const res = await fetch(url, {
      signal: controller.signal,
      headers: { 'Accept': 'application/json' },
    });

    if (!res.ok) return [];

    const data = await res.json();
    // crt.sh occasionally returns a non-array payload on errors; treat that as
    // "no results" explicitly instead of relying on the catch-all below.
    if (!Array.isArray(data)) return [];

    const subdomains = new Set();

    for (const entry of data) {
      const name = (entry.name_value || '').toLowerCase();
      // crt.sh returns wildcard and multi-line entries
      for (const line of name.split('\n')) {
        const cleaned = line.trim().replace(/^\*\./, '');
        if (cleaned.endsWith('.' + rootDomain) || cleaned === rootDomain) {
          subdomains.add(cleaned);
        }
      }
    }

    return [...subdomains];
  } catch {
    // Network error / abort / invalid JSON — this source is best-effort.
    return [];
  } finally {
    clearTimeout(timeout);
  }
}
|
|
74
|
+
|
|
75
|
+
// ── Crawl Data Mining ──────────────────────────────────────────────────────

/** Add every hostname under rootDomain found in `urls` to the `subdomains` set. */
function collectHostnames(rootDomain, urls, subdomains) {
  for (const raw of urls) {
    try {
      const { hostname } = new URL(raw);
      if (hostname.endsWith('.' + rootDomain) || hostname === rootDomain) {
        subdomains.add(hostname);
      }
    } catch { /* skip invalid URLs */ }
  }
}

/**
 * Scan existing crawl data for links pointing to subdomains.
 * Free — uses data we already have.
 *
 * @param {string} rootDomain - e.g. "example.com"
 * @param {object} db - SQLite handle with prepare(...).all(...)
 * @returns {string[]} unique hostnames; [] when db is absent or the query fails
 */
function mineFromCrawlData(rootDomain, db) {
  if (!db) return [];

  try {
    const subdomains = new Set();
    // Bind the LIKE pattern as a parameter — never interpolate the domain into
    // the SQL text (a quote in the value would break or inject the query).
    const pattern = `%${rootDomain}%`;

    // Check all URLs we've seen in links table
    const links = db.prepare(`
      SELECT DISTINCT target_url FROM links
      WHERE target_url LIKE ?
    `).all(pattern);
    collectHostnames(rootDomain, links.map(r => r.target_url), subdomains);

    // Also check page URLs
    const pages = db.prepare(`
      SELECT DISTINCT url FROM pages
      WHERE url LIKE ?
    `).all(pattern);
    collectHostnames(rootDomain, pages.map(r => r.url), subdomains);

    return [...subdomains];
  } catch {
    // Missing tables / schema drift — this source is best-effort.
    return [];
  }
}
|
|
121
|
+
|
|
122
|
+
// ── Common Subdomain Probe ─────────────────────────────────────────────────

/**
 * Probe well-known subdomains via DNS lookup.
 * Fast — just DNS queries, no HTTP requests.
 *
 * @param {string} rootDomain
 * @returns {Promise<string[]>} candidate hostnames that resolved
 */
async function probeCommonSubdomains(rootDomain) {
  const resolved = [];

  await Promise.all(
    COMMON_SUBDOMAINS.map(async (label) => {
      const candidate = `${label}.${rootDomain}`;
      try {
        await resolveDns(candidate);
        resolved.push(candidate);
      } catch {
        // NXDOMAIN — doesn't exist
      }
    }),
  );

  return resolved;
}
|
|
144
|
+
|
|
145
|
+
// ── DNS Verification ───────────────────────────────────────────────────────

/**
 * Verify a list of hostnames actually resolve via DNS.
 * Filters out expired/dead subdomains from crt.sh results.
 *
 * @param {string[]} hostnames
 * @returns {Promise<Array<{hostname: string, ip: string}>>} live hosts with their first DNS record
 */
async function verifyDns(hostnames) {
  const alive = [];

  await Promise.all(
    hostnames.map(async (hostname) => {
      try {
        const records = await resolveDns(hostname);
        if (records && records.length > 0) {
          alive.push({ hostname, ip: records[0] });
        }
      } catch {
        // Dead subdomain — skip
      }
    }),
  );

  return alive;
}
|
|
168
|
+
|
|
169
|
+
// ── HTTP Liveness Check ────────────────────────────────────────────────────

/**
 * Quick HTTP check to see if a subdomain serves content.
 * Returns status code and basic page info.
 *
 * @param {string} hostname
 * @returns {Promise<object>} on success: { hostname, status, finalUrl, isHtml,
 *   title, redirected, redirectTarget }; on failure: { hostname, status: 0,
 *   error, isHtml: false, title: null }. Never throws.
 */
async function checkHttp(hostname) {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 8000);

  try {
    const res = await fetch(`https://${hostname}`, {
      signal: controller.signal,
      redirect: 'follow',
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://froggo.pro/seo-intel/bot)',
      },
    });

    const finalUrl = res.url;
    const status = res.status;
    const contentType = res.headers.get('content-type') || '';
    const isHtml = contentType.includes('text/html');

    // Read just enough to check if it's a real page
    let title = null;
    if (isHtml) {
      const text = await res.text();
      const titleMatch = text.match(/<title[^>]*>([^<]+)</i);
      title = titleMatch ? titleMatch[1].trim() : null;
    } else {
      // Discard the unread body so the underlying connection can be released
      // (unread fetch bodies keep the socket/stream alive in Node's undici).
      try { await res.body?.cancel(); } catch { /* best-effort */ }
    }

    // Parse the final URL once; a hostname change means we were redirected
    // off this subdomain (e.g. docs.example.com -> www.example.com).
    const finalHost = new URL(finalUrl).hostname;
    const redirected = finalHost !== hostname;

    return {
      hostname,
      status,
      finalUrl,
      isHtml,
      title,
      redirected,
      redirectTarget: redirected ? finalHost : null,
    };
  } catch (err) {
    return {
      hostname,
      status: 0,
      error: err.code || err.message || 'unknown',
      isHtml: false,
      title: null,
    };
  } finally {
    clearTimeout(timeout);
  }
}
|
|
222
|
+
|
|
223
|
+
// ── Main Discovery Function ────────────────────────────────────────────────

/**
 * Discover all subdomains for a root domain.
 *
 * Pipeline: crt.sh → crawl-data mining → common-subdomain DNS probe →
 * DNS verification → (optional) HTTP liveness → sitemap page counts.
 * Only DNS-verified hostnames appear in the final result.
 *
 * @param {string} rootDomain - e.g. "example.com"
 * @param {object} opts
 * @param {object} [opts.db] - SQLite database (for crawl data mining)
 * @param {boolean} [opts.httpCheck=true] - also check HTTP liveness
 * @param {function} [opts.onProgress] - callback({ phase, found, total })
 * @returns {Promise<SubdomainResult>}
 */
export async function discoverSubdomains(rootDomain, opts = {}) {
  const { db, httpCheck = true, onProgress } = opts;

  const allFound = new Set();
  const sources = {};   // per-phase discovery counts for the final report

  // Phase 1: Certificate Transparency
  if (onProgress) onProgress({ phase: 'crt.sh', message: 'Checking certificate transparency logs...' });
  const crtResults = await queryCrtSh(rootDomain);
  for (const d of crtResults) allFound.add(d);
  sources['crt.sh'] = crtResults.length;

  // Phase 2: Crawl data mining
  if (db) {
    if (onProgress) onProgress({ phase: 'crawl-data', message: 'Mining existing crawl data...' });
    const crawlResults = mineFromCrawlData(rootDomain, db);
    for (const d of crawlResults) allFound.add(d);
    sources['crawl-data'] = crawlResults.length;
  }

  // Phase 3: Common subdomain probe
  if (onProgress) onProgress({ phase: 'dns-probe', message: 'Probing common subdomains...' });
  const probeResults = await probeCommonSubdomains(rootDomain);
  for (const d of probeResults) allFound.add(d);
  sources['dns-probe'] = probeResults.length;

  // Phase 4: DNS verification (filter dead ones from crt.sh)
  if (onProgress) onProgress({ phase: 'dns-verify', message: `Verifying ${allFound.size} subdomains via DNS...` });
  const verified = await verifyDns([...allFound]);
  const liveHostnames = new Set(verified.map(v => v.hostname));

  // Phase 5: HTTP liveness check (optional)
  let httpResults = [];
  if (httpCheck) {
    if (onProgress) onProgress({ phase: 'http-check', message: `Checking HTTP on ${liveHostnames.size} live subdomains...` });

    // Check in batches of 5 to not overwhelm
    const liveList = [...liveHostnames];
    for (let i = 0; i < liveList.length; i += 5) {
      const batch = liveList.slice(i, i + 5);
      const results = await Promise.all(batch.map(h => checkHttp(h)));
      httpResults.push(...results);
    }
  }

  // Phase 6: Sitemap check — get page counts for SEO-relevant subdomains
  const sitemapResults = new Map();
  // "SEO-relevant" = serves its own 200 HTML page (not a redirect elsewhere).
  const seoLive = httpResults.filter(r => r.isHtml && r.status === 200 && !r.redirected);

  if (seoLive.length > 0) {
    if (onProgress) onProgress({ phase: 'sitemaps', message: `Checking sitemaps on ${seoLive.length} live subdomains...` });

    // Check sitemaps in batches of 3
    for (let i = 0; i < seoLive.length; i += 3) {
      const batch = seoLive.slice(i, i + 3);
      const results = await Promise.all(batch.map(async (r) => {
        try {
          const urls = await fetchSitemap(`https://${r.hostname}`);
          return { hostname: r.hostname, urls };
        } catch {
          return { hostname: r.hostname, urls: [] };
        }
      }));
      for (const r of results) sitemapResults.set(r.hostname, r.urls);
    }
    // Note: unlike the other sources, this counts sitemap URLs, not subdomains.
    sources['sitemaps'] = [...sitemapResults.values()].reduce((sum, urls) => sum + urls.length, 0);
  }

  // Build final result — one entry per DNS-verified hostname, sorted.
  const subdomains = [...liveHostnames].sort().map(hostname => {
    const http = httpResults.find(r => r.hostname === hostname) || {};
    const dns = verified.find(v => v.hostname === hostname) || {};
    const isRoot = hostname === rootDomain;
    const sub = isRoot ? '(root)' : hostname.replace('.' + rootDomain, '');
    const sitemap = sitemapResults.get(hostname) || [];

    return {
      hostname,
      subdomain: sub,
      isRoot,
      ip: dns.ip || null,
      httpStatus: http.status || null,
      title: http.title || null,
      isHtml: http.isHtml || false,
      redirected: http.redirected || false,
      redirectTarget: http.redirectTarget || null,
      error: http.error || null,
      sitemapUrls: sitemap.length,
      // sitemap entries may be objects ({ url }) or plain strings — handle both.
      sitemapSample: sitemap.slice(0, 5).map(u => u.url || u),
      // SEO relevance score
      seoRelevant: http.isHtml && http.status === 200 && !http.redirected,
    };
  });

  // Total sitemap URLs across all subdomains
  const totalSitemapUrls = subdomains.reduce((sum, s) => sum + s.sitemapUrls, 0);

  return {
    rootDomain,
    discovered: subdomains.length,
    live: subdomains.filter(s => s.httpStatus === 200).length,
    seoRelevant: subdomains.filter(s => s.seoRelevant).length,
    totalSitemapUrls,
    sources,
    subdomains,
  };
}
|
package/db/db.js
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import { DatabaseSync } from 'node:sqlite';
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';

// Directory of this module — used to locate schema.sql next to db.js.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Module-level singleton database handle, lazily created by getDb().
let _db = null;
|
|
9
|
+
|
|
10
|
+
/**
 * Open (or return the cached) SQLite database and ensure the schema exists.
 *
 * NOTE(review): the handle is a module-level singleton — after the first
 * call, the dbPath argument of subsequent calls is silently ignored. Confirm
 * callers never need two different database files in one process.
 *
 * @param {string} [dbPath='./seo-intel.db'] - path to the database file
 * @returns {DatabaseSync} shared database handle
 */
export function getDb(dbPath = './seo-intel.db') {
  if (_db) return _db;

  _db = new DatabaseSync(dbPath);

  // Pragmas: WAL for concurrent readers, generous lock wait, enforce FKs.
  const pragmas = [
    'PRAGMA journal_mode = WAL',
    'PRAGMA busy_timeout = 10000',
    'PRAGMA foreign_keys = ON',
  ];
  for (const pragma of pragmas) {
    _db.exec(pragma);
  }

  // Apply schema (idempotent: CREATE TABLE IF NOT EXISTS throughout).
  _db.exec(readFileSync(join(__dirname, 'schema.sql'), 'utf8'));

  // Migrations for existing databases
  try { _db.exec('ALTER TABLE pages ADD COLUMN content_hash TEXT'); } catch { /* already exists */ }
  try { _db.exec('ALTER TABLE pages ADD COLUMN first_seen_at INTEGER'); } catch { /* already exists */ }

  // Backfill first_seen_at from crawled_at for existing rows
  _db.exec('UPDATE pages SET first_seen_at = crawled_at WHERE first_seen_at IS NULL');

  // page_schemas table is created by schema.sql — no migration needed (new table)

  return _db;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Insert a domain row, or refresh its project/role/last_crawled on conflict.
 * first_seen is only written by the initial INSERT (not in the UPDATE set),
 * so it keeps the original discovery time.
 *
 * @param {object} db - SQLite handle
 * @param {{domain: string, project: string, role: string}} fields
 * @returns {object} statement run result
 */
export function upsertDomain(db, { domain, project, role }) {
  const timestamp = Date.now();
  const sql = `
    INSERT INTO domains (domain, project, role, first_seen, last_crawled)
    VALUES (?, ?, ?, ?, ?)
    ON CONFLICT(domain) DO UPDATE SET
      project = excluded.project,
      role = excluded.role,
      last_crawled = excluded.last_crawled
  `;
  return db.prepare(sql).run(domain, project, role, timestamp, timestamp);
}
|
|
44
|
+
|
|
45
|
+
/**
 * Insert or refresh a crawled page row, keyed by URL.
 * first_seen_at is intentionally NOT in the ON CONFLICT update set, so it
 * retains the value from the original INSERT (first discovery time).
 *
 * @returns {{id: number}|undefined} the page's id row
 */
export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, isIndexable, clickDepth = 0, publishedDate = null, modifiedDate = null, contentHash = null }) {
  const now = Date.now();
  const params = [
    domainId, url, now, now, statusCode, wordCount, loadMs,
    isIndexable ? 1 : 0, clickDepth, publishedDate, modifiedDate, contentHash,
  ];
  db.prepare(`
    INSERT INTO pages (domain_id, url, crawled_at, first_seen_at, status_code, word_count, load_ms, is_indexable, click_depth, published_date, modified_date, content_hash)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(url) DO UPDATE SET
      crawled_at = excluded.crawled_at,
      status_code = excluded.status_code,
      word_count = excluded.word_count,
      load_ms = excluded.load_ms,
      click_depth = excluded.click_depth,
      published_date = excluded.published_date,
      modified_date = excluded.modified_date,
      content_hash = excluded.content_hash
  `).run(...params);
  // Fetch the row id back (works for both the insert and the update path).
  return db.prepare('SELECT id FROM pages WHERE url = ?').get(url);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Look up the stored content hash for a URL.
 * @returns {string|null} hash from the pages row, or null when unknown
 */
export function getPageHash(db, url) {
  const row = db.prepare('SELECT content_hash FROM pages WHERE url = ?').get(url);
  return row?.content_hash || null;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Store (or replace) the structured extraction for a page.
 * Array-valued fields are serialized into JSON text columns; search_intent
 * defaults to 'Informational' when absent.
 *
 * @returns {object|null} run result, or null when pageId is missing
 */
export function insertExtraction(db, { pageId, data }) {
  if (!pageId) {
    console.warn('[db] insertExtraction skipped: pageId is missing');
    return null;
  }
  const params = [
    pageId, data.title, data.meta_desc, data.h1,
    data.product_type, data.pricing_tier, data.cta_primary,
    JSON.stringify(data.tech_stack || []),
    JSON.stringify(data.schema_types || []),
    data.search_intent || 'Informational',
    JSON.stringify(data.primary_entities || []),
    Date.now(),
  ];
  return db.prepare(`
    INSERT OR REPLACE INTO extractions
    (page_id, title, meta_desc, h1, product_type, pricing_tier, cta_primary,
     tech_stack, schema_types, search_intent, primary_entities, extracted_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `).run(...params);
}
|
|
88
|
+
|
|
89
|
+
/**
 * Bulk-insert keyword rows for a page inside a single transaction.
 * Keywords are lowercased before storage; rolls back and rethrows on failure.
 */
export function insertKeywords(db, pageId, keywords) {
  const insert = db.prepare(`INSERT INTO keywords (page_id, keyword, location) VALUES (?, ?, ?)`);
  db.exec('BEGIN');
  try {
    for (const { keyword, location } of keywords) {
      insert.run(pageId, keyword.toLowerCase(), location);
    }
    db.exec('COMMIT');
  } catch (e) {
    db.exec('ROLLBACK');
    throw e;
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Bulk-insert heading rows (level + text) for a page in one transaction.
 * Rolls back and rethrows on failure.
 */
export function insertHeadings(db, pageId, headings) {
  const insert = db.prepare(`INSERT INTO headings (page_id, level, text) VALUES (?, ?, ?)`);
  db.exec('BEGIN');
  try {
    for (const { level, text } of headings) {
      insert.run(pageId, level, text);
    }
    db.exec('COMMIT');
  } catch (e) {
    db.exec('ROLLBACK');
    throw e;
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Bulk-insert outbound link rows for a source page in one transaction.
 *
 * NOTE(review): rows are appended, never cleared — re-crawling the same page
 * appears to accumulate duplicate link rows (unlike insertPageSchemas, which
 * deletes old rows first). Confirm whether the crawler purges links elsewhere.
 */
export function insertLinks(db, sourceId, links) {
  const insert = db.prepare(`INSERT INTO links (source_id, target_url, anchor_text, is_internal) VALUES (?, ?, ?, ?)`);
  db.exec('BEGIN');
  try {
    for (const { url, anchor, isInternal } of links) {
      insert.run(sourceId, url, anchor, isInternal ? 1 : 0);
    }
    db.exec('COMMIT');
  } catch (e) {
    db.exec('ROLLBACK');
    throw e;
  }
}
|
|
115
|
+
|
|
116
|
+
/**
 * Replace the stored JSON-LD schema rows for a page.
 *
 * The page's old rows are deleted and the new set inserted inside ONE
 * transaction — previously the DELETE ran outside it, so a failed insert
 * rolled back the new rows but the old ones were already gone.
 *
 * @param {object} db
 * @param {number} pageId
 * @param {Array<object>} schemas - parsed schema objects (type, name, rating, ...)
 */
export function insertPageSchemas(db, pageId, schemas) {
  if (!schemas || schemas.length === 0) {
    // Nothing new to store — just clear old rows (re-crawl overwrites).
    // A single DELETE statement is atomic on its own.
    db.prepare('DELETE FROM page_schemas WHERE page_id = ?').run(pageId);
    return;
  }

  const stmt = db.prepare(`
    INSERT INTO page_schemas
    (page_id, schema_type, name, description, rating, rating_count,
     price, currency, author, date_published, date_modified, image_url,
     raw_json, extracted_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);
  db.exec('BEGIN');
  try {
    // Clear old schemas inside the transaction so a failed insert below
    // rolls the delete back too, leaving the previous data intact.
    db.prepare('DELETE FROM page_schemas WHERE page_id = ?').run(pageId);
    for (const s of schemas) {
      stmt.run(
        pageId,
        s.type,
        s.name || null,
        s.description?.slice(0, 500) || null,  // cap description length
        s.rating ?? null,
        s.ratingCount ?? null,
        s.price || null,
        s.currency || null,
        s.author || null,
        s.datePublished || null,
        s.dateModified || null,
        s.imageUrl || null,
        JSON.stringify(s.raw),
        Date.now()
      );
    }
    db.exec('COMMIT');
  } catch (e) { db.exec('ROLLBACK'); throw e; }
}
|
|
151
|
+
|
|
152
|
+
/**
 * All stored JSON-LD schema rows for a project, joined with their page URL
 * and domain info. Ordered by domain, then schema type.
 */
export function getSchemasByProject(db, project) {
  const query = `
    SELECT
      d.domain, d.role, p.url,
      ps.schema_type, ps.name, ps.description,
      ps.rating, ps.rating_count,
      ps.price, ps.currency,
      ps.author, ps.date_published, ps.date_modified,
      ps.image_url, ps.raw_json
    FROM page_schemas ps
    JOIN pages p ON p.id = ps.page_id
    JOIN domains d ON d.id = p.domain_id
    WHERE d.project = ?
    ORDER BY d.domain, ps.schema_type
  `;
  return db.prepare(query).all(project);
}
|
|
168
|
+
|
|
169
|
+
/**
 * Per-domain aggregate stats for a project: page count, average word count,
 * and the distinct product types / pricing tiers / CTAs seen in extractions.
 */
export function getCompetitorSummary(db, project) {
  const query = `
    SELECT
      d.domain,
      d.role,
      COUNT(DISTINCT p.id) as page_count,
      AVG(p.word_count) as avg_word_count,
      GROUP_CONCAT(DISTINCT e.product_type) as product_types,
      GROUP_CONCAT(DISTINCT e.pricing_tier) as pricing_tiers,
      GROUP_CONCAT(DISTINCT e.cta_primary) as ctas
    FROM domains d
    JOIN pages p ON p.domain_id = d.id
    LEFT JOIN extractions e ON e.page_id = p.id
    WHERE d.project = ?
    GROUP BY d.domain, d.role
  `;
  return db.prepare(query).all(project);
}
|
|
186
|
+
|
|
187
|
+
/**
 * Keyword frequency matrix across a project's domains, most frequent first.
 *
 * NOTE(review): k.location is selected but neither aggregated nor in the
 * GROUP BY — SQLite returns an arbitrary row's value per group; confirm
 * that is intended by the report consumers.
 */
export function getKeywordMatrix(db, project) {
  const query = `
    SELECT
      k.keyword,
      d.domain,
      d.role,
      k.location,
      COUNT(*) as freq
    FROM keywords k
    JOIN pages p ON p.id = k.page_id
    JOIN domains d ON d.id = p.domain_id
    WHERE d.project = ?
    GROUP BY k.keyword, d.domain
    ORDER BY freq DESC
  `;
  return db.prepare(query).all(project);
}
|
|
203
|
+
|
|
204
|
+
/**
 * All heading rows for a project with domain context,
 * ordered by domain then heading level.
 */
export function getHeadingStructure(db, project) {
  const query = `
    SELECT d.domain, d.role, h.level, h.text
    FROM headings h
    JOIN pages p ON p.id = h.page_id
    JOIN domains d ON d.id = p.domain_id
    WHERE d.project = ?
    ORDER BY d.domain, h.level
  `;
  return db.prepare(query).all(project);
}
|
package/db/schema.sql
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
-- SEO Intel Database Schema
-- Timestamps are epoch milliseconds (INTEGER) unless noted as ISO strings.

-- Crawl targets and competitors, grouped into named projects.
CREATE TABLE IF NOT EXISTS domains (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  domain TEXT UNIQUE NOT NULL,
  project TEXT NOT NULL,              -- e.g. 'mysite'
  role TEXT NOT NULL,                 -- 'target' | 'competitor'
  first_seen INTEGER NOT NULL,
  last_crawled INTEGER
);

-- One row per crawled URL (unique on url); refreshed on re-crawl.
CREATE TABLE IF NOT EXISTS pages (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  domain_id INTEGER NOT NULL REFERENCES domains(id),
  url TEXT UNIQUE NOT NULL,
  crawled_at INTEGER NOT NULL,
  status_code INTEGER,
  word_count INTEGER,
  load_ms INTEGER,
  is_indexable INTEGER DEFAULT 1,
  click_depth INTEGER DEFAULT 0,      -- BFS depth from homepage (0 = homepage)
  first_seen_at INTEGER,              -- epoch ms when this URL was first discovered
  published_date TEXT,                -- ISO string or null
  modified_date TEXT,                 -- ISO string or null
  content_hash TEXT,                  -- SHA-256 of body text for incremental crawling
  -- Redundant with the inline REFERENCES on domain_id above; harmless.
  FOREIGN KEY (domain_id) REFERENCES domains(id)
);

-- One structured extraction per page (page_id is UNIQUE).
CREATE TABLE IF NOT EXISTS extractions (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER UNIQUE NOT NULL REFERENCES pages(id),
  title TEXT,
  meta_desc TEXT,
  h1 TEXT,
  product_type TEXT,
  pricing_tier TEXT,                  -- 'free' | 'freemium' | 'paid' | 'enterprise' | 'none'
  cta_primary TEXT,
  tech_stack TEXT,                    -- JSON array
  schema_types TEXT,                  -- JSON array (Article, Product, FAQ, etc.)
  search_intent TEXT,                 -- 'Informational' | 'Navigational' | 'Commercial' | 'Transactional'
  primary_entities TEXT,              -- JSON array of 3-7 core concept strings
  extracted_at INTEGER NOT NULL
);

-- Heading outline (h1-h6) per page, one row per heading.
CREATE TABLE IF NOT EXISTS headings (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER NOT NULL REFERENCES pages(id),
  level INTEGER NOT NULL,             -- 1-6
  text TEXT NOT NULL
);

-- Extracted keywords, one row per (page, keyword, location) occurrence.
CREATE TABLE IF NOT EXISTS keywords (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER NOT NULL REFERENCES pages(id),
  keyword TEXT NOT NULL,
  location TEXT NOT NULL,             -- 'title' | 'h1' | 'h2' | 'meta' | 'body'
  search_volume INTEGER,              -- monthly search volume (null until API populated)
  keyword_difficulty INTEGER          -- 0-100 (null until API populated)
);

-- Outbound links per source page (target stored as raw URL, not a page FK).
CREATE TABLE IF NOT EXISTS links (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  source_id INTEGER NOT NULL REFERENCES pages(id),
  target_url TEXT NOT NULL,
  anchor_text TEXT,
  is_internal INTEGER NOT NULL DEFAULT 0
);

-- Technical SEO flags per page (page_id is UNIQUE).
CREATE TABLE IF NOT EXISTS technical (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER UNIQUE NOT NULL REFERENCES pages(id),
  has_canonical INTEGER DEFAULT 0,
  has_og_tags INTEGER DEFAULT 0,
  has_schema INTEGER DEFAULT 0,
  is_mobile_ok INTEGER DEFAULT 0,
  has_sitemap INTEGER DEFAULT 0,
  has_robots INTEGER DEFAULT 0,
  core_web_vitals TEXT                -- JSON: { lcp, cls, fid }
);

-- Stored model analysis runs, one row per generated report.
CREATE TABLE IF NOT EXISTS analyses (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  project TEXT NOT NULL,
  generated_at INTEGER NOT NULL,
  model TEXT NOT NULL,
  keyword_gaps TEXT,                  -- JSON array
  long_tails TEXT,                    -- JSON array
  quick_wins TEXT,                    -- JSON array
  new_pages TEXT,                     -- JSON array
  content_gaps TEXT,                  -- JSON array
  positioning TEXT,
  raw TEXT                            -- full model response
);

-- Parsed JSON-LD blocks per page; cleared and rewritten on each re-crawl.
CREATE TABLE IF NOT EXISTS page_schemas (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER NOT NULL REFERENCES pages(id),
  schema_type TEXT NOT NULL,          -- '@type' value: Organization, Product, Article, FAQ, etc.
  name TEXT,                          -- schema name field
  description TEXT,                   -- schema description field
  rating REAL,                        -- aggregateRating.ratingValue
  rating_count INTEGER,               -- aggregateRating.reviewCount or ratingCount
  price TEXT,                         -- offers.price or priceRange
  currency TEXT,                      -- offers.priceCurrency
  author TEXT,                        -- author.name
  date_published TEXT,                -- datePublished from schema
  date_modified TEXT,                 -- dateModified from schema
  image_url TEXT,                     -- image or image.url
  raw_json TEXT NOT NULL,             -- full JSON-LD object for future queries
  extracted_at INTEGER NOT NULL
);

-- Indexes (hot join/filter paths used by the report queries)
CREATE INDEX IF NOT EXISTS idx_pages_domain ON pages(domain_id);
CREATE INDEX IF NOT EXISTS idx_keywords_page ON keywords(page_id);
CREATE INDEX IF NOT EXISTS idx_keywords_kw ON keywords(keyword);
CREATE INDEX IF NOT EXISTS idx_links_source ON links(source_id);
CREATE INDEX IF NOT EXISTS idx_headings_page ON headings(page_id);
CREATE INDEX IF NOT EXISTS idx_page_schemas_page ON page_schemas(page_id);
CREATE INDEX IF NOT EXISTS idx_page_schemas_type ON page_schemas(schema_type);
|