seo-intel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +41 -0
- package/LICENSE +75 -0
- package/README.md +243 -0
- package/Start SEO Intel.bat +9 -0
- package/Start SEO Intel.command +8 -0
- package/cli.js +3727 -0
- package/config/example.json +29 -0
- package/config/setup-wizard.js +522 -0
- package/crawler/index.js +566 -0
- package/crawler/robots.js +103 -0
- package/crawler/sanitize.js +124 -0
- package/crawler/schema-parser.js +168 -0
- package/crawler/sitemap.js +103 -0
- package/crawler/stealth.js +393 -0
- package/crawler/subdomain-discovery.js +341 -0
- package/db/db.js +213 -0
- package/db/schema.sql +120 -0
- package/exports/competitive.js +186 -0
- package/exports/heuristics.js +67 -0
- package/exports/queries.js +197 -0
- package/exports/suggestive.js +230 -0
- package/exports/technical.js +180 -0
- package/exports/templates.js +77 -0
- package/lib/gate.js +204 -0
- package/lib/license.js +369 -0
- package/lib/oauth.js +432 -0
- package/lib/updater.js +324 -0
- package/package.json +68 -0
- package/reports/generate-html.js +6194 -0
- package/reports/generate-site-graph.js +949 -0
- package/reports/gsc-loader.js +190 -0
- package/scheduler.js +142 -0
- package/seo-audit.js +619 -0
- package/seo-intel.png +0 -0
- package/server.js +602 -0
- package/setup/ROADMAP.md +109 -0
- package/setup/checks.js +483 -0
- package/setup/config-builder.js +227 -0
- package/setup/engine.js +65 -0
- package/setup/installers.js +197 -0
- package/setup/models.js +328 -0
- package/setup/openclaw-bridge.js +329 -0
- package/setup/validator.js +395 -0
- package/setup/web-routes.js +688 -0
- package/setup/wizard.html +2920 -0
- package/start-seo-intel.sh +8 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
/**
 * SEO Intel — Subdomain Discovery
 *
 * Finds subdomains for a root domain using multiple passive + active techniques.
 * No bruteforce — uses public data sources + crawl data + DNS checks.
 *
 * Methods (in order of speed/reliability):
 *   1. Certificate Transparency logs (crt.sh) — free, fast, comprehensive
 *   2. Crawl data mining — check links already in our DB for subdomains
 *   3. Common subdomain probe — check well-known subdomains (docs, api, app, etc.)
 *   4. DNS verification — confirm discovered subdomains actually resolve
 *
 * Usage:
 *   import { discoverSubdomains } from './subdomain-discovery.js';
 *   const results = await discoverSubdomains('example.com', { db });
 */

import { resolve as dnsResolve } from 'dns';
import { promisify } from 'util';
import { fetchSitemap } from './sitemap.js';

// Promisified dns.resolve — resolves a hostname to its records, rejects on NXDOMAIN.
const resolveDns = promisify(dnsResolve);

// Common subdomains to probe (prioritized by SEO relevance)
const COMMON_SUBDOMAINS = [
  'www', 'docs', 'blog', 'app', 'api', 'dl', 'cdn',
  'rpc', 'status', 'dashboard', 'portal', 'help', 'support',
  'dev', 'staging', 'beta', 'shop', 'store', 'mail',
  'admin', 'auth', 'accounts', 'community', 'forum',
  'learn', 'academy', 'wiki', 'kb', 'changelog',
];
|
|
32
|
+
|
|
33
|
+
// ── Certificate Transparency (crt.sh) ─────────────────────────────────────

/**
 * Query crt.sh for all subdomains seen in SSL certificates.
 * This is the most comprehensive passive method — catches subdomains
 * that were ever issued a cert, even if they're no longer active.
 *
 * @param {string} rootDomain - e.g. "example.com"
 * @returns {Promise<string[]>} unique hostnames under rootDomain; [] on any failure (best-effort)
 */
async function queryCrtSh(rootDomain) {
  const controller = new AbortController();
  // crt.sh can be very slow under load — cap the request at 15s.
  const timeout = setTimeout(() => controller.abort(), 15000);

  try {
    // %25 is a URL-encoded '%' — crt.sh wildcard matching any subdomain label.
    const url = `https://crt.sh/?q=%25.${encodeURIComponent(rootDomain)}&output=json`;
    const res = await fetch(url, {
      signal: controller.signal,
      headers: { 'Accept': 'application/json' },
    });

    if (!res.ok) return [];

    const data = await res.json();
    // crt.sh occasionally returns a non-array payload on errors; treat that as
    // "no results" explicitly instead of relying on the catch-all below.
    if (!Array.isArray(data)) return [];

    const subdomains = new Set();

    for (const entry of data) {
      const name = (entry.name_value || '').toLowerCase();
      // crt.sh returns wildcard and multi-line entries
      for (const line of name.split('\n')) {
        const cleaned = line.trim().replace(/^\*\./, '');
        if (cleaned.endsWith('.' + rootDomain) || cleaned === rootDomain) {
          subdomains.add(cleaned);
        }
      }
    }

    return [...subdomains];
  } catch {
    // Network error / abort / invalid JSON — this source is best-effort.
    return [];
  } finally {
    clearTimeout(timeout);
  }
}
|
|
74
|
+
|
|
75
|
+
// ── Crawl Data Mining ──────────────────────────────────────────────────────

/** Add every hostname under rootDomain found in `urls` to the `subdomains` set. */
function collectHostnames(rootDomain, urls, subdomains) {
  for (const raw of urls) {
    try {
      const { hostname } = new URL(raw);
      if (hostname.endsWith('.' + rootDomain) || hostname === rootDomain) {
        subdomains.add(hostname);
      }
    } catch { /* skip invalid URLs */ }
  }
}

/**
 * Scan existing crawl data for links pointing to subdomains.
 * Free — uses data we already have.
 *
 * @param {string} rootDomain - e.g. "example.com"
 * @param {object} db - SQLite handle with prepare(...).all(...)
 * @returns {string[]} unique hostnames; [] when db is absent or the query fails
 */
function mineFromCrawlData(rootDomain, db) {
  if (!db) return [];

  try {
    const subdomains = new Set();
    // Bind the LIKE pattern as a parameter — never interpolate the domain into
    // the SQL text (a quote in the value would break or inject the query).
    const pattern = `%${rootDomain}%`;

    // Check all URLs we've seen in links table
    const links = db.prepare(`
      SELECT DISTINCT target_url FROM links
      WHERE target_url LIKE ?
    `).all(pattern);
    collectHostnames(rootDomain, links.map(r => r.target_url), subdomains);

    // Also check page URLs
    const pages = db.prepare(`
      SELECT DISTINCT url FROM pages
      WHERE url LIKE ?
    `).all(pattern);
    collectHostnames(rootDomain, pages.map(r => r.url), subdomains);

    return [...subdomains];
  } catch {
    // Missing tables / schema drift — this source is best-effort.
    return [];
  }
}
|
|
121
|
+
|
|
122
|
+
// ── Common Subdomain Probe ─────────────────────────────────────────────────

/**
 * Probe well-known subdomains via DNS lookup.
 * Fast — just DNS queries, no HTTP requests.
 *
 * @param {string} rootDomain
 * @returns {Promise<string[]>} candidate hostnames that resolved
 */
async function probeCommonSubdomains(rootDomain) {
  const resolved = [];

  await Promise.all(
    COMMON_SUBDOMAINS.map(async (label) => {
      const candidate = `${label}.${rootDomain}`;
      try {
        await resolveDns(candidate);
        resolved.push(candidate);
      } catch {
        // NXDOMAIN — doesn't exist
      }
    }),
  );

  return resolved;
}
|
|
144
|
+
|
|
145
|
+
// ── DNS Verification ───────────────────────────────────────────────────────

/**
 * Verify a list of hostnames actually resolve via DNS.
 * Filters out expired/dead subdomains from crt.sh results.
 *
 * @param {string[]} hostnames
 * @returns {Promise<Array<{hostname: string, ip: string}>>} live hosts with their first DNS record
 */
async function verifyDns(hostnames) {
  const alive = [];

  await Promise.all(
    hostnames.map(async (hostname) => {
      try {
        const records = await resolveDns(hostname);
        if (records && records.length > 0) {
          alive.push({ hostname, ip: records[0] });
        }
      } catch {
        // Dead subdomain — skip
      }
    }),
  );

  return alive;
}
|
|
168
|
+
|
|
169
|
+
// ── HTTP Liveness Check ────────────────────────────────────────────────────

/**
 * Quick HTTP check to see if a subdomain serves content.
 * Returns status code and basic page info.
 *
 * @param {string} hostname
 * @returns {Promise<object>} on success: { hostname, status, finalUrl, isHtml,
 *   title, redirected, redirectTarget }; on failure: { hostname, status: 0,
 *   error, isHtml: false, title: null }. Never throws.
 */
async function checkHttp(hostname) {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 8000);

  try {
    const res = await fetch(`https://${hostname}`, {
      signal: controller.signal,
      redirect: 'follow',
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://froggo.pro/seo-intel/bot)',
      },
    });

    const finalUrl = res.url;
    const status = res.status;
    const contentType = res.headers.get('content-type') || '';
    const isHtml = contentType.includes('text/html');

    // Read just enough to check if it's a real page
    let title = null;
    if (isHtml) {
      const text = await res.text();
      const titleMatch = text.match(/<title[^>]*>([^<]+)</i);
      title = titleMatch ? titleMatch[1].trim() : null;
    } else {
      // Discard the unread body so the underlying connection can be released
      // (unread fetch bodies keep the socket/stream alive in Node's undici).
      try { await res.body?.cancel(); } catch { /* best-effort */ }
    }

    // Parse the final URL once; a hostname change means we were redirected
    // off this subdomain (e.g. docs.example.com -> www.example.com).
    const finalHost = new URL(finalUrl).hostname;
    const redirected = finalHost !== hostname;

    return {
      hostname,
      status,
      finalUrl,
      isHtml,
      title,
      redirected,
      redirectTarget: redirected ? finalHost : null,
    };
  } catch (err) {
    return {
      hostname,
      status: 0,
      error: err.code || err.message || 'unknown',
      isHtml: false,
      title: null,
    };
  } finally {
    clearTimeout(timeout);
  }
}
|
|
222
|
+
|
|
223
|
+
// ── Main Discovery Function ────────────────────────────────────────────────

/**
 * Discover all subdomains for a root domain.
 *
 * Pipeline: crt.sh → crawl-data mining → common-subdomain DNS probe →
 * DNS verification → (optional) HTTP liveness → sitemap page counts.
 * Only DNS-verified hostnames appear in the final result.
 *
 * @param {string} rootDomain - e.g. "example.com"
 * @param {object} opts
 * @param {object} [opts.db] - SQLite database (for crawl data mining)
 * @param {boolean} [opts.httpCheck=true] - also check HTTP liveness
 * @param {function} [opts.onProgress] - callback({ phase, found, total })
 * @returns {Promise<SubdomainResult>}
 */
export async function discoverSubdomains(rootDomain, opts = {}) {
  const { db, httpCheck = true, onProgress } = opts;

  const allFound = new Set();
  const sources = {};   // per-phase discovery counts for the final report

  // Phase 1: Certificate Transparency
  if (onProgress) onProgress({ phase: 'crt.sh', message: 'Checking certificate transparency logs...' });
  const crtResults = await queryCrtSh(rootDomain);
  for (const d of crtResults) allFound.add(d);
  sources['crt.sh'] = crtResults.length;

  // Phase 2: Crawl data mining
  if (db) {
    if (onProgress) onProgress({ phase: 'crawl-data', message: 'Mining existing crawl data...' });
    const crawlResults = mineFromCrawlData(rootDomain, db);
    for (const d of crawlResults) allFound.add(d);
    sources['crawl-data'] = crawlResults.length;
  }

  // Phase 3: Common subdomain probe
  if (onProgress) onProgress({ phase: 'dns-probe', message: 'Probing common subdomains...' });
  const probeResults = await probeCommonSubdomains(rootDomain);
  for (const d of probeResults) allFound.add(d);
  sources['dns-probe'] = probeResults.length;

  // Phase 4: DNS verification (filter dead ones from crt.sh)
  if (onProgress) onProgress({ phase: 'dns-verify', message: `Verifying ${allFound.size} subdomains via DNS...` });
  const verified = await verifyDns([...allFound]);
  const liveHostnames = new Set(verified.map(v => v.hostname));

  // Phase 5: HTTP liveness check (optional)
  let httpResults = [];
  if (httpCheck) {
    if (onProgress) onProgress({ phase: 'http-check', message: `Checking HTTP on ${liveHostnames.size} live subdomains...` });

    // Check in batches of 5 to not overwhelm
    const liveList = [...liveHostnames];
    for (let i = 0; i < liveList.length; i += 5) {
      const batch = liveList.slice(i, i + 5);
      const results = await Promise.all(batch.map(h => checkHttp(h)));
      httpResults.push(...results);
    }
  }

  // Phase 6: Sitemap check — get page counts for SEO-relevant subdomains
  const sitemapResults = new Map();
  // "SEO-relevant" = serves its own 200 HTML page (not a redirect elsewhere).
  const seoLive = httpResults.filter(r => r.isHtml && r.status === 200 && !r.redirected);

  if (seoLive.length > 0) {
    if (onProgress) onProgress({ phase: 'sitemaps', message: `Checking sitemaps on ${seoLive.length} live subdomains...` });

    // Check sitemaps in batches of 3
    for (let i = 0; i < seoLive.length; i += 3) {
      const batch = seoLive.slice(i, i + 3);
      const results = await Promise.all(batch.map(async (r) => {
        try {
          const urls = await fetchSitemap(`https://${r.hostname}`);
          return { hostname: r.hostname, urls };
        } catch {
          return { hostname: r.hostname, urls: [] };
        }
      }));
      for (const r of results) sitemapResults.set(r.hostname, r.urls);
    }
    // Note: unlike the other sources, this counts sitemap URLs, not subdomains.
    sources['sitemaps'] = [...sitemapResults.values()].reduce((sum, urls) => sum + urls.length, 0);
  }

  // Build final result — one entry per DNS-verified hostname, sorted.
  const subdomains = [...liveHostnames].sort().map(hostname => {
    const http = httpResults.find(r => r.hostname === hostname) || {};
    const dns = verified.find(v => v.hostname === hostname) || {};
    const isRoot = hostname === rootDomain;
    const sub = isRoot ? '(root)' : hostname.replace('.' + rootDomain, '');
    const sitemap = sitemapResults.get(hostname) || [];

    return {
      hostname,
      subdomain: sub,
      isRoot,
      ip: dns.ip || null,
      httpStatus: http.status || null,
      title: http.title || null,
      isHtml: http.isHtml || false,
      redirected: http.redirected || false,
      redirectTarget: http.redirectTarget || null,
      error: http.error || null,
      sitemapUrls: sitemap.length,
      // sitemap entries may be objects ({ url }) or plain strings — handle both.
      sitemapSample: sitemap.slice(0, 5).map(u => u.url || u),
      // SEO relevance score
      seoRelevant: http.isHtml && http.status === 200 && !http.redirected,
    };
  });

  // Total sitemap URLs across all subdomains
  const totalSitemapUrls = subdomains.reduce((sum, s) => sum + s.sitemapUrls, 0);

  return {
    rootDomain,
    discovered: subdomains.length,
    live: subdomains.filter(s => s.httpStatus === 200).length,
    seoRelevant: subdomains.filter(s => s.seoRelevant).length,
    totalSitemapUrls,
    sources,
    subdomains,
  };
}
|
package/db/db.js
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import { DatabaseSync } from 'node:sqlite';
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';

// Directory of this module — used to locate schema.sql next to db.js.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Module-level singleton database handle, lazily created by getDb().
let _db = null;
|
|
9
|
+
|
|
10
|
+
/**
 * Open (or return the cached) SQLite database and ensure the schema exists.
 *
 * NOTE(review): the handle is a module-level singleton — after the first
 * call, the dbPath argument of subsequent calls is silently ignored. Confirm
 * callers never need two different database files in one process.
 *
 * @param {string} [dbPath='./seo-intel.db'] - path to the database file
 * @returns {DatabaseSync} shared database handle
 */
export function getDb(dbPath = './seo-intel.db') {
  if (_db) return _db;

  _db = new DatabaseSync(dbPath);

  // Pragmas: WAL for concurrent readers, generous lock wait, enforce FKs.
  const pragmas = [
    'PRAGMA journal_mode = WAL',
    'PRAGMA busy_timeout = 10000',
    'PRAGMA foreign_keys = ON',
  ];
  for (const pragma of pragmas) {
    _db.exec(pragma);
  }

  // Apply schema (idempotent: CREATE TABLE IF NOT EXISTS throughout).
  _db.exec(readFileSync(join(__dirname, 'schema.sql'), 'utf8'));

  // Migrations for existing databases
  try { _db.exec('ALTER TABLE pages ADD COLUMN content_hash TEXT'); } catch { /* already exists */ }
  try { _db.exec('ALTER TABLE pages ADD COLUMN first_seen_at INTEGER'); } catch { /* already exists */ }

  // Backfill first_seen_at from crawled_at for existing rows
  _db.exec('UPDATE pages SET first_seen_at = crawled_at WHERE first_seen_at IS NULL');

  // page_schemas table is created by schema.sql — no migration needed (new table)

  return _db;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Insert a domain row, or refresh its project/role/last_crawled on conflict.
 * first_seen is only written by the initial INSERT (not in the UPDATE set),
 * so it keeps the original discovery time.
 *
 * @param {object} db - SQLite handle
 * @param {{domain: string, project: string, role: string}} fields
 * @returns {object} statement run result
 */
export function upsertDomain(db, { domain, project, role }) {
  const timestamp = Date.now();
  const sql = `
    INSERT INTO domains (domain, project, role, first_seen, last_crawled)
    VALUES (?, ?, ?, ?, ?)
    ON CONFLICT(domain) DO UPDATE SET
      project = excluded.project,
      role = excluded.role,
      last_crawled = excluded.last_crawled
  `;
  return db.prepare(sql).run(domain, project, role, timestamp, timestamp);
}
|
|
44
|
+
|
|
45
|
+
/**
 * Insert or refresh a crawled page row, keyed by URL.
 * first_seen_at is intentionally NOT in the ON CONFLICT update set, so it
 * retains the value from the original INSERT (first discovery time).
 *
 * @returns {{id: number}|undefined} the page's id row
 */
export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, isIndexable, clickDepth = 0, publishedDate = null, modifiedDate = null, contentHash = null }) {
  const now = Date.now();
  const params = [
    domainId, url, now, now, statusCode, wordCount, loadMs,
    isIndexable ? 1 : 0, clickDepth, publishedDate, modifiedDate, contentHash,
  ];
  db.prepare(`
    INSERT INTO pages (domain_id, url, crawled_at, first_seen_at, status_code, word_count, load_ms, is_indexable, click_depth, published_date, modified_date, content_hash)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(url) DO UPDATE SET
      crawled_at = excluded.crawled_at,
      status_code = excluded.status_code,
      word_count = excluded.word_count,
      load_ms = excluded.load_ms,
      click_depth = excluded.click_depth,
      published_date = excluded.published_date,
      modified_date = excluded.modified_date,
      content_hash = excluded.content_hash
  `).run(...params);
  // Fetch the row id back (works for both the insert and the update path).
  return db.prepare('SELECT id FROM pages WHERE url = ?').get(url);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Look up the stored content hash for a URL.
 * @returns {string|null} hash from the pages row, or null when unknown
 */
export function getPageHash(db, url) {
  const row = db.prepare('SELECT content_hash FROM pages WHERE url = ?').get(url);
  return row?.content_hash || null;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Store (or replace) the structured extraction for a page.
 * Array-valued fields are serialized into JSON text columns; search_intent
 * defaults to 'Informational' when absent.
 *
 * @returns {object|null} run result, or null when pageId is missing
 */
export function insertExtraction(db, { pageId, data }) {
  if (!pageId) {
    console.warn('[db] insertExtraction skipped: pageId is missing');
    return null;
  }
  const params = [
    pageId, data.title, data.meta_desc, data.h1,
    data.product_type, data.pricing_tier, data.cta_primary,
    JSON.stringify(data.tech_stack || []),
    JSON.stringify(data.schema_types || []),
    data.search_intent || 'Informational',
    JSON.stringify(data.primary_entities || []),
    Date.now(),
  ];
  return db.prepare(`
    INSERT OR REPLACE INTO extractions
    (page_id, title, meta_desc, h1, product_type, pricing_tier, cta_primary,
     tech_stack, schema_types, search_intent, primary_entities, extracted_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `).run(...params);
}
|
|
88
|
+
|
|
89
|
+
/**
 * Bulk-insert keyword rows for a page inside a single transaction.
 * Keywords are lowercased before storage; rolls back and rethrows on failure.
 */
export function insertKeywords(db, pageId, keywords) {
  const insert = db.prepare(`INSERT INTO keywords (page_id, keyword, location) VALUES (?, ?, ?)`);
  db.exec('BEGIN');
  try {
    for (const { keyword, location } of keywords) {
      insert.run(pageId, keyword.toLowerCase(), location);
    }
    db.exec('COMMIT');
  } catch (e) {
    db.exec('ROLLBACK');
    throw e;
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Bulk-insert heading rows (level + text) for a page in one transaction.
 * Rolls back and rethrows on failure.
 */
export function insertHeadings(db, pageId, headings) {
  const insert = db.prepare(`INSERT INTO headings (page_id, level, text) VALUES (?, ?, ?)`);
  db.exec('BEGIN');
  try {
    for (const { level, text } of headings) {
      insert.run(pageId, level, text);
    }
    db.exec('COMMIT');
  } catch (e) {
    db.exec('ROLLBACK');
    throw e;
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Bulk-insert outbound link rows for a source page in one transaction.
 *
 * NOTE(review): rows are appended, never cleared — re-crawling the same page
 * appears to accumulate duplicate link rows (unlike insertPageSchemas, which
 * deletes old rows first). Confirm whether the crawler purges links elsewhere.
 */
export function insertLinks(db, sourceId, links) {
  const insert = db.prepare(`INSERT INTO links (source_id, target_url, anchor_text, is_internal) VALUES (?, ?, ?, ?)`);
  db.exec('BEGIN');
  try {
    for (const { url, anchor, isInternal } of links) {
      insert.run(sourceId, url, anchor, isInternal ? 1 : 0);
    }
    db.exec('COMMIT');
  } catch (e) {
    db.exec('ROLLBACK');
    throw e;
  }
}
|
|
115
|
+
|
|
116
|
+
/**
 * Replace the stored JSON-LD schema rows for a page.
 *
 * The page's old rows are deleted and the new set inserted inside ONE
 * transaction — previously the DELETE ran outside it, so a failed insert
 * rolled back the new rows but the old ones were already gone.
 *
 * @param {object} db
 * @param {number} pageId
 * @param {Array<object>} schemas - parsed schema objects (type, name, rating, ...)
 */
export function insertPageSchemas(db, pageId, schemas) {
  if (!schemas || schemas.length === 0) {
    // Nothing new to store — just clear old rows (re-crawl overwrites).
    // A single DELETE statement is atomic on its own.
    db.prepare('DELETE FROM page_schemas WHERE page_id = ?').run(pageId);
    return;
  }

  const stmt = db.prepare(`
    INSERT INTO page_schemas
    (page_id, schema_type, name, description, rating, rating_count,
     price, currency, author, date_published, date_modified, image_url,
     raw_json, extracted_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);
  db.exec('BEGIN');
  try {
    // Clear old schemas inside the transaction so a failed insert below
    // rolls the delete back too, leaving the previous data intact.
    db.prepare('DELETE FROM page_schemas WHERE page_id = ?').run(pageId);
    for (const s of schemas) {
      stmt.run(
        pageId,
        s.type,
        s.name || null,
        s.description?.slice(0, 500) || null,  // cap description length
        s.rating ?? null,
        s.ratingCount ?? null,
        s.price || null,
        s.currency || null,
        s.author || null,
        s.datePublished || null,
        s.dateModified || null,
        s.imageUrl || null,
        JSON.stringify(s.raw),
        Date.now()
      );
    }
    db.exec('COMMIT');
  } catch (e) { db.exec('ROLLBACK'); throw e; }
}
|
|
151
|
+
|
|
152
|
+
/**
 * All stored JSON-LD schema rows for a project, joined with their page URL
 * and domain info. Ordered by domain, then schema type.
 */
export function getSchemasByProject(db, project) {
  const query = `
    SELECT
      d.domain, d.role, p.url,
      ps.schema_type, ps.name, ps.description,
      ps.rating, ps.rating_count,
      ps.price, ps.currency,
      ps.author, ps.date_published, ps.date_modified,
      ps.image_url, ps.raw_json
    FROM page_schemas ps
    JOIN pages p ON p.id = ps.page_id
    JOIN domains d ON d.id = p.domain_id
    WHERE d.project = ?
    ORDER BY d.domain, ps.schema_type
  `;
  return db.prepare(query).all(project);
}
|
|
168
|
+
|
|
169
|
+
/**
 * Per-domain aggregate stats for a project: page count, average word count,
 * and the distinct product types / pricing tiers / CTAs seen in extractions.
 */
export function getCompetitorSummary(db, project) {
  const query = `
    SELECT
      d.domain,
      d.role,
      COUNT(DISTINCT p.id) as page_count,
      AVG(p.word_count) as avg_word_count,
      GROUP_CONCAT(DISTINCT e.product_type) as product_types,
      GROUP_CONCAT(DISTINCT e.pricing_tier) as pricing_tiers,
      GROUP_CONCAT(DISTINCT e.cta_primary) as ctas
    FROM domains d
    JOIN pages p ON p.domain_id = d.id
    LEFT JOIN extractions e ON e.page_id = p.id
    WHERE d.project = ?
    GROUP BY d.domain, d.role
  `;
  return db.prepare(query).all(project);
}
|
|
186
|
+
|
|
187
|
+
/**
 * Keyword frequency matrix across a project's domains, most frequent first.
 *
 * NOTE(review): k.location is selected but neither aggregated nor in the
 * GROUP BY — SQLite returns an arbitrary row's value per group; confirm
 * that is intended by the report consumers.
 */
export function getKeywordMatrix(db, project) {
  const query = `
    SELECT
      k.keyword,
      d.domain,
      d.role,
      k.location,
      COUNT(*) as freq
    FROM keywords k
    JOIN pages p ON p.id = k.page_id
    JOIN domains d ON d.id = p.domain_id
    WHERE d.project = ?
    GROUP BY k.keyword, d.domain
    ORDER BY freq DESC
  `;
  return db.prepare(query).all(project);
}
|
|
203
|
+
|
|
204
|
+
/**
 * All heading rows for a project with domain context,
 * ordered by domain then heading level.
 */
export function getHeadingStructure(db, project) {
  const query = `
    SELECT d.domain, d.role, h.level, h.text
    FROM headings h
    JOIN pages p ON p.id = h.page_id
    JOIN domains d ON d.id = p.domain_id
    WHERE d.project = ?
    ORDER BY d.domain, h.level
  `;
  return db.prepare(query).all(project);
}
|
package/db/schema.sql
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
-- SEO Intel Database Schema
-- Timestamps are epoch milliseconds (INTEGER) unless noted as ISO strings.

-- Crawl targets and competitors, grouped into named projects.
CREATE TABLE IF NOT EXISTS domains (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  domain TEXT UNIQUE NOT NULL,
  project TEXT NOT NULL,              -- e.g. 'mysite'
  role TEXT NOT NULL,                 -- 'target' | 'competitor'
  first_seen INTEGER NOT NULL,
  last_crawled INTEGER
);

-- One row per crawled URL (unique on url); refreshed on re-crawl.
CREATE TABLE IF NOT EXISTS pages (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  domain_id INTEGER NOT NULL REFERENCES domains(id),
  url TEXT UNIQUE NOT NULL,
  crawled_at INTEGER NOT NULL,
  status_code INTEGER,
  word_count INTEGER,
  load_ms INTEGER,
  is_indexable INTEGER DEFAULT 1,
  click_depth INTEGER DEFAULT 0,      -- BFS depth from homepage (0 = homepage)
  first_seen_at INTEGER,              -- epoch ms when this URL was first discovered
  published_date TEXT,                -- ISO string or null
  modified_date TEXT,                 -- ISO string or null
  content_hash TEXT,                  -- SHA-256 of body text for incremental crawling
  -- Redundant with the inline REFERENCES on domain_id above; harmless.
  FOREIGN KEY (domain_id) REFERENCES domains(id)
);

-- One structured extraction per page (page_id is UNIQUE).
CREATE TABLE IF NOT EXISTS extractions (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER UNIQUE NOT NULL REFERENCES pages(id),
  title TEXT,
  meta_desc TEXT,
  h1 TEXT,
  product_type TEXT,
  pricing_tier TEXT,                  -- 'free' | 'freemium' | 'paid' | 'enterprise' | 'none'
  cta_primary TEXT,
  tech_stack TEXT,                    -- JSON array
  schema_types TEXT,                  -- JSON array (Article, Product, FAQ, etc.)
  search_intent TEXT,                 -- 'Informational' | 'Navigational' | 'Commercial' | 'Transactional'
  primary_entities TEXT,              -- JSON array of 3-7 core concept strings
  extracted_at INTEGER NOT NULL
);

-- Heading outline (h1-h6) per page, one row per heading.
CREATE TABLE IF NOT EXISTS headings (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER NOT NULL REFERENCES pages(id),
  level INTEGER NOT NULL,             -- 1-6
  text TEXT NOT NULL
);

-- Extracted keywords, one row per (page, keyword, location) occurrence.
CREATE TABLE IF NOT EXISTS keywords (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER NOT NULL REFERENCES pages(id),
  keyword TEXT NOT NULL,
  location TEXT NOT NULL,             -- 'title' | 'h1' | 'h2' | 'meta' | 'body'
  search_volume INTEGER,              -- monthly search volume (null until API populated)
  keyword_difficulty INTEGER          -- 0-100 (null until API populated)
);

-- Outbound links per source page (target stored as raw URL, not a page FK).
CREATE TABLE IF NOT EXISTS links (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  source_id INTEGER NOT NULL REFERENCES pages(id),
  target_url TEXT NOT NULL,
  anchor_text TEXT,
  is_internal INTEGER NOT NULL DEFAULT 0
);

-- Technical SEO flags per page (page_id is UNIQUE).
CREATE TABLE IF NOT EXISTS technical (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER UNIQUE NOT NULL REFERENCES pages(id),
  has_canonical INTEGER DEFAULT 0,
  has_og_tags INTEGER DEFAULT 0,
  has_schema INTEGER DEFAULT 0,
  is_mobile_ok INTEGER DEFAULT 0,
  has_sitemap INTEGER DEFAULT 0,
  has_robots INTEGER DEFAULT 0,
  core_web_vitals TEXT                -- JSON: { lcp, cls, fid }
);

-- Stored model analysis runs, one row per generated report.
CREATE TABLE IF NOT EXISTS analyses (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  project TEXT NOT NULL,
  generated_at INTEGER NOT NULL,
  model TEXT NOT NULL,
  keyword_gaps TEXT,                  -- JSON array
  long_tails TEXT,                    -- JSON array
  quick_wins TEXT,                    -- JSON array
  new_pages TEXT,                     -- JSON array
  content_gaps TEXT,                  -- JSON array
  positioning TEXT,
  raw TEXT                            -- full model response
);

-- Parsed JSON-LD blocks per page; cleared and rewritten on each re-crawl.
CREATE TABLE IF NOT EXISTS page_schemas (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  page_id INTEGER NOT NULL REFERENCES pages(id),
  schema_type TEXT NOT NULL,          -- '@type' value: Organization, Product, Article, FAQ, etc.
  name TEXT,                          -- schema name field
  description TEXT,                   -- schema description field
  rating REAL,                        -- aggregateRating.ratingValue
  rating_count INTEGER,               -- aggregateRating.reviewCount or ratingCount
  price TEXT,                         -- offers.price or priceRange
  currency TEXT,                      -- offers.priceCurrency
  author TEXT,                        -- author.name
  date_published TEXT,                -- datePublished from schema
  date_modified TEXT,                 -- dateModified from schema
  image_url TEXT,                     -- image or image.url
  raw_json TEXT NOT NULL,             -- full JSON-LD object for future queries
  extracted_at INTEGER NOT NULL
);

-- Indexes (hot join/filter paths used by the report queries)
CREATE INDEX IF NOT EXISTS idx_pages_domain ON pages(domain_id);
CREATE INDEX IF NOT EXISTS idx_keywords_page ON keywords(page_id);
CREATE INDEX IF NOT EXISTS idx_keywords_kw ON keywords(keyword);
CREATE INDEX IF NOT EXISTS idx_links_source ON links(source_id);
CREATE INDEX IF NOT EXISTS idx_headings_page ON headings(page_id);
CREATE INDEX IF NOT EXISTS idx_page_schemas_page ON page_schemas(page_id);
CREATE INDEX IF NOT EXISTS idx_page_schemas_type ON page_schemas(schema_type);
|