npm - @crawlith/core - Versions diffs - 0.1.0 → 0.1.2 - Mend

@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (238) hide show

package/LICENSE +201 -0
package/README.md +70 -0
package/dist/analysis/analysis_list.html +35 -0
package/dist/analysis/analysis_page.html +123 -0
package/dist/analysis/analyze.d.ts +40 -5
package/dist/analysis/analyze.js +395 -347
package/dist/analysis/clustering.d.ts +23 -0
package/dist/analysis/clustering.js +206 -0
package/dist/analysis/content.d.ts +1 -1
package/dist/analysis/content.js +11 -5
package/dist/analysis/duplicate.d.ts +34 -0
package/dist/analysis/duplicate.js +305 -0
package/dist/analysis/heading.d.ts +116 -0
package/dist/analysis/heading.js +356 -0
package/dist/analysis/images.d.ts +1 -1
package/dist/analysis/images.js +6 -5
package/dist/analysis/links.d.ts +1 -1
package/dist/analysis/links.js +8 -8
package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
package/dist/analysis/scoring.js +11 -2
package/dist/analysis/seo.d.ts +8 -4
package/dist/analysis/seo.js +41 -30
package/dist/analysis/soft404.d.ts +17 -0
package/dist/analysis/soft404.js +62 -0
package/dist/analysis/structuredData.d.ts +1 -1
package/dist/analysis/structuredData.js +5 -4
package/dist/analysis/templates.d.ts +2 -0
package/dist/analysis/templates.js +7 -0
package/dist/application/index.d.ts +2 -0
package/dist/application/index.js +2 -0
package/dist/application/usecase.d.ts +3 -0
package/dist/application/usecase.js +1 -0
package/dist/application/usecases.d.ts +114 -0
package/dist/application/usecases.js +201 -0
package/dist/audit/index.js +1 -1
package/dist/audit/transport.d.ts +1 -1
package/dist/audit/transport.js +5 -4
package/dist/audit/types.d.ts +1 -0
package/dist/constants.d.ts +17 -0
package/dist/constants.js +23 -0
package/dist/core/scope/scopeManager.js +3 -0
package/dist/core/security/ipGuard.d.ts +11 -0
package/dist/core/security/ipGuard.js +71 -3
package/dist/crawler/crawl.d.ts +4 -22
package/dist/crawler/crawl.js +4 -335
package/dist/crawler/crawler.d.ts +87 -0
package/dist/crawler/crawler.js +683 -0
package/dist/crawler/extract.d.ts +4 -1
package/dist/crawler/extract.js +7 -2
package/dist/crawler/fetcher.d.ts +2 -1
package/dist/crawler/fetcher.js +26 -11
package/dist/crawler/metricsRunner.d.ts +23 -1
package/dist/crawler/metricsRunner.js +202 -72
package/dist/crawler/normalize.d.ts +41 -0
package/dist/crawler/normalize.js +119 -3
package/dist/crawler/parser.d.ts +1 -3
package/dist/crawler/parser.js +2 -49
package/dist/crawler/resolver.d.ts +11 -0
package/dist/crawler/resolver.js +67 -0
package/dist/crawler/sitemap.d.ts +6 -0
package/dist/crawler/sitemap.js +27 -17
package/dist/crawler/trap.d.ts +5 -1
package/dist/crawler/trap.js +23 -2
package/dist/db/CrawlithDB.d.ts +110 -0
package/dist/db/CrawlithDB.js +500 -0
package/dist/db/graphLoader.js +42 -30
package/dist/db/index.d.ts +11 -0
package/dist/db/index.js +41 -29
package/dist/db/migrations.d.ts +2 -0
package/dist/db/{schema.js → migrations.js} +90 -43
package/dist/db/pluginRegistry.d.ts +9 -0
package/dist/db/pluginRegistry.js +19 -0
package/dist/db/repositories/EdgeRepository.d.ts +13 -0
package/dist/db/repositories/EdgeRepository.js +20 -0
package/dist/db/repositories/MetricsRepository.d.ts +16 -8
package/dist/db/repositories/MetricsRepository.js +28 -7
package/dist/db/repositories/PageRepository.d.ts +15 -2
package/dist/db/repositories/PageRepository.js +169 -25
package/dist/db/repositories/SiteRepository.d.ts +9 -0
package/dist/db/repositories/SiteRepository.js +13 -0
package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
package/dist/db/repositories/SnapshotRepository.js +64 -5
package/dist/db/reset.d.ts +9 -0
package/dist/db/reset.js +32 -0
package/dist/db/statements.d.ts +12 -0
package/dist/db/statements.js +40 -0
package/dist/diff/compare.d.ts +0 -5
package/dist/diff/compare.js +0 -12
package/dist/diff/service.d.ts +16 -0
package/dist/diff/service.js +41 -0
package/dist/domain/index.d.ts +4 -0
package/dist/domain/index.js +4 -0
package/dist/events.d.ts +56 -0
package/dist/events.js +1 -0
package/dist/graph/graph.d.ts +36 -42
package/dist/graph/graph.js +26 -17
package/dist/graph/hits.d.ts +23 -0
package/dist/graph/hits.js +111 -0
package/dist/graph/metrics.d.ts +0 -4
package/dist/graph/metrics.js +25 -9
package/dist/graph/pagerank.d.ts +17 -4
package/dist/graph/pagerank.js +126 -91
package/dist/graph/simhash.d.ts +6 -0
package/dist/graph/simhash.js +14 -0
package/dist/index.d.ts +29 -8
package/dist/index.js +29 -8
package/dist/lock/hashKey.js +1 -1
package/dist/lock/lockManager.d.ts +5 -1
package/dist/lock/lockManager.js +38 -13
package/dist/plugin-system/plugin-cli.d.ts +10 -0
package/dist/plugin-system/plugin-cli.js +31 -0
package/dist/plugin-system/plugin-config.d.ts +16 -0
package/dist/plugin-system/plugin-config.js +36 -0
package/dist/plugin-system/plugin-loader.d.ts +17 -0
package/dist/plugin-system/plugin-loader.js +122 -0
package/dist/plugin-system/plugin-registry.d.ts +25 -0
package/dist/plugin-system/plugin-registry.js +167 -0
package/dist/plugin-system/plugin-types.d.ts +205 -0
package/dist/plugin-system/plugin-types.js +1 -0
package/dist/ports/index.d.ts +9 -0
package/dist/ports/index.js +1 -0
package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
package/dist/report/crawlExport.d.ts +3 -0
package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
package/dist/report/crawl_template.d.ts +1 -0
package/dist/report/crawl_template.js +7 -0
package/dist/report/export.d.ts +3 -0
package/dist/report/export.js +81 -0
package/dist/report/html.js +15 -216
package/dist/report/insight.d.ts +27 -0
package/dist/report/insight.js +103 -0
package/dist/scoring/health.d.ts +56 -0
package/dist/scoring/health.js +213 -0
package/dist/utils/chalk.d.ts +6 -0
package/dist/utils/chalk.js +41 -0
package/dist/utils/secureConfig.d.ts +23 -0
package/dist/utils/secureConfig.js +128 -0
package/package.json +12 -6
package/CHANGELOG.md +0 -7
package/dist/db/schema.d.ts +0 -2
package/dist/graph/cluster.d.ts +0 -6
package/dist/graph/cluster.js +0 -173
package/dist/graph/duplicate.d.ts +0 -10
package/dist/graph/duplicate.js +0 -251
package/dist/report/sitegraphExport.d.ts +0 -3
package/dist/report/sitegraph_template.d.ts +0 -1
package/dist/report/sitegraph_template.js +0 -630
package/dist/scoring/hits.d.ts +0 -9
package/dist/scoring/hits.js +0 -111
package/src/analysis/analyze.ts +0 -548
package/src/analysis/content.ts +0 -62
package/src/analysis/images.ts +0 -28
package/src/analysis/links.ts +0 -41
package/src/analysis/scoring.ts +0 -59
package/src/analysis/seo.ts +0 -82
package/src/analysis/structuredData.ts +0 -62
package/src/audit/dns.ts +0 -49
package/src/audit/headers.ts +0 -98
package/src/audit/index.ts +0 -66
package/src/audit/scoring.ts +0 -232
package/src/audit/transport.ts +0 -258
package/src/audit/types.ts +0 -102
package/src/core/network/proxyAdapter.ts +0 -21
package/src/core/network/rateLimiter.ts +0 -39
package/src/core/network/redirectController.ts +0 -47
package/src/core/network/responseLimiter.ts +0 -34
package/src/core/network/retryPolicy.ts +0 -57
package/src/core/scope/domainFilter.ts +0 -45
package/src/core/scope/scopeManager.ts +0 -52
package/src/core/scope/subdomainPolicy.ts +0 -39
package/src/core/security/ipGuard.ts +0 -92
package/src/crawler/crawl.ts +0 -382
package/src/crawler/extract.ts +0 -34
package/src/crawler/fetcher.ts +0 -233
package/src/crawler/metricsRunner.ts +0 -124
package/src/crawler/normalize.ts +0 -108
package/src/crawler/parser.ts +0 -190
package/src/crawler/sitemap.ts +0 -73
package/src/crawler/trap.ts +0 -96
package/src/db/graphLoader.ts +0 -105
package/src/db/index.ts +0 -70
package/src/db/repositories/EdgeRepository.ts +0 -29
package/src/db/repositories/MetricsRepository.ts +0 -49
package/src/db/repositories/PageRepository.ts +0 -128
package/src/db/repositories/SiteRepository.ts +0 -32
package/src/db/repositories/SnapshotRepository.ts +0 -74
package/src/db/schema.ts +0 -177
package/src/diff/compare.ts +0 -84
package/src/graph/cluster.ts +0 -192
package/src/graph/duplicate.ts +0 -286
package/src/graph/graph.ts +0 -172
package/src/graph/metrics.ts +0 -110
package/src/graph/pagerank.ts +0 -125
package/src/graph/simhash.ts +0 -61
package/src/index.ts +0 -30
package/src/lock/hashKey.ts +0 -51
package/src/lock/lockManager.ts +0 -124
package/src/lock/pidCheck.ts +0 -13
package/src/report/html.ts +0 -227
package/src/report/sitegraphExport.ts +0 -58
package/src/scoring/hits.ts +0 -131
package/src/scoring/orphanSeverity.ts +0 -176
package/src/utils/version.ts +0 -18
package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
package/tests/analysis.unit.test.ts +0 -98
package/tests/analyze.integration.test.ts +0 -98
package/tests/audit/dns.test.ts +0 -31
package/tests/audit/headers.test.ts +0 -45
package/tests/audit/scoring.test.ts +0 -133
package/tests/audit/security.test.ts +0 -12
package/tests/audit/transport.test.ts +0 -112
package/tests/clustering.test.ts +0 -118
package/tests/crawler.test.ts +0 -358
package/tests/db.test.ts +0 -159
package/tests/diff.test.ts +0 -67
package/tests/duplicate.test.ts +0 -110
package/tests/fetcher.test.ts +0 -106
package/tests/fetcher_safety.test.ts +0 -85
package/tests/fixtures/analyze-crawl.json +0 -26
package/tests/hits.test.ts +0 -134
package/tests/html_report.test.ts +0 -58
package/tests/lock/lockManager.test.ts +0 -138
package/tests/metrics.test.ts +0 -196
package/tests/normalize.test.ts +0 -101
package/tests/orphanSeverity.test.ts +0 -160
package/tests/pagerank.test.ts +0 -98
package/tests/parser.test.ts +0 -117
package/tests/proxy_safety.test.ts +0 -57
package/tests/redirect_safety.test.ts +0 -73
package/tests/safety.test.ts +0 -114
package/tests/scope.test.ts +0 -66
package/tests/scoring.test.ts +0 -59
package/tests/sitemap.test.ts +0 -88
package/tests/soft404.test.ts +0 -41
package/tests/trap.test.ts +0 -39
package/tests/visualization_data.test.ts +0 -46
package/tsconfig.json +0 -11

package/dist/analysis/analyze.js CHANGED Viewed

@@ -1,293 +1,342 @@
-import fs from 'node:fs/promises';
+import { load } from 'cheerio';
 import { crawl } from '../crawler/crawl.js';
+import { UrlResolver } from '../crawler/resolver.js';
+import { Fetcher } from '../crawler/fetcher.js';
 import { loadGraphFromSnapshot } from '../db/graphLoader.js';
-import { normalizeUrl } from '../crawler/normalize.js';
+import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
 import { calculateMetrics } from '../graph/metrics.js';
-import { Graph } from '../graph/graph.js';
 import { analyzeContent, calculateThinContentScore } from './content.js';
-import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses } from './seo.js';
+import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
 import { analyzeImageAlts } from './images.js';
 import { analyzeLinks } from './links.js';
 import { analyzeStructuredData } from './structuredData.js';
 import { aggregateSiteScore, scorePageSeo } from './scoring.js';
-import { detectContentClusters } from '../graph/cluster.js';
+import { ClusteringService } from './clustering.js';
+import { DuplicateService } from './duplicate.js';
+import { Soft404Service } from './soft404.js';
 import { getDb } from '../db/index.js';
 import { SiteRepository } from '../db/repositories/SiteRepository.js';
 import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
 import { PageRepository } from '../db/repositories/PageRepository.js';
-export async function analyzeSite(url, options) {
-    const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
-    if (!normalizedRoot) {
+import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
+import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
+import { DEFAULTS } from '../constants.js';
+import { PageRankService } from '../graph/pagerank.js';
+import { HITSService } from '../graph/hits.js';
+import { HeadingHealthService } from './heading.js';
+import { annotateOrphans } from './orphan.js';
+import { HealthService } from '../scoring/health.js';
+/**
+ * Analyzes a site for SEO, content, and accessibility.
+ * Supports live crawling or loading from a database snapshot.
+ */
+export async function analyzeSite(url, options, context) {
+    // 1. Parse siteOrigin (e.g. https://example.com) and targetPath (e.g. /stats) from the URL.
+    //    We resolve the *origin* — not the full page URL — so rootOrigin is always just the
+    //    scheme+host and normalizedPath is always the pathname.
+    let parsedUrl = null;
+    try {
+        parsedUrl = new URL(url);
+    }
+    catch { /* bare domain fallback below */ }
+    const inputFullUrl = parsedUrl ? parsedUrl.toString() : (url.startsWith('http') ? url : `https://${url}`);
+    const inputOrigin = parsedUrl ? `${parsedUrl.protocol}//${parsedUrl.host}` : url;
+    let rootOrigin = inputOrigin;
+    if (options.live !== false) {
+        const resolver = new UrlResolver();
+        const fetcher = new Fetcher({ rate: options.rate, proxyUrl: options.proxyUrl, userAgent: options.userAgent });
+        try {
+            const resolved = await resolver.resolve(inputOrigin, fetcher);
+            rootOrigin = resolved.url;
+        }
+        catch {
+            // Fallback to basic normalization if resolution fails
+        }
+    }
+    // Normalize origin and target URL independently.
+    const normalizedOrigin = normalizeUrl(rootOrigin, '', { stripQuery: false });
+    if (!normalizedOrigin) {
         throw new Error('Invalid URL for analysis');
     }
+    const normalizedTargetAbs = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false }) || inputFullUrl;
+    const normalizedPath = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false, toPath: true })
+        || UrlUtil.toPath(normalizedTargetAbs, rootOrigin);
+    const start = Date.now();
     let crawlData;
+    let robots = null;
+    // 1. Robots fetch (live-mode only to keep snapshot analysis deterministic and fast)
+    if (options.live) {
+        try {
+            const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
+            const { Fetcher } = await import('../crawler/fetcher.js');
+            const fetcher = new Fetcher({
+                rate: DEFAULTS.RATE_LIMIT,
+                proxyUrl: options.proxyUrl,
+                userAgent: options.userAgent
+            });
+            const robotsRes = await fetcher.fetch(robotsUrl, { maxBytes: 500000 });
+            if (typeof robotsRes.status === 'number' && robotsRes.status >= 200 && robotsRes.status < 300) {
+                const robotsParserModule = await import('robots-parser');
+                const robotsParser = robotsParserModule.default || robotsParserModule;
+                robots = robotsParser(robotsUrl, robotsRes.body);
+                if (context)
+                    context.emit({ type: 'info', message: `[analyze] Robots fetch took ${Date.now() - start}ms` });
+            }
+        }
+        catch {
+            // Fallback
+        }
+    }
+    // Data Acquisition
     if (options.live) {
-        crawlData = await runLiveCrawl(normalizedRoot, options);
+        const crawlStart = Date.now();
+        crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
+        if (context)
+            context.emit({ type: 'info', message: `[analyze] runLiveCrawl took ${Date.now() - crawlStart}ms` });
     }
     else {
         try {
-            crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
-        }
-        catch (error) {
-            const isNotFound = error.code === 'ENOENT' ||
-                error.message.includes('Crawl data not found') ||
-                error.message.includes('No completed snapshot found') ||
-                error.message.includes('not found in database');
-            if (isNotFound && !options.fromCrawl) {
-                console.log('No local crawl data found. Switching to live analysis mode...');
-                crawlData = await runLiveCrawl(normalizedRoot, options);
-            }
-            else {
-                throw error;
+            const loadStart = Date.now();
+            crawlData = await loadCrawlData(normalizedOrigin, options.snapshotId);
+            if (context)
+                context.emit({ type: 'debug', message: `[analyze] loadCrawlData took ${Date.now() - loadStart}ms` });
+            const allPages = Array.from(crawlData.pages);
+            crawlData.pages = allPages;
+            const exists = allPages.some(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
+            if (!exists) {
+                if (context)
+                    context.emit({ type: 'info', message: `URL ${normalizedTargetAbs} not found. Fetching live...` });
+                crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
             }
         }
+        catch (_error) {
+            if (context)
+                context.emit({ type: 'info', message: 'No local crawl data found. Switching to live...' });
+            crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
+        }
+    }
+    const snapshotId = crawlData.snapshotId;
+    const crawledAt = crawlData.crawledAt;
+    const pagesStart = Date.now();
+    const pages = analyzePages(normalizedTargetAbs, rootOrigin, crawlData.pages, robots, options);
+    if (context)
+        context.emit({ type: 'debug', message: `[analyze] analyzePages took ${Date.now() - pagesStart}ms` });
+    // Sync basic page analysis results back to graph nodes for persistence
+    for (const pageAnalysis of pages) {
+        const node = crawlData.graph.nodes.get(pageAnalysis.url);
+        if (node) {
+            node.soft404Score = pageAnalysis.soft404?.score;
+            node.wordCount = pageAnalysis.content.wordCount;
+            node.externalLinkRatio = pageAnalysis.links.externalRatio;
+            node.thinContentScore = pageAnalysis.thinScore;
+            node.title = pageAnalysis.title.value || undefined;
+        }
     }
-    // Run clustering if requested or as default
-    detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
-    const pages = analyzePages(normalizedRoot, crawlData.pages);
     const activeModules = {
         seo: !!options.seo,
         content: !!options.content,
         accessibility: !!options.accessibility
     };
     const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
-    const filteredPages = hasFilters
-        ? pages.map((page) => filterPageModules(page, activeModules))
-        : pages;
-    // Filter to only the requested URL
-    const targetPage = filteredPages.find(p => p.url === normalizedRoot);
-    const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);
+    const filteredPages = hasFilters ? pages.map((page) => filterPageModules(page, activeModules)) : pages;
+    const targetPage = filteredPages.find(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
+    let resultPages;
+    if (options.allPages) {
+        resultPages = filteredPages;
+    }
+    else {
+        resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
+    }
+    let clusters = [];
+    let duplicates = [];
+    let prResults = new Map();
+    let hitsResults = new Map();
+    let headingPayloads = {};
+    if (options.clustering) {
+        const clustering = new ClusteringService();
+        clusters = clustering.detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
+    }
+    if (options.allPages) {
+        const duplication = new DuplicateService();
+        duplicates = duplication.detectDuplicates(crawlData.graph, { collapse: false });
+    }
+    if (options.computePagerank) {
+        const prService = new PageRankService();
+        prResults = prService.evaluate(crawlData.graph);
+    }
+    if (options.computeHits) {
+        const hitsService = new HITSService();
+        hitsResults = hitsService.evaluate(crawlData.graph);
+    }
+    if (options.heading) {
+        const headingService = new HeadingHealthService();
+        const { payloadsByUrl } = headingService.evaluateNodes(crawlData.graph.getNodes());
+        headingPayloads = payloadsByUrl;
+    }
+    if (options.orphans) {
+        const edges = crawlData.graph.getEdges();
+        annotateOrphans(crawlData.graph.getNodes(), edges, {
+            enabled: true,
+            severityEnabled: !!options.orphanSeverity,
+            includeSoftOrphans: !!options.includeSoftOrphans,
+            minInbound: options.minInbound || 2,
+            rootUrl: normalizedOrigin
+        });
+    }
+    // Run HealthService when --health is enabled
+    let healthBreakdown;
+    if (options.health) {
+        const healthService = new HealthService();
+        const issues = healthService.collectCrawlIssues(crawlData.graph, crawlData.metrics, rootOrigin);
+        healthBreakdown = healthService.calculateHealthScore(crawlData.graph.nodes.size, issues);
+    }
+    // Update nodes in graph with results
+    for (const node of crawlData.graph.getNodes()) {
+        const pr = prResults.get(node.url);
+        if (pr)
+            node.pagerankScore = pr.score;
+        const hits = hitsResults.get(node.url);
+        if (hits) {
+            node.hubScore = hits.hub_score;
+            node.authScore = hits.authority_score;
+            node.linkRole = hits.link_role;
+        }
+        const heading = headingPayloads[node.url];
+        if (heading) {
+            node.headingScore = heading.score;
+            node.headingData = JSON.stringify(heading);
+        }
+    }
+    // Synchronize graph-level final scores back to PageAnalysis models
+    for (const page of pages) {
+        const node = crawlData.graph.nodes.get(page.url);
+        if (node) {
+            if (node.headingScore !== undefined)
+                page.headingScore = node.headingScore;
+            page.seoScore = scorePageSeo(page);
+        }
+    }
     const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
     const thinPages = pages.filter((page) => page.thinScore >= 70).length;
-    const siteScores = aggregateSiteScore(crawlData.metrics, pages);
-    return {
+    const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
+    if (context)
+        context.emit({ type: 'debug', message: `[analyze] Total analysis completed in ${Date.now() - start}ms` });
+    // Persist to Database
+    const db = getDb();
+    const metricsRepo = new MetricsRepository(db);
+    const pageRepo = new PageRepository(db);
+    // Efficiently map URLs to IDs for this snapshot
+    const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
+    const urlToIdMap = new Map(pagesIdentity.map(p => [p.normalized_url, p.id]));
+    const metricsToSave = crawlData.graph.getNodes().map(node => {
+        const pageId = urlToIdMap.get(node.url);
+        if (!pageId)
+            return null;
+        return {
+            snapshot_id: snapshotId,
+            page_id: pageId,
+            crawl_status: node.crawlStatus || null,
+            word_count: node.wordCount || null,
+            thin_content_score: node.thinContentScore || null,
+            external_link_ratio: node.externalLinkRatio || null,
+            pagerank_score: node.pagerankScore || null,
+            hub_score: node.hubScore || null,
+            auth_score: node.authScore || null,
+            link_role: node.linkRole || null,
+            duplicate_cluster_id: node.duplicateClusterId || null,
+            duplicate_type: node.duplicateType || null,
+            cluster_id: node.clusterId || null,
+            soft404_score: node.soft404Score || null,
+            heading_score: node.headingScore || null,
+            orphan_score: node.orphanScore || null,
+            orphan_type: node.orphanType || null,
+            impact_level: node.impactLevel || null,
+            heading_data: node.headingData || null,
+            is_cluster_primary: node.isClusterPrimary ? 1 : 0
+        };
+    }).filter(m => m !== null);
+    // Persist health score to snapshot if computed
+    if (healthBreakdown && snapshotId) {
+        const db2 = getDb();
+        const snapshotRepo = new SnapshotRepository(db2);
+        snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
+            health_score: healthBreakdown.score
+        });
+    }
+    metricsRepo.insertMany(metricsToSave);
+    const result = {
         site_summary: {
-            pages_analyzed: pages.length,
+            pages_analyzed: resultPages.length,
             avg_seo_score: siteScores.seoHealthScore,
             thin_pages: thinPages,
             duplicate_titles: duplicateTitles,
-            site_score: siteScores.overallScore
+            site_score: siteScores.overallScore,
+            site_score_breakdown: siteScores.breakdown
         },
         site_scores: siteScores,
         pages: resultPages,
         active_modules: activeModules,
-        clusters: crawlData.graph.contentClusters
+        snapshotId,
+        crawledAt,
+        clusters,
+        duplicates
     };
+    return result;
 }
-export function renderAnalysisHtml(result) {
-    if (result.pages.length === 1) {
-        return renderSinglePageHtml(result.pages[0]);
-    }
-    const rows = result.pages
-        .map((page) => `< tr > <td>${escapeHtml(page.url)} </td><td>${page.seoScore}</td > <td>${page.thinScore} </td><td>${page.title.status}</td > <td>${page.metaDescription.status} </td></tr > `)
-        .join('');
-    return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
-}
-function renderSinglePageHtml(page) {
-    return `<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Analysis for ${escapeHtml(page.url)}</title>
-    <style>
-        body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
-        h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
-        h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
-        .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
-        .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
-        .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
-        .status-ok { color: green; font-weight: bold; }
-        .status-warning { color: orange; font-weight: bold; }
-        .status-critical { color: red; font-weight: bold; }
-        .status-missing { color: red; font-weight: bold; }
-        .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
-        .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
-        .data-table th { width: 150px; color: #666; }
-        code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
-    </style>
-  </head>
-  <body>
-    <h1>Page Analysis</h1>
-    <p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
-    <div class="score-card">
-      <div class="score-box">
-        <div class="score-val">${page.seoScore}</div>
-        <div>SEO Score</div>
-      </div>
-      <div class="score-box">
-        <div class="score-val">${page.thinScore}</div>
-        <div>Thin Content Score</div>
-      </div>
-      <div class="score-box">
-        <div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
-        <div>HTTP Status</div>
-      </div>
-    </div>
-    <h2>Meta Tags</h2>
-    <table class="data-table">
-      <tr>
-        <th>Title</th>
-        <td>
-          <div>${escapeHtml(page.title.value || '(missing)')}</div>
-          <small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
-        </td>
-      </tr>
-      <tr>
-        <th>Description</th>
-        <td>
-          <div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
-          <small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
-        </td>
-      </tr>
-      <tr>
-        <th>Canonical</th>
-        <td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
-      </tr>
-      <tr>
-        <th>Robots</th>
-        <td>
-          Index: ${!page.meta.noindex},
-          Follow: ${!page.meta.nofollow}
-        </td>
-      </tr>
-    </table>
-    <h2>Content & Heading</h2>
-    <table class="data-table">
-      <tr>
-        <th>H1 Tag</th>
-        <td>
-          Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
-          (${page.h1.count} detected)
-          ${page.h1.matchesTitle ? ' | Matches Title' : ''}
-        </td>
-      </tr>
-      <tr>
-        <th>Word Count</th>
-        <td>${page.content.wordCount} words</td>
-      </tr>
-      <tr>
-        <th>Unique Sentences</th>
-        <td>${page.content.uniqueSentenceCount}</td>
-      </tr>
-      <tr>
-        <th>Text / HTML Ratio</th>
-        <td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
-      </tr>
-    </table>
-    <h2>Links & Images</h2>
-    <table class="data-table">
-      <tr>
-        <th>Internal Links</th>
-        <td>${page.links.internalLinks}</td>
-      </tr>
-      <tr>
-        <th>External Links</th>
-        <td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
-      </tr>
-      <tr>
-        <th>Images</th>
-        <td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
-      </tr>
-    </table>
-    <h2>Structured Data</h2>
-    <table class="data-table">
-      <tr>
-        <th>Status</th>
-        <td>
-          ${page.structuredData.present
-        ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
-        : 'Not detected'}
-        </td>
-      </tr>
-      ${page.structuredData.present ? `
-      <tr>
-          <th>Types Found</th>
-          <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
-      </tr>
-      ` : ''}
-    </table>
-  </body>
-</html>`;
-}
-export function renderAnalysisMarkdown(result) {
-    const summary = [
-        '# Crawlith SEO Analysis Report',
-        '',
-        '## 📊 Summary',
-        `- Pages Analyzed: ${result.site_summary.pages_analyzed}`,
-        `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`,
-        `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`,
-        `- Thin Pages Found: ${result.site_summary.thin_pages}`,
-        `- Duplicate Titles: ${result.site_summary.duplicate_titles}`,
-        '',
-        '## 📄 Page Details',
-        '',
-        '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
-        '| :--- | :--- | :--- | :--- | :--- |',
-    ];
-    result.pages.forEach((page) => {
-        summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
-    });
-    return summary.join('\n');
-}
-export function renderAnalysisCsv(result) {
-    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
-    const rows = result.pages.map((p) => {
-        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
-        return [
-            p.url,
-            p.seoScore,
-            p.thinScore,
-            statusStr,
-            `"${(p.title.value || '').replace(/"/g, '""')}"`,
-            p.title.length,
-            `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`,
-            p.metaDescription.length,
-            p.content.wordCount,
-            p.links.internalLinks,
-            p.links.externalLinks
-        ].join(',');
-    });
-    return [headers.join(','), ...rows].join('\n');
-}
-function escapeHtml(value) {
-    return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
-}
-function analyzePages(rootUrl, pages) {
-    const titleCandidates = pages.map((page) => analyzeTitle(page.html || ''));
-    const metaCandidates = pages.map((page) => analyzeMetaDescription(page.html || ''));
-    const titles = applyDuplicateStatuses(titleCandidates);
-    const metas = applyDuplicateStatuses(metaCandidates);
+export function analyzePages(targetUrl, rootOrigin, pages, robots, options = {}) {
+    const titleCounts = new Map();
+    const metaCounts = new Map();
     const sentenceCountFrequency = new Map();
-    const baseContent = pages.map((page) => analyzeContent(page.html || ''));
-    for (const item of baseContent) {
-        sentenceCountFrequency.set(item.uniqueSentenceCount, (sentenceCountFrequency.get(item.uniqueSentenceCount) || 0) + 1);
-    }
-    return pages.map((page, index) => {
+    const results = [];
+    const targetPath = UrlUtil.toPath(targetUrl, rootOrigin);
+    const targetAbs = UrlUtil.toAbsolute(targetUrl, rootOrigin);
+    for (const page of pages) {
+        const pagePath = UrlUtil.toPath(page.url, rootOrigin);
+        const pageAbs = UrlUtil.toAbsolute(page.url, rootOrigin);
+        const isTarget = page.url === targetUrl || pagePath === targetPath || pageAbs === targetAbs;
+        // In single-page mode, if it's not the target, we skip it entirely for speed.
+        if (!options.allPages && !isTarget)
+            continue;
         const html = page.html || '';
-        const title = titles[index];
-        const metaDescription = metas[index];
-        const h1 = analyzeH1(html, title.value);
-        const content = baseContent[index];
-        const duplicationScore = (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
-        const thinScore = calculateThinContentScore(content, duplicationScore);
-        const images = analyzeImageAlts(html);
-        const links = analyzeLinks(html, page.url, rootUrl);
-        const structuredData = analyzeStructuredData(html);
-        const analysis = {
+        const $ = load(html || '<html></html>');
+        // Reconstruct absolute URL from stored path for robots & link resolution
+        const pageAbsUrl = UrlUtil.toAbsolute(page.url, rootOrigin);
+        let crawlStatus = page.crawlStatus;
+        if (robots) {
+            const isBlocked = !robots.isAllowed(pageAbsUrl, 'crawlith') ||
+                (!pageAbsUrl.endsWith('/') && !robots.isAllowed(pageAbsUrl + '/', 'crawlith'));
+            if (isBlocked)
+                crawlStatus = 'blocked_by_robots';
+        }
+        // Shared DOM Analysis
+        const title = analyzeTitle($);
+        const metaDescription = analyzeMetaDescription($);
+        const h1 = analyzeH1($, title.value);
+        const content = analyzeContent($);
+        const images = analyzeImageAlts($);
+        const links = analyzeLinks($, pageAbsUrl, rootOrigin);
+        const structuredData = analyzeStructuredData($);
+        if (title.value) {
+            const key = title.value.trim().toLowerCase();
+            titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
+        }
+        if (metaDescription.value) {
+            const key = metaDescription.value.trim().toLowerCase();
+            metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
+        }
+        sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
+        const soft404Service = new Soft404Service();
+        const soft404 = soft404Service.analyze(html, links.externalLinks + links.internalLinks);
+        const isCanonicalConflict = !!(page.canonical && page.canonical !== page.url && page.canonical !== pageAbsUrl &&
+            page.canonical.replace(/\/$/, '') !== pageAbsUrl.replace(/\/$/, ''));
+        const resultPage = {
             url: page.url,
             status: page.status || 0,
             title,
             metaDescription,
             h1,
             content,
-            thinScore,
+            thinScore: 0,
             images,
             links,
             structuredData,
@@ -295,45 +344,50 @@ function analyzePages(rootUrl, pages) {
             meta: {
                 canonical: page.canonical,
                 noindex: page.noindex,
-                nofollow: page.nofollow
-            }
+                nofollow: page.nofollow,
+                crawlStatus,
+                canonicalConflict: isCanonicalConflict
+            },
+            soft404
         };
+        Object.defineProperty(resultPage, 'html', { value: html, enumerable: false });
+        results.push(resultPage);
+    }
+    for (const analysis of results) {
+        if (analysis.title.value) {
+            const key = analysis.title.value.trim().toLowerCase();
+            if ((titleCounts.get(key) || 0) > 1)
+                analysis.title.status = 'duplicate';
+        }
+        if (analysis.metaDescription.value) {
+            const key = analysis.metaDescription.value.trim().toLowerCase();
+            if ((metaCounts.get(key) || 0) > 1)
+                analysis.metaDescription.status = 'duplicate';
+        }
+        const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
+        analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
         analysis.seoScore = scorePageSeo(analysis);
-        return analysis;
-    });
+    }
+    return results;
 }
 function filterPageModules(page, modules) {
-    const keepSeo = modules.seo;
-    const keepContent = modules.content;
-    const keepAccessibility = modules.accessibility;
-    return {
+    const filtered = {
         ...page,
-        title: keepSeo ? page.title : { value: null, length: 0, status: 'missing' },
-        metaDescription: keepSeo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
-        h1: (keepSeo || keepContent) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false },
-        links: keepSeo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
-        structuredData: keepSeo ? page.structuredData : { present: false, valid: false, types: [] },
-        content: keepContent ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
-        thinScore: keepContent ? page.thinScore : 0,
-        images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
+        title: modules.seo ? page.title : { value: null, length: 0, status: 'missing' },
+        metaDescription: modules.seo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
+        h1: (modules.seo || modules.content) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false, value: null },
+        links: modules.seo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
+        structuredData: modules.seo ? page.structuredData : { present: false, valid: false, types: [] },
+        content: modules.content ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
+        thinScore: modules.content ? page.thinScore : 0,
+        images: modules.accessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
     };
-}
-async function loadCrawlData(rootUrl, fromCrawl) {
-    // If fromCrawl is provided, we could theoretically load JSON, but
-    // we now default to DB fetching for all operations.
-    if (fromCrawl) {
-        try {
-            const content = await fs.readFile(fromCrawl, 'utf-8');
-            const raw = JSON.parse(content);
-            const pages = parsePages(raw);
-            const graph = graphFromPages(rootUrl, pages, raw);
-            const metrics = calculateMetrics(graph, 5);
-            return { pages, metrics, graph };
-        }
-        catch (_e) {
-            // Fallback downwards if file doesn't exist
-        }
+    if (page.html) {
+        Object.defineProperty(filtered, 'html', { value: page.html, enumerable: false });
     }
+    return filtered;
+}
+async function loadCrawlData(rootUrl, snapshotId) {
     const db = getDb();
     const siteRepo = new SiteRepository(db);
     const snapshotRepo = new SnapshotRepository(db);
@@ -341,77 +395,43 @@ async function loadCrawlData(rootUrl, fromCrawl) {
     const urlObj = new URL(rootUrl);
     const domain = urlObj.hostname.replace('www.', '');
     const site = siteRepo.firstOrCreateSite(domain);
-    const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
+    let snapshot = null;
+    if (snapshotId) {
+        snapshot = snapshotRepo.getSnapshot(snapshotId);
+    }
     if (!snapshot) {
-        throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
+        for (const candidate of UrlUtil.toLookupCandidates(rootUrl, `${urlObj.protocol}//${urlObj.host}`)) {
+            const page = pageRepo.getPage(site.id, candidate);
+            if (page?.last_seen_snapshot_id) {
+                snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
+                break;
+            }
+        }
     }
+    if (!snapshot)
+        snapshot = snapshotRepo.getLatestSnapshot(site.id);
+    if (!snapshot)
+        throw new Error(`No crawl data found for ${rootUrl}`);
     const graph = loadGraphFromSnapshot(snapshot.id);
     const metrics = calculateMetrics(graph, 5);
-    // We also need the `pages` array for analysis.
-    // It needs `html` which might not be fully available unless we look up from the DB or Graph.
-    // Wait, the Graph stores Node which doesn't contain HTML since we removed it from memory?
-    // Actually, `loadGraphFromSnapshot` does NOT load actual raw HTML from nodes to save memory.
-    // We need HTML for `analyzeSite` module! So we must fetch it from `pageRepo`.
-    const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
-    const pages = dbPages.map((p) => ({
-        url: p.normalized_url,
-        status: p.http_status || 0,
-        html: p.html || '',
-        depth: p.depth || 0
-    }));
-    return { pages, metrics, graph };
-}
-function parsePages(raw) {
-    if (Array.isArray(raw.pages)) {
-        return raw.pages.map((page) => {
-            const p = page;
-            return {
-                url: String(p.url || ''),
-                status: Number(p.status || 0),
-                html: typeof p.html === 'string' ? p.html : '',
-                depth: Number(p.depth || 0)
-            };
-        }).filter((page) => Boolean(page.url));
-    }
-    if (Array.isArray(raw.nodes)) {
-        return raw.nodes.map((node) => {
-            const n = node;
-            return {
-                url: String(n.url || ''),
-                status: Number(n.status || 0),
-                html: typeof n.html === 'string' ? n.html : '',
-                depth: Number(n.depth || 0)
+    const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
+    const pagesGenerator = function* () {
+        for (const p of dbPagesIterator) {
+            yield {
+                url: p.normalized_url,
+                status: p.http_status || 0,
+                html: p.html || '',
+                depth: p.depth || 0,
+                canonical: p.canonical_url || undefined,
+                noindex: !!p.noindex,
+                nofollow: !!p.nofollow,
+                crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
             };
-        }).filter((page) => Boolean(page.url));
-    }
-    return [];
-}
-function graphFromPages(rootUrl, pages, raw) {
-    const graph = new Graph();
-    for (const page of pages) {
-        graph.addNode(page.url, page.depth || 0, page.status || 0);
-    }
-    if (Array.isArray(raw.edges)) {
-        for (const edge of raw.edges) {
-            const e = edge;
-            if (typeof e.source === 'string' && typeof e.target === 'string') {
-                graph.addNode(e.source, 0, 0);
-                graph.addNode(e.target, 0, 0);
-                graph.addEdge(e.source, e.target);
-            }
         }
-        return graph;
-    }
-    for (const page of pages) {
-        if (!page.html)
-            continue;
-        const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
-        if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0)
-            continue;
-    }
-    return graph;
+    };
+    return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
 }
-async function runLiveCrawl(url, options) {
+async function runLiveCrawl(url, origin, options, context, robots) {
     const snapshotId = await crawl(url, {
         limit: 1,
         depth: 0,
@@ -419,18 +439,46 @@ async function runLiveCrawl(url, options) {
         proxyUrl: options.proxyUrl,
         userAgent: options.userAgent,
         maxRedirects: options.maxRedirects,
-        debug: options.debug
-    });
+        debug: options.debug,
+        snapshotRunType: 'single',
+        robots,
+        sitemap: options.sitemap,
+        plugins: options.plugins
+    }, context);
     const graph = loadGraphFromSnapshot(snapshotId);
     const pages = graph.getNodes().map((node) => ({
         url: node.url,
         status: node.status,
-        html: node.html || '', // Include HTML
-        depth: node.depth
+        html: node.html || '',
+        depth: node.depth,
+        crawlStatus: node.crawlStatus
     }));
-    return {
-        pages,
-        metrics: calculateMetrics(graph, 1),
-        graph
-    };
+    return { pages, metrics: calculateMetrics(graph, 1), graph, snapshotId };
+}
+export function escapeHtml(value) {
+    return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
+}
+export function renderAnalysisHtml(result) {
+    if (result.pages.length === 1)
+        return renderSinglePageHtml(result.pages[0]);
+    const rows = result.pages.map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`).join('');
+    return ANALYSIS_LIST_TEMPLATE.replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString()).replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString()).replace('{{ROWS}}', rows);
+}
+function renderSinglePageHtml(page) {
+    const structuredDataStatus = page.structuredData.present ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>') : 'Not detected';
+    const structuredDataTypesRow = page.structuredData.present ? `<tr><th>Types Found</th><td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td></tr>` : '';
+    return ANALYSIS_PAGE_TEMPLATE.replaceAll('{{URL}}', escapeHtml(page.url)).replace('{{SEO_SCORE}}', page.seoScore.toString()).replace('{{THIN_SCORE}}', page.thinScore.toString()).replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString()).replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)')).replace('{{TITLE_LENGTH}}', page.title.length.toString()).replaceAll('{{TITLE_STATUS}}', page.title.status).replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)')).replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString()).replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status).replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>').replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString()).replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString()).replaceAll('{{H1_STATUS}}', page.h1.status).replace('{{H1_COUNT}}', page.h1.count.toString()).replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '').replace('{{WORD_COUNT}}', page.content.wordCount.toString()).replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString()).replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2)).replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString()).replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString()).replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1)).replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString()).replace('{{MISSING_ALT}}', page.images.missingAlt.toString()).replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus).replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
+}
+export function renderAnalysisMarkdown(result) {
+    const summary = ['# Crawlith SEO Analysis Report', '', '## 📊 Summary', `- Pages Analyzed: ${result.site_summary.pages_analyzed}`, `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`, `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`, `- Thin Pages Found: ${result.site_summary.thin_pages}`, `- Duplicate Titles: ${result.site_summary.duplicate_titles}`, '', '## 📄 Page Details', '', '| URL | SEO Score | Thin Score | Title Status | Meta Status | Canonical |', '| :--- | :--- | :--- | :--- | :--- | :--- |'];
+    result.pages.forEach((page) => summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} | ${page.meta.canonical || '-'} |`));
+    return summary.join('\n');
+}
+export function renderAnalysisCsv(result) {
+    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links', 'Canonical'];
+    const rows = result.pages.map((p) => {
+        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
+        return [p.url, p.seoScore, p.thinScore, statusStr, `"${(p.title.value || '').replace(/"/g, '""')}"`, p.title.length, `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`, p.metaDescription.length, p.content.wordCount, p.links.internalLinks, p.links.externalLinks, p.meta.canonical || ''].join(',');
+    });
+    return [headers.join(','), ...rows].join('\n');
 }