npm - @crawlith/core - Versions diffs - 0.1.1 → 0.1.2 - Mend

@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237)

package/LICENSE +201 -0
package/README.md +70 -0
package/dist/analysis/analyze.d.ts +29 -8
package/dist/analysis/analyze.js +325 -221
package/dist/analysis/clustering.d.ts +23 -0
package/dist/analysis/clustering.js +206 -0
package/dist/analysis/content.d.ts +1 -1
package/dist/analysis/content.js +11 -5
package/dist/analysis/duplicate.d.ts +34 -0
package/dist/analysis/duplicate.js +305 -0
package/dist/analysis/heading.d.ts +116 -0
package/dist/analysis/heading.js +356 -0
package/dist/analysis/images.d.ts +1 -1
package/dist/analysis/images.js +6 -5
package/dist/analysis/links.d.ts +1 -1
package/dist/analysis/links.js +8 -8
package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
package/dist/analysis/scoring.js +4 -1
package/dist/analysis/seo.d.ts +8 -4
package/dist/analysis/seo.js +41 -30
package/dist/analysis/soft404.d.ts +17 -0
package/dist/analysis/soft404.js +62 -0
package/dist/analysis/structuredData.d.ts +1 -1
package/dist/analysis/structuredData.js +5 -4
package/dist/application/index.d.ts +2 -0
package/dist/application/index.js +2 -0
package/dist/application/usecase.d.ts +3 -0
package/dist/application/usecase.js +1 -0
package/dist/application/usecases.d.ts +114 -0
package/dist/application/usecases.js +201 -0
package/dist/audit/index.js +1 -1
package/dist/audit/transport.d.ts +1 -1
package/dist/audit/transport.js +5 -4
package/dist/audit/types.d.ts +1 -0
package/dist/constants.d.ts +17 -0
package/dist/constants.js +23 -0
package/dist/core/scope/scopeManager.js +3 -0
package/dist/crawler/crawl.d.ts +2 -2
package/dist/crawler/crawler.d.ts +17 -5
package/dist/crawler/crawler.js +259 -94
package/dist/crawler/fetcher.d.ts +1 -1
package/dist/crawler/fetcher.js +6 -6
package/dist/crawler/metricsRunner.d.ts +21 -1
package/dist/crawler/metricsRunner.js +181 -60
package/dist/crawler/normalize.d.ts +41 -0
package/dist/crawler/normalize.js +119 -3
package/dist/crawler/parser.d.ts +1 -3
package/dist/crawler/parser.js +2 -49
package/dist/crawler/resolver.d.ts +11 -0
package/dist/crawler/resolver.js +67 -0
package/dist/crawler/sitemap.d.ts +4 -1
package/dist/crawler/sitemap.js +24 -18
package/dist/crawler/trap.d.ts +5 -1
package/dist/crawler/trap.js +23 -2
package/dist/db/CrawlithDB.d.ts +110 -0
package/dist/db/CrawlithDB.js +500 -0
package/dist/db/graphLoader.js +15 -32
package/dist/db/index.d.ts +9 -1
package/dist/db/index.js +39 -31
package/dist/db/migrations.d.ts +2 -0
package/dist/db/{schema.js → migrations.js} +90 -43
package/dist/db/pluginRegistry.d.ts +9 -0
package/dist/db/pluginRegistry.js +19 -0
package/dist/db/repositories/EdgeRepository.d.ts +5 -0
package/dist/db/repositories/EdgeRepository.js +7 -0
package/dist/db/repositories/MetricsRepository.d.ts +13 -8
package/dist/db/repositories/MetricsRepository.js +14 -6
package/dist/db/repositories/PageRepository.d.ts +5 -3
package/dist/db/repositories/PageRepository.js +68 -17
package/dist/db/repositories/SiteRepository.d.ts +6 -0
package/dist/db/repositories/SiteRepository.js +4 -0
package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
package/dist/db/repositories/SnapshotRepository.js +48 -10
package/dist/db/reset.d.ts +9 -0
package/dist/db/reset.js +32 -0
package/dist/db/statements.d.ts +12 -0
package/dist/db/statements.js +40 -0
package/dist/diff/compare.d.ts +0 -5
package/dist/diff/compare.js +0 -12
package/dist/diff/service.d.ts +16 -0
package/dist/diff/service.js +41 -0
package/dist/domain/index.d.ts +4 -0
package/dist/domain/index.js +4 -0
package/dist/events.d.ts +8 -0
package/dist/graph/graph.d.ts +20 -42
package/dist/graph/graph.js +12 -16
package/dist/graph/hits.d.ts +23 -0
package/dist/graph/hits.js +111 -0
package/dist/graph/metrics.d.ts +0 -4
package/dist/graph/metrics.js +19 -15
package/dist/graph/pagerank.d.ts +17 -4
package/dist/graph/pagerank.js +126 -93
package/dist/index.d.ts +27 -9
package/dist/index.js +27 -9
package/dist/lock/lockManager.d.ts +1 -0
package/dist/lock/lockManager.js +15 -0
package/dist/plugin-system/plugin-cli.d.ts +10 -0
package/dist/plugin-system/plugin-cli.js +31 -0
package/dist/plugin-system/plugin-config.d.ts +16 -0
package/dist/plugin-system/plugin-config.js +36 -0
package/dist/plugin-system/plugin-loader.d.ts +17 -0
package/dist/plugin-system/plugin-loader.js +122 -0
package/dist/plugin-system/plugin-registry.d.ts +25 -0
package/dist/plugin-system/plugin-registry.js +167 -0
package/dist/plugin-system/plugin-types.d.ts +205 -0
package/dist/plugin-system/plugin-types.js +1 -0
package/dist/ports/index.d.ts +9 -0
package/dist/ports/index.js +1 -0
package/dist/report/export.d.ts +3 -0
package/dist/report/export.js +81 -0
package/dist/report/insight.d.ts +27 -0
package/dist/report/insight.js +103 -0
package/dist/scoring/health.d.ts +17 -11
package/dist/scoring/health.js +183 -140
package/dist/utils/chalk.d.ts +6 -0
package/dist/utils/chalk.js +41 -0
package/dist/utils/secureConfig.d.ts +23 -0
package/dist/utils/secureConfig.js +128 -0
package/package.json +10 -4
package/CHANGELOG.md +0 -13
package/dist/db/schema.d.ts +0 -2
package/dist/graph/cluster.d.ts +0 -6
package/dist/graph/cluster.js +0 -221
package/dist/graph/duplicate.d.ts +0 -10
package/dist/graph/duplicate.js +0 -302
package/dist/scoring/hits.d.ts +0 -10
package/dist/scoring/hits.js +0 -131
package/scripts/copy-assets.js +0 -37
package/src/analysis/analysis_list.html +0 -35
package/src/analysis/analysis_page.html +0 -123
package/src/analysis/analyze.ts +0 -505
package/src/analysis/content.ts +0 -62
package/src/analysis/images.ts +0 -28
package/src/analysis/links.ts +0 -41
package/src/analysis/scoring.ts +0 -66
package/src/analysis/seo.ts +0 -82
package/src/analysis/structuredData.ts +0 -62
package/src/analysis/templates.ts +0 -9
package/src/audit/dns.ts +0 -49
package/src/audit/headers.ts +0 -98
package/src/audit/index.ts +0 -66
package/src/audit/scoring.ts +0 -232
package/src/audit/transport.ts +0 -258
package/src/audit/types.ts +0 -102
package/src/core/network/proxyAdapter.ts +0 -21
package/src/core/network/rateLimiter.ts +0 -39
package/src/core/network/redirectController.ts +0 -47
package/src/core/network/responseLimiter.ts +0 -34
package/src/core/network/retryPolicy.ts +0 -57
package/src/core/scope/domainFilter.ts +0 -45
package/src/core/scope/scopeManager.ts +0 -52
package/src/core/scope/subdomainPolicy.ts +0 -39
package/src/core/security/ipGuard.ts +0 -171
package/src/crawler/crawl.ts +0 -9
package/src/crawler/crawler.ts +0 -601
package/src/crawler/extract.ts +0 -39
package/src/crawler/fetcher.ts +0 -251
package/src/crawler/metricsRunner.ts +0 -137
package/src/crawler/normalize.ts +0 -108
package/src/crawler/parser.ts +0 -190
package/src/crawler/sitemap.ts +0 -76
package/src/crawler/trap.ts +0 -96
package/src/db/graphLoader.ts +0 -135
package/src/db/index.ts +0 -75
package/src/db/repositories/EdgeRepository.ts +0 -43
package/src/db/repositories/MetricsRepository.ts +0 -63
package/src/db/repositories/PageRepository.ts +0 -228
package/src/db/repositories/SiteRepository.ts +0 -43
package/src/db/repositories/SnapshotRepository.ts +0 -99
package/src/db/schema.ts +0 -177
package/src/diff/compare.ts +0 -84
package/src/events.ts +0 -16
package/src/graph/cluster.ts +0 -246
package/src/graph/duplicate.ts +0 -350
package/src/graph/graph.ts +0 -192
package/src/graph/metrics.ts +0 -125
package/src/graph/pagerank.ts +0 -126
package/src/graph/simhash.ts +0 -76
package/src/index.ts +0 -33
package/src/lock/hashKey.ts +0 -51
package/src/lock/lockManager.ts +0 -132
package/src/lock/pidCheck.ts +0 -13
package/src/report/crawl.html +0 -879
package/src/report/crawlExport.ts +0 -58
package/src/report/crawl_template.ts +0 -9
package/src/report/html.ts +0 -27
package/src/scoring/health.ts +0 -241
package/src/scoring/hits.ts +0 -153
package/src/scoring/orphanSeverity.ts +0 -176
package/src/utils/version.ts +0 -18
package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
package/tests/analysis.unit.test.ts +0 -142
package/tests/analyze.integration.test.ts +0 -133
package/tests/analyze_markdown.test.ts +0 -98
package/tests/audit/audit.test.ts +0 -101
package/tests/audit/dns.test.ts +0 -31
package/tests/audit/headers.test.ts +0 -45
package/tests/audit/scoring.test.ts +0 -133
package/tests/audit/security.test.ts +0 -12
package/tests/audit/transport.test.ts +0 -111
package/tests/clustering.test.ts +0 -118
package/tests/clustering_risk.test.ts +0 -118
package/tests/crawler.test.ts +0 -364
package/tests/db/index.test.ts +0 -134
package/tests/db/repositories.test.ts +0 -115
package/tests/db.test.ts +0 -159
package/tests/db_repos.test.ts +0 -72
package/tests/diff.test.ts +0 -67
package/tests/duplicate.test.ts +0 -110
package/tests/extract.test.ts +0 -86
package/tests/fetcher.test.ts +0 -110
package/tests/fetcher_safety.test.ts +0 -91
package/tests/fixtures/analyze-crawl.json +0 -26
package/tests/graph/graph.test.ts +0 -100
package/tests/graphLoader.test.ts +0 -124
package/tests/hits.test.ts +0 -134
package/tests/html_report.test.ts +0 -59
package/tests/ipGuard.test.ts +0 -73
package/tests/lock/lockManager.test.ts +0 -198
package/tests/metrics.test.ts +0 -196
package/tests/normalize.test.ts +0 -88
package/tests/orphanSeverity.test.ts +0 -160
package/tests/pagerank.test.ts +0 -98
package/tests/parser.test.ts +0 -117
package/tests/proxy_safety.test.ts +0 -57
package/tests/redirect_safety.test.ts +0 -77
package/tests/renderAnalysisCsv.test.ts +0 -183
package/tests/safety.test.ts +0 -126
package/tests/scope.test.ts +0 -84
package/tests/scoring.test.ts +0 -60
package/tests/sitemap.test.ts +0 -100
package/tests/soft404.test.ts +0 -41
package/tests/ssrf_fix.test.ts +0 -69
package/tests/trap.test.ts +0 -39
package/tests/visualization_data.test.ts +0 -46
package/tsconfig.json +0 -11

package/dist/analysis/analyze.js CHANGED Viewed

@@ -1,6 +1,9 @@
+import { load } from 'cheerio';
 import { crawl } from '../crawler/crawl.js';
+import { UrlResolver } from '../crawler/resolver.js';
+import { Fetcher } from '../crawler/fetcher.js';
 import { loadGraphFromSnapshot } from '../db/graphLoader.js';
-import { normalizeUrl } from '../crawler/normalize.js';
+import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
 import { calculateMetrics } from '../graph/metrics.js';
 import { analyzeContent, calculateThinContentScore } from './content.js';
 import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
@@ -8,95 +11,135 @@ import { analyzeImageAlts } from './images.js';
 import { analyzeLinks } from './links.js';
 import { analyzeStructuredData } from './structuredData.js';
 import { aggregateSiteScore, scorePageSeo } from './scoring.js';
-import { detectContentClusters } from '../graph/cluster.js';
+import { ClusteringService } from './clustering.js';
+import { DuplicateService } from './duplicate.js';
+import { Soft404Service } from './soft404.js';
 import { getDb } from '../db/index.js';
 import { SiteRepository } from '../db/repositories/SiteRepository.js';
 import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
 import { PageRepository } from '../db/repositories/PageRepository.js';
+import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
 import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
+import { DEFAULTS } from '../constants.js';
+import { PageRankService } from '../graph/pagerank.js';
+import { HITSService } from '../graph/hits.js';
+import { HeadingHealthService } from './heading.js';
+import { annotateOrphans } from './orphan.js';
+import { HealthService } from '../scoring/health.js';
 /**
  * Analyzes a site for SEO, content, and accessibility.
  * Supports live crawling or loading from a database snapshot.
- * Note: File-based data loading is not supported.
- *
- * @param url The root URL to analyze
- * @param options Analysis options
- * @param context Engine context for event emission
  */
 export async function analyzeSite(url, options, context) {
-    const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
-    if (!normalizedRoot) {
+    // 1. Parse siteOrigin (e.g. https://example.com) and targetPath (e.g. /stats) from the URL.
+    //    We resolve the *origin* — not the full page URL — so rootOrigin is always just the
+    //    scheme+host and normalizedPath is always the pathname.
+    let parsedUrl = null;
+    try {
+        parsedUrl = new URL(url);
+    }
+    catch { /* bare domain fallback below */ }
+    const inputFullUrl = parsedUrl ? parsedUrl.toString() : (url.startsWith('http') ? url : `https://${url}`);
+    const inputOrigin = parsedUrl ? `${parsedUrl.protocol}//${parsedUrl.host}` : url;
+    let rootOrigin = inputOrigin;
+    if (options.live !== false) {
+        const resolver = new UrlResolver();
+        const fetcher = new Fetcher({ rate: options.rate, proxyUrl: options.proxyUrl, userAgent: options.userAgent });
+        try {
+            const resolved = await resolver.resolve(inputOrigin, fetcher);
+            rootOrigin = resolved.url;
+        }
+        catch {
+            // Fallback to basic normalization if resolution fails
+        }
+    }
+    // Normalize origin and target URL independently.
+    const normalizedOrigin = normalizeUrl(rootOrigin, '', { stripQuery: false });
+    if (!normalizedOrigin) {
         throw new Error('Invalid URL for analysis');
     }
+    const normalizedTargetAbs = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false }) || inputFullUrl;
+    const normalizedPath = normalizeUrl(inputFullUrl, rootOrigin, { stripQuery: false, toPath: true })
+        || UrlUtil.toPath(normalizedTargetAbs, rootOrigin);
+    const start = Date.now();
     let crawlData;
     let robots = null;
-    // Always try to fetch robots.txt for the analysis session
-    // to ensure we have the latest rules for visibility reporting.
-    try {
-        const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
-        const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
-        const status = robotsRes.status;
-        if (typeof status === 'number' && status >= 200 && status < 300) {
-            const robotsParserModule = await import('robots-parser');
-            const robotsParser = robotsParserModule.default || robotsParserModule;
-            robots = robotsParser(robotsUrl, robotsRes.body);
+    // 1. Robots fetch (live-mode only to keep snapshot analysis deterministic and fast)
+    if (options.live) {
+        try {
+            const robotsUrl = new URL('/robots.txt', rootOrigin).toString();
+            const { Fetcher } = await import('../crawler/fetcher.js');
+            const fetcher = new Fetcher({
+                rate: DEFAULTS.RATE_LIMIT,
+                proxyUrl: options.proxyUrl,
+                userAgent: options.userAgent
+            });
+            const robotsRes = await fetcher.fetch(robotsUrl, { maxBytes: 500000 });
+            if (typeof robotsRes.status === 'number' && robotsRes.status >= 200 && robotsRes.status < 300) {
+                const robotsParserModule = await import('robots-parser');
+                const robotsParser = robotsParserModule.default || robotsParserModule;
+                robots = robotsParser(robotsUrl, robotsRes.body);
+                if (context)
+                    context.emit({ type: 'info', message: `[analyze] Robots fetch took ${Date.now() - start}ms` });
+            }
+        }
+        catch {
+            // Fallback
         }
     }
-    catch {
-        // Silence robots fetch errors, fallback to existing or none
-    }
+    // Data Acquisition
     if (options.live) {
-        crawlData = await runLiveCrawl(normalizedRoot, options, context);
+        const crawlStart = Date.now();
+        crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
+        if (context)
+            context.emit({ type: 'info', message: `[analyze] runLiveCrawl took ${Date.now() - crawlStart}ms` });
     }
     else {
         try {
-            crawlData = await loadCrawlData(normalizedRoot);
-            // Convert generator to array so it can be reused multiple times
+            const loadStart = Date.now();
+            crawlData = await loadCrawlData(normalizedOrigin, options.snapshotId);
+            if (context)
+                context.emit({ type: 'debug', message: `[analyze] loadCrawlData took ${Date.now() - loadStart}ms` });
             const allPages = Array.from(crawlData.pages);
             crawlData.pages = allPages;
-            // Check if the requested URL actually exists in this snapshot
-            const exists = allPages.some(p => p.url === normalizedRoot);
+            const exists = allPages.some(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
             if (!exists) {
-                options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
-                if (context) {
-                    context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
-                }
-                crawlData = await runLiveCrawl(normalizedRoot, options, context);
+                if (context)
+                    context.emit({ type: 'info', message: `URL ${normalizedTargetAbs} not found. Fetching live...` });
+                crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
             }
         }
-        catch (error) {
-            const isNotFound = error.code === 'ENOENT' ||
-                error.message.includes('Crawl data not found') ||
-                error.message.includes('No completed snapshot found') ||
-                error.message.includes('not found in database');
-            if (isNotFound) {
-                options.live = true; // Force live mode
-                if (context) {
-                    context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
-                }
-                crawlData = await runLiveCrawl(normalizedRoot, options, context);
-            }
-            else {
-                throw error;
-            }
+        catch (_error) {
+            if (context)
+                context.emit({ type: 'info', message: 'No local crawl data found. Switching to live...' });
+            crawlData = await runLiveCrawl(normalizedTargetAbs, rootOrigin, options, context, robots);
         }
     }
     const snapshotId = crawlData.snapshotId;
     const crawledAt = crawlData.crawledAt;
-    // Run clustering if requested or as default
-    detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
-    const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
+    const pagesStart = Date.now();
+    const pages = analyzePages(normalizedTargetAbs, rootOrigin, crawlData.pages, robots, options);
+    if (context)
+        context.emit({ type: 'debug', message: `[analyze] analyzePages took ${Date.now() - pagesStart}ms` });
+    // Sync basic page analysis results back to graph nodes for persistence
+    for (const pageAnalysis of pages) {
+        const node = crawlData.graph.nodes.get(pageAnalysis.url);
+        if (node) {
+            node.soft404Score = pageAnalysis.soft404?.score;
+            node.wordCount = pageAnalysis.content.wordCount;
+            node.externalLinkRatio = pageAnalysis.links.externalRatio;
+            node.thinContentScore = pageAnalysis.thinScore;
+            node.title = pageAnalysis.title.value || undefined;
+        }
+    }
     const activeModules = {
         seo: !!options.seo,
         content: !!options.content,
         accessibility: !!options.accessibility
     };
     const hasFilters = activeModules.seo || activeModules.content || activeModules.accessibility;
-    const filteredPages = hasFilters
-        ? pages.map((page) => filterPageModules(page, activeModules))
-        : pages;
-    // Filter to only the requested URL
-    const targetPage = filteredPages.find(p => p.url === normalizedRoot);
+    const filteredPages = hasFilters ? pages.map((page) => filterPageModules(page, activeModules)) : pages;
+    const targetPage = filteredPages.find(p => p.url === normalizedPath || p.url === normalizedTargetAbs);
     let resultPages;
     if (options.allPages) {
         resultPages = filteredPages;
@@ -104,215 +147,247 @@ export async function analyzeSite(url, options, context) {
     else {
         resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
     }
+    let clusters = [];
+    let duplicates = [];
+    let prResults = new Map();
+    let hitsResults = new Map();
+    let headingPayloads = {};
+    if (options.clustering) {
+        const clustering = new ClusteringService();
+        clusters = clustering.detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
+    }
+    if (options.allPages) {
+        const duplication = new DuplicateService();
+        duplicates = duplication.detectDuplicates(crawlData.graph, { collapse: false });
+    }
+    if (options.computePagerank) {
+        const prService = new PageRankService();
+        prResults = prService.evaluate(crawlData.graph);
+    }
+    if (options.computeHits) {
+        const hitsService = new HITSService();
+        hitsResults = hitsService.evaluate(crawlData.graph);
+    }
+    if (options.heading) {
+        const headingService = new HeadingHealthService();
+        const { payloadsByUrl } = headingService.evaluateNodes(crawlData.graph.getNodes());
+        headingPayloads = payloadsByUrl;
+    }
+    if (options.orphans) {
+        const edges = crawlData.graph.getEdges();
+        annotateOrphans(crawlData.graph.getNodes(), edges, {
+            enabled: true,
+            severityEnabled: !!options.orphanSeverity,
+            includeSoftOrphans: !!options.includeSoftOrphans,
+            minInbound: options.minInbound || 2,
+            rootUrl: normalizedOrigin
+        });
+    }
+    // Run HealthService when --health is enabled
+    let healthBreakdown;
+    if (options.health) {
+        const healthService = new HealthService();
+        const issues = healthService.collectCrawlIssues(crawlData.graph, crawlData.metrics, rootOrigin);
+        healthBreakdown = healthService.calculateHealthScore(crawlData.graph.nodes.size, issues);
+    }
+    // Update nodes in graph with results
+    for (const node of crawlData.graph.getNodes()) {
+        const pr = prResults.get(node.url);
+        if (pr)
+            node.pagerankScore = pr.score;
+        const hits = hitsResults.get(node.url);
+        if (hits) {
+            node.hubScore = hits.hub_score;
+            node.authScore = hits.authority_score;
+            node.linkRole = hits.link_role;
+        }
+        const heading = headingPayloads[node.url];
+        if (heading) {
+            node.headingScore = heading.score;
+            node.headingData = JSON.stringify(heading);
+        }
+    }
+    // Synchronize graph-level final scores back to PageAnalysis models
+    for (const page of pages) {
+        const node = crawlData.graph.nodes.get(page.url);
+        if (node) {
+            if (node.headingScore !== undefined)
+                page.headingScore = node.headingScore;
+            page.seoScore = scorePageSeo(page);
+        }
+    }
     const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
     const thinPages = pages.filter((page) => page.thinScore >= 70).length;
     const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
-    return {
+    if (context)
+        context.emit({ type: 'debug', message: `[analyze] Total analysis completed in ${Date.now() - start}ms` });
+    // Persist to Database
+    const db = getDb();
+    const metricsRepo = new MetricsRepository(db);
+    const pageRepo = new PageRepository(db);
+    // Efficiently map URLs to IDs for this snapshot
+    const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
+    const urlToIdMap = new Map(pagesIdentity.map(p => [p.normalized_url, p.id]));
+    const metricsToSave = crawlData.graph.getNodes().map(node => {
+        const pageId = urlToIdMap.get(node.url);
+        if (!pageId)
+            return null;
+        return {
+            snapshot_id: snapshotId,
+            page_id: pageId,
+            crawl_status: node.crawlStatus || null,
+            word_count: node.wordCount || null,
+            thin_content_score: node.thinContentScore || null,
+            external_link_ratio: node.externalLinkRatio || null,
+            pagerank_score: node.pagerankScore || null,
+            hub_score: node.hubScore || null,
+            auth_score: node.authScore || null,
+            link_role: node.linkRole || null,
+            duplicate_cluster_id: node.duplicateClusterId || null,
+            duplicate_type: node.duplicateType || null,
+            cluster_id: node.clusterId || null,
+            soft404_score: node.soft404Score || null,
+            heading_score: node.headingScore || null,
+            orphan_score: node.orphanScore || null,
+            orphan_type: node.orphanType || null,
+            impact_level: node.impactLevel || null,
+            heading_data: node.headingData || null,
+            is_cluster_primary: node.isClusterPrimary ? 1 : 0
+        };
+    }).filter(m => m !== null);
+    // Persist health score to snapshot if computed
+    if (healthBreakdown && snapshotId) {
+        const db2 = getDb();
+        const snapshotRepo = new SnapshotRepository(db2);
+        snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
+            health_score: healthBreakdown.score
+        });
+    }
+    metricsRepo.insertMany(metricsToSave);
+    const result = {
         site_summary: {
             pages_analyzed: resultPages.length,
             avg_seo_score: siteScores.seoHealthScore,
             thin_pages: thinPages,
             duplicate_titles: duplicateTitles,
-            site_score: siteScores.overallScore
+            site_score: siteScores.overallScore,
+            site_score_breakdown: siteScores.breakdown
         },
         site_scores: siteScores,
         pages: resultPages,
         active_modules: activeModules,
-        clusters: crawlData.graph.contentClusters,
         snapshotId,
-        crawledAt
+        crawledAt,
+        clusters,
+        duplicates
     };
+    return result;
 }
-export function renderAnalysisHtml(result) {
-    if (result.pages.length === 1) {
-        return renderSinglePageHtml(result.pages[0]);
-    }
-    const rows = result.pages
-        .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
-        .join('');
-    return ANALYSIS_LIST_TEMPLATE
-        .replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
-        .replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
-        .replace('{{ROWS}}', rows);
-}
-function renderSinglePageHtml(page) {
-    const structuredDataStatus = page.structuredData.present
-        ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
-        : 'Not detected';
-    const structuredDataTypesRow = page.structuredData.present ? `
-      <tr>
-          <th>Types Found</th>
-          <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
-      </tr>
-      ` : '';
-    return ANALYSIS_PAGE_TEMPLATE
-        .replaceAll('{{URL}}', escapeHtml(page.url))
-        .replace('{{SEO_SCORE}}', page.seoScore.toString())
-        .replace('{{THIN_SCORE}}', page.thinScore.toString())
-        .replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
-        .replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
-        .replace('{{TITLE_LENGTH}}', page.title.length.toString())
-        .replaceAll('{{TITLE_STATUS}}', page.title.status)
-        .replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
-        .replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
-        .replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
-        .replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
-        .replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
-        .replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
-        .replaceAll('{{H1_STATUS}}', page.h1.status)
-        .replace('{{H1_COUNT}}', page.h1.count.toString())
-        .replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
-        .replace('{{WORD_COUNT}}', page.content.wordCount.toString())
-        .replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
-        .replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
-        .replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
-        .replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
-        .replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
-        .replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
-        .replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
-        .replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
-        .replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
-}
-export function renderAnalysisMarkdown(result) {
-    const summary = [
-        '# Crawlith SEO Analysis Report',
-        '',
-        '## 📊 Summary',
-        `- Pages Analyzed: ${result.site_summary.pages_analyzed}`,
-        `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`,
-        `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`,
-        `- Thin Pages Found: ${result.site_summary.thin_pages}`,
-        `- Duplicate Titles: ${result.site_summary.duplicate_titles}`,
-        '',
-        '## 📄 Page Details',
-        '',
-        '| URL | SEO Score | Thin Score | Title Status | Meta Status |',
-        '| :--- | :--- | :--- | :--- | :--- |',
-    ];
-    result.pages.forEach((page) => {
-        summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} |`);
-    });
-    return summary.join('\n');
-}
-export function renderAnalysisCsv(result) {
-    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links'];
-    const rows = result.pages.map((p) => {
-        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
-        return [
-            p.url,
-            p.seoScore,
-            p.thinScore,
-            statusStr,
-            `"${(p.title.value || '').replace(/"/g, '""')}"`,
-            p.title.length,
-            `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`,
-            p.metaDescription.length,
-            p.content.wordCount,
-            p.links.internalLinks,
-            p.links.externalLinks
-        ].join(',');
-    });
-    return [headers.join(','), ...rows].join('\n');
-}
-function escapeHtml(value) {
-    return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
-}
-export function analyzePages(rootUrl, pages, robots) {
+export function analyzePages(targetUrl, rootOrigin, pages, robots, options = {}) {
     const titleCounts = new Map();
     const metaCounts = new Map();
     const sentenceCountFrequency = new Map();
     const results = [];
+    const targetPath = UrlUtil.toPath(targetUrl, rootOrigin);
+    const targetAbs = UrlUtil.toAbsolute(targetUrl, rootOrigin);
     for (const page of pages) {
+        const pagePath = UrlUtil.toPath(page.url, rootOrigin);
+        const pageAbs = UrlUtil.toAbsolute(page.url, rootOrigin);
+        const isTarget = page.url === targetUrl || pagePath === targetPath || pageAbs === targetAbs;
+        // In single-page mode, if it's not the target, we skip it entirely for speed.
+        if (!options.allPages && !isTarget)
+            continue;
         const html = page.html || '';
-        // 0. Update crawl status based on current robots rules
+        const $ = load(html || '<html></html>');
+        // Reconstruct absolute URL from stored path for robots & link resolution
+        const pageAbsUrl = UrlUtil.toAbsolute(page.url, rootOrigin);
         let crawlStatus = page.crawlStatus;
         if (robots) {
-            const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
-                (!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
-            if (isBlocked) {
+            const isBlocked = !robots.isAllowed(pageAbsUrl, 'crawlith') ||
+                (!pageAbsUrl.endsWith('/') && !robots.isAllowed(pageAbsUrl + '/', 'crawlith'));
+            if (isBlocked)
                 crawlStatus = 'blocked_by_robots';
-            }
         }
-        // 1. Analyze Individual Components
-        const title = analyzeTitle(html);
-        const metaDescription = analyzeMetaDescription(html);
-        const h1 = analyzeH1(html, title.value);
-        const content = analyzeContent(html);
-        const images = analyzeImageAlts(html);
-        const links = analyzeLinks(html, page.url, rootUrl);
-        const structuredData = analyzeStructuredData(html);
-        // 2. Accumulate Frequencies for Duplicates
+        // Shared DOM Analysis
+        const title = analyzeTitle($);
+        const metaDescription = analyzeMetaDescription($);
+        const h1 = analyzeH1($, title.value);
+        const content = analyzeContent($);
+        const images = analyzeImageAlts($);
+        const links = analyzeLinks($, pageAbsUrl, rootOrigin);
+        const structuredData = analyzeStructuredData($);
         if (title.value) {
-            const key = (title.value || '').trim().toLowerCase();
+            const key = title.value.trim().toLowerCase();
             titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
         }
         if (metaDescription.value) {
-            const key = (metaDescription.value || '').trim().toLowerCase();
+            const key = metaDescription.value.trim().toLowerCase();
             metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
         }
         sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
-        // 3. Store Preliminary Result
-        results.push({
+        const soft404Service = new Soft404Service();
+        const soft404 = soft404Service.analyze(html, links.externalLinks + links.internalLinks);
+        const isCanonicalConflict = !!(page.canonical && page.canonical !== page.url && page.canonical !== pageAbsUrl &&
+            page.canonical.replace(/\/$/, '') !== pageAbsUrl.replace(/\/$/, ''));
+        const resultPage = {
             url: page.url,
             status: page.status || 0,
             title,
             metaDescription,
             h1,
             content,
-            thinScore: 0, // Calculated in pass 2
+            thinScore: 0,
             images,
             links,
             structuredData,
-            seoScore: 0, // Calculated in pass 2
+            seoScore: 0,
             meta: {
                 canonical: page.canonical,
                 noindex: page.noindex,
                 nofollow: page.nofollow,
-                crawlStatus
-            }
-        });
+                crawlStatus,
+                canonicalConflict: isCanonicalConflict
+            },
+            soft404
+        };
+        Object.defineProperty(resultPage, 'html', { value: html, enumerable: false });
+        results.push(resultPage);
     }
-    // 4. Finalize Statuses and Scores (Pass 2)
     for (const analysis of results) {
-        // Check Title Duplicates
         if (analysis.title.value) {
-            const key = (analysis.title.value || '').trim().toLowerCase();
-            if ((titleCounts.get(key) || 0) > 1) {
+            const key = analysis.title.value.trim().toLowerCase();
+            if ((titleCounts.get(key) || 0) > 1)
                 analysis.title.status = 'duplicate';
-            }
         }
-        // Check Meta Duplicates
         if (analysis.metaDescription.value) {
-            const key = (analysis.metaDescription.value || '').trim().toLowerCase();
-            if ((metaCounts.get(key) || 0) > 1) {
+            const key = analysis.metaDescription.value.trim().toLowerCase();
+            if ((metaCounts.get(key) || 0) > 1)
                 analysis.metaDescription.status = 'duplicate';
-            }
         }
-        // Check Content Duplication
         const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
         analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
-        // Calculate Final SEO Score
         analysis.seoScore = scorePageSeo(analysis);
     }
     return results;
 }
 function filterPageModules(page, modules) {
-    const keepSeo = modules.seo;
-    const keepContent = modules.content;
-    const keepAccessibility = modules.accessibility;
-    return {
+    const filtered = {
         ...page,
-        title: keepSeo ? page.title : { value: null, length: 0, status: 'missing' },
-        metaDescription: keepSeo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
-        h1: (keepSeo || keepContent) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false },
-        links: keepSeo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
-        structuredData: keepSeo ? page.structuredData : { present: false, valid: false, types: [] },
-        content: keepContent ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
-        thinScore: keepContent ? page.thinScore : 0,
-        images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
+        title: modules.seo ? page.title : { value: null, length: 0, status: 'missing' },
+        metaDescription: modules.seo ? page.metaDescription : { value: null, length: 0, status: 'missing' },
+        h1: (modules.seo || modules.content) ? page.h1 : { count: 0, status: 'critical', matchesTitle: false, value: null },
+        links: modules.seo ? page.links : { internalLinks: 0, externalLinks: 0, nofollowCount: 0, externalRatio: 0 },
+        structuredData: modules.seo ? page.structuredData : { present: false, valid: false, types: [] },
+        content: modules.content ? page.content : { wordCount: 0, textHtmlRatio: 0, uniqueSentenceCount: 0 },
+        thinScore: modules.content ? page.thinScore : 0,
+        images: modules.accessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
     };
+    if (page.html) {
+        Object.defineProperty(filtered, 'html', { value: page.html, enumerable: false });
+    }
+    return filtered;
 }
-async function loadCrawlData(rootUrl) {
+async function loadCrawlData(rootUrl, snapshotId) {
     const db = getDb();
     const siteRepo = new SiteRepository(db);
     const snapshotRepo = new SnapshotRepository(db);
@@ -320,22 +395,26 @@ async function loadCrawlData(rootUrl) {
     const urlObj = new URL(rootUrl);
     const domain = urlObj.hostname.replace('www.', '');
     const site = siteRepo.firstOrCreateSite(domain);
-    let snapshot;
-    const page = pageRepo.getPage(site.id, rootUrl);
-    if (page && page.last_seen_snapshot_id) {
-        snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
-    }
-    if (!snapshot) {
-        snapshot = snapshotRepo.getLatestSnapshot(site.id);
+    let snapshot = null;
+    if (snapshotId) {
+        snapshot = snapshotRepo.getSnapshot(snapshotId);
     }
     if (!snapshot) {
-        throw new Error(`No crawl data found for ${rootUrl} in database.`);
+        for (const candidate of UrlUtil.toLookupCandidates(rootUrl, `${urlObj.protocol}//${urlObj.host}`)) {
+            const page = pageRepo.getPage(site.id, candidate);
+            if (page?.last_seen_snapshot_id) {
+                snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
+                break;
+            }
+        }
     }
+    if (!snapshot)
+        snapshot = snapshotRepo.getLatestSnapshot(site.id);
+    if (!snapshot)
+        throw new Error(`No crawl data found for ${rootUrl}`);
     const graph = loadGraphFromSnapshot(snapshot.id);
     const metrics = calculateMetrics(graph, 5);
-    // Use iterator to save memory
     const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
-    // We need to map the DB pages to CrawlPage format lazily
     const pagesGenerator = function* () {
         for (const p of dbPagesIterator) {
             yield {
@@ -352,29 +431,54 @@ async function loadCrawlData(rootUrl) {
     };
     return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
 }
-async function runLiveCrawl(url, options, context) {
+async function runLiveCrawl(url, origin, options, context, robots) {
     const snapshotId = await crawl(url, {
-        limit: 1, // Always limit to 1 for single page live analysis
+        limit: 1,
         depth: 0,
         rate: options.rate,
         proxyUrl: options.proxyUrl,
         userAgent: options.userAgent,
         maxRedirects: options.maxRedirects,
         debug: options.debug,
-        snapshotType: 'partial'
+        snapshotRunType: 'single',
+        robots,
+        sitemap: options.sitemap,
+        plugins: options.plugins
     }, context);
     const graph = loadGraphFromSnapshot(snapshotId);
     const pages = graph.getNodes().map((node) => ({
         url: node.url,
         status: node.status,
-        html: node.html || '', // Include HTML
+        html: node.html || '',
         depth: node.depth,
         crawlStatus: node.crawlStatus
     }));
-    return {
-        pages,
-        metrics: calculateMetrics(graph, 1),
-        graph,
-        snapshotId
-    };
+    return { pages, metrics: calculateMetrics(graph, 1), graph, snapshotId };
+}
+export function escapeHtml(value) {
+    return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
+}
+export function renderAnalysisHtml(result) {
+    if (result.pages.length === 1)
+        return renderSinglePageHtml(result.pages[0]);
+    const rows = result.pages.map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`).join('');
+    return ANALYSIS_LIST_TEMPLATE.replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString()).replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString()).replace('{{ROWS}}', rows);
+}
+function renderSinglePageHtml(page) {
+    const structuredDataStatus = page.structuredData.present ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>') : 'Not detected';
+    const structuredDataTypesRow = page.structuredData.present ? `<tr><th>Types Found</th><td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td></tr>` : '';
+    return ANALYSIS_PAGE_TEMPLATE.replaceAll('{{URL}}', escapeHtml(page.url)).replace('{{SEO_SCORE}}', page.seoScore.toString()).replace('{{THIN_SCORE}}', page.thinScore.toString()).replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString()).replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)')).replace('{{TITLE_LENGTH}}', page.title.length.toString()).replaceAll('{{TITLE_STATUS}}', page.title.status).replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)')).replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString()).replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status).replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>').replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString()).replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString()).replaceAll('{{H1_STATUS}}', page.h1.status).replace('{{H1_COUNT}}', page.h1.count.toString()).replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '').replace('{{WORD_COUNT}}', page.content.wordCount.toString()).replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString()).replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2)).replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString()).replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString()).replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1)).replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString()).replace('{{MISSING_ALT}}', page.images.missingAlt.toString()).replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus).replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
+}
+export function renderAnalysisMarkdown(result) {
+    const summary = ['# Crawlith SEO Analysis Report', '', '## 📊 Summary', `- Pages Analyzed: ${result.site_summary.pages_analyzed}`, `- Overall Site Score: ${result.site_summary.site_score.toFixed(1)}`, `- Avg SEO Score: ${result.site_summary.avg_seo_score.toFixed(1)}`, `- Thin Pages Found: ${result.site_summary.thin_pages}`, `- Duplicate Titles: ${result.site_summary.duplicate_titles}`, '', '## 📄 Page Details', '', '| URL | SEO Score | Thin Score | Title Status | Meta Status | Canonical |', '| :--- | :--- | :--- | :--- | :--- | :--- |'];
+    result.pages.forEach((page) => summary.push(`| ${page.url} | ${page.seoScore} | ${page.thinScore} | ${page.title.status} | ${page.metaDescription.status} | ${page.meta.canonical || '-'} |`));
+    return summary.join('\n');
+}
+export function renderAnalysisCsv(result) {
+    const headers = ['URL', 'SEO Score', 'Thin Score', 'HTTP Status', 'Title', 'Title Length', 'Meta Description', 'Desc Length', 'Word Count', 'Internal Links', 'External Links', 'Canonical'];
+    const rows = result.pages.map((p) => {
+        const statusStr = p.status === 0 ? 'Pending/Limit' : p.status;
+        return [p.url, p.seoScore, p.thinScore, statusStr, `"${(p.title.value || '').replace(/"/g, '""')}"`, p.title.length, `"${(p.metaDescription.value || '').replace(/"/g, '""')}"`, p.metaDescription.length, p.content.wordCount, p.links.internalLinks, p.links.externalLinks, p.meta.canonical || ''].join(',');
+    });
+    return [headers.join(','), ...rows].join('\n');
 }