npm - @crawlith/core - Versions diffs - 0.1.0 → 0.1.1 - Mend

@crawlith/core 0.1.0 → 0.1.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (131)

package/CHANGELOG.md +6 -0
package/dist/analysis/analysis_list.html +35 -0
package/dist/analysis/analysis_page.html +123 -0
package/dist/analysis/analyze.d.ts +17 -3
package/dist/analysis/analyze.js +192 -248
package/dist/analysis/scoring.js +7 -1
package/dist/analysis/templates.d.ts +2 -0
package/dist/analysis/templates.js +7 -0
package/dist/core/security/ipGuard.d.ts +11 -0
package/dist/core/security/ipGuard.js +71 -3
package/dist/crawler/crawl.d.ts +4 -22
package/dist/crawler/crawl.js +4 -335
package/dist/crawler/crawler.d.ts +75 -0
package/dist/crawler/crawler.js +518 -0
package/dist/crawler/extract.d.ts +4 -1
package/dist/crawler/extract.js +7 -2
package/dist/crawler/fetcher.d.ts +1 -0
package/dist/crawler/fetcher.js +20 -5
package/dist/crawler/metricsRunner.d.ts +3 -1
package/dist/crawler/metricsRunner.js +55 -46
package/dist/crawler/sitemap.d.ts +3 -0
package/dist/crawler/sitemap.js +5 -1
package/dist/db/graphLoader.js +32 -3
package/dist/db/index.d.ts +3 -0
package/dist/db/index.js +4 -0
package/dist/db/repositories/EdgeRepository.d.ts +8 -0
package/dist/db/repositories/EdgeRepository.js +13 -0
package/dist/db/repositories/MetricsRepository.d.ts +3 -0
package/dist/db/repositories/MetricsRepository.js +14 -1
package/dist/db/repositories/PageRepository.d.ts +11 -0
package/dist/db/repositories/PageRepository.js +112 -19
package/dist/db/repositories/SiteRepository.d.ts +3 -0
package/dist/db/repositories/SiteRepository.js +9 -0
package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
package/dist/db/repositories/SnapshotRepository.js +23 -2
package/dist/events.d.ts +48 -0
package/dist/events.js +1 -0
package/dist/graph/cluster.js +62 -14
package/dist/graph/duplicate.js +242 -191
package/dist/graph/graph.d.ts +16 -0
package/dist/graph/graph.js +17 -4
package/dist/graph/metrics.js +12 -0
package/dist/graph/pagerank.js +2 -0
package/dist/graph/simhash.d.ts +6 -0
package/dist/graph/simhash.js +14 -0
package/dist/index.d.ts +5 -2
package/dist/index.js +5 -2
package/dist/lock/hashKey.js +1 -1
package/dist/lock/lockManager.d.ts +4 -1
package/dist/lock/lockManager.js +23 -13
package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
package/dist/report/crawlExport.d.ts +3 -0
package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
package/dist/report/crawl_template.d.ts +1 -0
package/dist/report/crawl_template.js +7 -0
package/dist/report/html.js +15 -216
package/dist/scoring/health.d.ts +50 -0
package/dist/scoring/health.js +170 -0
package/dist/scoring/hits.d.ts +1 -0
package/dist/scoring/hits.js +64 -44
package/dist/scoring/orphanSeverity.d.ts +5 -5
package/package.json +3 -3
package/scripts/copy-assets.js +37 -0
package/src/analysis/analysis_list.html +35 -0
package/src/analysis/analysis_page.html +123 -0
package/src/analysis/analyze.ts +218 -261
package/src/analysis/scoring.ts +8 -1
package/src/analysis/templates.ts +9 -0
package/src/core/security/ipGuard.ts +82 -3
package/src/crawler/crawl.ts +6 -379
package/src/crawler/crawler.ts +601 -0
package/src/crawler/extract.ts +7 -2
package/src/crawler/fetcher.ts +24 -6
package/src/crawler/metricsRunner.ts +60 -47
package/src/crawler/sitemap.ts +4 -1
package/src/db/graphLoader.ts +33 -3
package/src/db/index.ts +5 -0
package/src/db/repositories/EdgeRepository.ts +14 -0
package/src/db/repositories/MetricsRepository.ts +15 -1
package/src/db/repositories/PageRepository.ts +119 -19
package/src/db/repositories/SiteRepository.ts +11 -0
package/src/db/repositories/SnapshotRepository.ts +28 -3
package/src/events.ts +16 -0
package/src/graph/cluster.ts +69 -15
package/src/graph/duplicate.ts +249 -185
package/src/graph/graph.ts +24 -4
package/src/graph/metrics.ts +15 -0
package/src/graph/pagerank.ts +1 -0
package/src/graph/simhash.ts +15 -0
package/src/index.ts +5 -2
package/src/lock/hashKey.ts +1 -1
package/src/lock/lockManager.ts +21 -13
package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
package/src/report/crawl_template.ts +9 -0
package/src/report/html.ts +17 -217
package/src/scoring/health.ts +241 -0
package/src/scoring/hits.ts +67 -45
package/src/scoring/orphanSeverity.ts +8 -8
package/tests/analysis.unit.test.ts +44 -0
package/tests/analyze.integration.test.ts +88 -53
package/tests/analyze_markdown.test.ts +98 -0
package/tests/audit/audit.test.ts +101 -0
package/tests/audit/scoring.test.ts +25 -25
package/tests/audit/transport.test.ts +0 -1
package/tests/clustering_risk.test.ts +118 -0
package/tests/crawler.test.ts +19 -13
package/tests/db/index.test.ts +134 -0
package/tests/db/repositories.test.ts +115 -0
package/tests/db_repos.test.ts +72 -0
package/tests/duplicate.test.ts +2 -2
package/tests/extract.test.ts +86 -0
package/tests/fetcher.test.ts +5 -1
package/tests/fetcher_safety.test.ts +9 -3
package/tests/graph/graph.test.ts +100 -0
package/tests/graphLoader.test.ts +124 -0
package/tests/html_report.test.ts +52 -51
package/tests/ipGuard.test.ts +73 -0
package/tests/lock/lockManager.test.ts +77 -17
package/tests/normalize.test.ts +6 -19
package/tests/orphanSeverity.test.ts +9 -9
package/tests/redirect_safety.test.ts +5 -1
package/tests/renderAnalysisCsv.test.ts +183 -0
package/tests/safety.test.ts +12 -0
package/tests/scope.test.ts +18 -0
package/tests/scoring.test.ts +25 -24
package/tests/sitemap.test.ts +13 -1
package/tests/ssrf_fix.test.ts +69 -0
package/tests/visualization_data.test.ts +10 -10
package/dist/report/sitegraphExport.d.ts +0 -3
package/dist/report/sitegraph_template.d.ts +0 -1

package/dist/analysis/analyze.js CHANGED Viewed

@@ -1,11 +1,9 @@
-import fs from 'node:fs/promises';
 import { crawl } from '../crawler/crawl.js';
 import { loadGraphFromSnapshot } from '../db/graphLoader.js';
 import { normalizeUrl } from '../crawler/normalize.js';
 import { calculateMetrics } from '../graph/metrics.js';
-import { Graph } from '../graph/graph.js';
 import { analyzeContent, calculateThinContentScore } from './content.js';
-import { analyzeH1, analyzeMetaDescription, analyzeTitle, applyDuplicateStatuses } from './seo.js';
+import { analyzeH1, analyzeMetaDescription, analyzeTitle } from './seo.js';
 import { analyzeImageAlts } from './images.js';
 import { analyzeLinks } from './links.js';
 import { analyzeStructuredData } from './structuredData.js';
@@ -15,36 +13,79 @@ import { getDb } from '../db/index.js';
 import { SiteRepository } from '../db/repositories/SiteRepository.js';
 import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
 import { PageRepository } from '../db/repositories/PageRepository.js';
-export async function analyzeSite(url, options) {
+import { ANALYSIS_LIST_TEMPLATE, ANALYSIS_PAGE_TEMPLATE } from './templates.js';
+/**
+ * Analyzes a site for SEO, content, and accessibility.
+ * Supports live crawling or loading from a database snapshot.
+ * Note: File-based data loading is not supported.
+ *
+ * @param url The root URL to analyze
+ * @param options Analysis options
+ * @param context Engine context for event emission
+ */
+export async function analyzeSite(url, options, context) {
     const normalizedRoot = normalizeUrl(url, '', { stripQuery: false });
     if (!normalizedRoot) {
         throw new Error('Invalid URL for analysis');
     }
     let crawlData;
+    let robots = null;
+    // Always try to fetch robots.txt for the analysis session
+    // to ensure we have the latest rules for visibility reporting.
+    try {
+        const robotsUrl = new URL('/robots.txt', normalizedRoot).toString();
+        const robotsRes = await (new (await import('../crawler/fetcher.js')).Fetcher()).fetch(robotsUrl, { maxBytes: 500000 });
+        const status = robotsRes.status;
+        if (typeof status === 'number' && status >= 200 && status < 300) {
+            const robotsParserModule = await import('robots-parser');
+            const robotsParser = robotsParserModule.default || robotsParserModule;
+            robots = robotsParser(robotsUrl, robotsRes.body);
+        }
+    }
+    catch {
+        // Silence robots fetch errors, fallback to existing or none
+    }
     if (options.live) {
-        crawlData = await runLiveCrawl(normalizedRoot, options);
+        crawlData = await runLiveCrawl(normalizedRoot, options, context);
     }
     else {
         try {
-            crawlData = await loadCrawlData(normalizedRoot, options.fromCrawl);
+            crawlData = await loadCrawlData(normalizedRoot);
+            // Convert generator to array so it can be reused multiple times
+            const allPages = Array.from(crawlData.pages);
+            crawlData.pages = allPages;
+            // Check if the requested URL actually exists in this snapshot
+            const exists = allPages.some(p => p.url === normalizedRoot);
+            if (!exists) {
+                options.live = true; // Mark as live so the analysis knows to pick the first page if exact match fails
+                if (context) {
+                    context.emit({ type: 'info', message: `URL ${normalizedRoot} not found in latest snapshot. Fetching live...` });
+                }
+                crawlData = await runLiveCrawl(normalizedRoot, options, context);
+            }
         }
         catch (error) {
             const isNotFound = error.code === 'ENOENT' ||
                 error.message.includes('Crawl data not found') ||
                 error.message.includes('No completed snapshot found') ||
                 error.message.includes('not found in database');
-            if (isNotFound && !options.fromCrawl) {
-                console.log('No local crawl data found. Switching to live analysis mode...');
-                crawlData = await runLiveCrawl(normalizedRoot, options);
+            if (isNotFound) {
+                options.live = true; // Force live mode
+                if (context) {
+                    context.emit({ type: 'info', message: 'No local crawl data found. Switching to live analysis mode...' });
+                }
+                crawlData = await runLiveCrawl(normalizedRoot, options, context);
             }
             else {
                 throw error;
             }
         }
     }
+    const snapshotId = crawlData.snapshotId;
+    const crawledAt = crawlData.crawledAt;
     // Run clustering if requested or as default
     detectContentClusters(crawlData.graph, options.clusterThreshold, options.minClusterSize);
-    const pages = analyzePages(normalizedRoot, crawlData.pages);
+    const pages = analyzePages(normalizedRoot, crawlData.pages, robots);
     const activeModules = {
         seo: !!options.seo,
         content: !!options.content,
@@ -56,13 +97,19 @@ export async function analyzeSite(url, options) {
         : pages;
     // Filter to only the requested URL
     const targetPage = filteredPages.find(p => p.url === normalizedRoot);
-    const resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : filteredPages);
+    let resultPages;
+    if (options.allPages) {
+        resultPages = filteredPages;
+    }
+    else {
+        resultPages = targetPage ? [targetPage] : (options.live ? filteredPages.slice(0, 1) : []);
+    }
     const duplicateTitles = pages.filter((page) => page.title.status === 'duplicate').length;
     const thinPages = pages.filter((page) => page.thinScore >= 70).length;
-    const siteScores = aggregateSiteScore(crawlData.metrics, pages);
+    const siteScores = aggregateSiteScore(crawlData.metrics, resultPages.length === 1 ? resultPages : pages);
     return {
         site_summary: {
-            pages_analyzed: pages.length,
+            pages_analyzed: resultPages.length,
             avg_seo_score: siteScores.seoHealthScore,
             thin_pages: thinPages,
             duplicate_titles: duplicateTitles,
@@ -71,7 +118,9 @@ export async function analyzeSite(url, options) {
         site_scores: siteScores,
         pages: resultPages,
         active_modules: activeModules,
-        clusters: crawlData.graph.contentClusters
+        clusters: crawlData.graph.contentClusters,
+        snapshotId,
+        crawledAt
     };
 }
 export function renderAnalysisHtml(result) {
@@ -79,141 +128,50 @@ export function renderAnalysisHtml(result) {
         return renderSinglePageHtml(result.pages[0]);
     }
     const rows = result.pages
-        .map((page) => `< tr > <td>${escapeHtml(page.url)} </td><td>${page.seoScore}</td > <td>${page.thinScore} </td><td>${page.title.status}</td > <td>${page.metaDescription.status} </td></tr > `)
+        .map((page) => `<tr><td>${escapeHtml(page.url)}</td><td>${page.seoScore}</td><td>${page.thinScore}</td><td>${page.title.status}</td><td>${page.metaDescription.status}</td></tr>`)
         .join('');
-    return `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8" /><title>Crawlith Analysis Report</title></head><body><h1>Analysis</h1><p>Pages: ${result.site_summary.pages_analyzed}</p><p>Average SEO: ${result.site_summary.avg_seo_score}</p><table border="1" cellspacing="0" cellpadding="6"><thead><tr><th>URL</th><th>SEO Score</th><th>Thin Score</th><th>Title</th><th>Meta</th></tr></thead><tbody>${rows}</tbody></table></body></html>`;
+    return ANALYSIS_LIST_TEMPLATE
+        .replace('{{PAGES_ANALYZED}}', result.site_summary.pages_analyzed.toString())
+        .replace('{{AVG_SEO_SCORE}}', result.site_summary.avg_seo_score.toString())
+        .replace('{{ROWS}}', rows);
 }
 function renderSinglePageHtml(page) {
-    return `<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Analysis for ${escapeHtml(page.url)}</title>
-    <style>
-        body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
-        h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
-        h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
-        .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
-        .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
-        .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
-        .status-ok { color: green; font-weight: bold; }
-        .status-warning { color: orange; font-weight: bold; }
-        .status-critical { color: red; font-weight: bold; }
-        .status-missing { color: red; font-weight: bold; }
-        .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
-        .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
-        .data-table th { width: 150px; color: #666; }
-        code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
-    </style>
-  </head>
-  <body>
-    <h1>Page Analysis</h1>
-    <p><strong>URL:</strong> <a href="${page.url}" target="_blank">${page.url}</a></p>
-    <div class="score-card">
-      <div class="score-box">
-        <div class="score-val">${page.seoScore}</div>
-        <div>SEO Score</div>
-      </div>
-      <div class="score-box">
-        <div class="score-val">${page.thinScore}</div>
-        <div>Thin Content Score</div>
-      </div>
-      <div class="score-box">
-        <div class="score-val">${page.status === 0 ? 'Pending/Limit' : page.status}</div>
-        <div>HTTP Status</div>
-      </div>
-    </div>
-    <h2>Meta Tags</h2>
-    <table class="data-table">
-      <tr>
-        <th>Title</th>
-        <td>
-          <div>${escapeHtml(page.title.value || '(missing)')}</div>
-          <small>Length: ${page.title.length} | Status: <span class="status-${page.title.status}">${page.title.status}</span></small>
-        </td>
-      </tr>
-      <tr>
-        <th>Description</th>
-        <td>
-          <div>${escapeHtml(page.metaDescription.value || '(missing)')}</div>
-          <small>Length: ${page.metaDescription.length} | Status: <span class="status-${page.metaDescription.status}">${page.metaDescription.status}</span></small>
-        </td>
-      </tr>
-      <tr>
-        <th>Canonical</th>
-        <td>${page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>'}</td>
-      </tr>
-      <tr>
-        <th>Robots</th>
-        <td>
-          Index: ${!page.meta.noindex},
-          Follow: ${!page.meta.nofollow}
-        </td>
-      </tr>
-    </table>
-    <h2>Content & Heading</h2>
-    <table class="data-table">
-      <tr>
-        <th>H1 Tag</th>
-        <td>
-          Status: <span class="status-${page.h1.status}">${page.h1.status}</span>
-          (${page.h1.count} detected)
-          ${page.h1.matchesTitle ? ' | Matches Title' : ''}
-        </td>
-      </tr>
-      <tr>
-        <th>Word Count</th>
-        <td>${page.content.wordCount} words</td>
-      </tr>
-      <tr>
-        <th>Unique Sentences</th>
-        <td>${page.content.uniqueSentenceCount}</td>
-      </tr>
-      <tr>
-        <th>Text / HTML Ratio</th>
-        <td>${(page.content.textHtmlRatio * 100).toFixed(2)}%</td>
-      </tr>
-    </table>
-    <h2>Links & Images</h2>
-    <table class="data-table">
-      <tr>
-        <th>Internal Links</th>
-        <td>${page.links.internalLinks}</td>
-      </tr>
-      <tr>
-        <th>External Links</th>
-        <td>${page.links.externalLinks} (${(page.links.externalRatio * 100).toFixed(1)}%)</td>
-      </tr>
-      <tr>
-        <th>Images</th>
-        <td>${page.images.totalImages} total (${page.images.missingAlt} missing alt text)</td>
-      </tr>
-    </table>
-    <h2>Structured Data</h2>
-    <table class="data-table">
-      <tr>
-        <th>Status</th>
-        <td>
-          ${page.structuredData.present
+    const structuredDataStatus = page.structuredData.present
         ? (page.structuredData.valid ? '<span class="status-ok">Valid</span>' : '<span class="status-critical">Invalid JSON</span>')
-        : 'Not detected'}
-        </td>
-      </tr>
-      ${page.structuredData.present ? `
+        : 'Not detected';
+    const structuredDataTypesRow = page.structuredData.present ? `
       <tr>
           <th>Types Found</th>
           <td>${page.structuredData.types.map(t => `<code>${t}</code>`).join(', ')}</td>
       </tr>
-      ` : ''}
-    </table>
-  </body>
-</html>`;
+      ` : '';
+    return ANALYSIS_PAGE_TEMPLATE
+        .replaceAll('{{URL}}', escapeHtml(page.url))
+        .replace('{{SEO_SCORE}}', page.seoScore.toString())
+        .replace('{{THIN_SCORE}}', page.thinScore.toString())
+        .replace('{{HTTP_STATUS}}', page.status === 0 ? 'Pending/Limit' : page.status.toString())
+        .replace('{{TITLE_VALUE}}', escapeHtml(page.title.value || '(missing)'))
+        .replace('{{TITLE_LENGTH}}', page.title.length.toString())
+        .replaceAll('{{TITLE_STATUS}}', page.title.status)
+        .replace('{{META_DESCRIPTION_VALUE}}', escapeHtml(page.metaDescription.value || '(missing)'))
+        .replace('{{META_DESCRIPTION_LENGTH}}', page.metaDescription.length.toString())
+        .replaceAll('{{META_DESCRIPTION_STATUS}}', page.metaDescription.status)
+        .replace('{{CANONICAL}}', page.meta.canonical ? escapeHtml(page.meta.canonical) : '<em>(none)</em>')
+        .replace('{{ROBOTS_INDEX}}', (!page.meta.noindex).toString())
+        .replace('{{ROBOTS_FOLLOW}}', (!page.meta.nofollow).toString())
+        .replaceAll('{{H1_STATUS}}', page.h1.status)
+        .replace('{{H1_COUNT}}', page.h1.count.toString())
+        .replace('{{H1_MATCHES_TITLE}}', page.h1.matchesTitle ? ' | Matches Title' : '')
+        .replace('{{WORD_COUNT}}', page.content.wordCount.toString())
+        .replace('{{UNIQUE_SENTENCES}}', page.content.uniqueSentenceCount.toString())
+        .replace('{{TEXT_HTML_RATIO}}', (page.content.textHtmlRatio * 100).toFixed(2))
+        .replace('{{INTERNAL_LINKS}}', page.links.internalLinks.toString())
+        .replace('{{EXTERNAL_LINKS}}', page.links.externalLinks.toString())
+        .replace('{{EXTERNAL_RATIO}}', (page.links.externalRatio * 100).toFixed(1))
+        .replace('{{TOTAL_IMAGES}}', page.images.totalImages.toString())
+        .replace('{{MISSING_ALT}}', page.images.missingAlt.toString())
+        .replace('{{STRUCTURED_DATA_STATUS}}', structuredDataStatus)
+        .replace('{{STRUCTURED_DATA_TYPES_ROW}}', structuredDataTypesRow);
 }
 export function renderAnalysisMarkdown(result) {
     const summary = [
@@ -259,48 +217,84 @@ export function renderAnalysisCsv(result) {
 function escapeHtml(value) {
     return value.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
 }
-function analyzePages(rootUrl, pages) {
-    const titleCandidates = pages.map((page) => analyzeTitle(page.html || ''));
-    const metaCandidates = pages.map((page) => analyzeMetaDescription(page.html || ''));
-    const titles = applyDuplicateStatuses(titleCandidates);
-    const metas = applyDuplicateStatuses(metaCandidates);
+export function analyzePages(rootUrl, pages, robots) {
+    const titleCounts = new Map();
+    const metaCounts = new Map();
     const sentenceCountFrequency = new Map();
-    const baseContent = pages.map((page) => analyzeContent(page.html || ''));
-    for (const item of baseContent) {
-        sentenceCountFrequency.set(item.uniqueSentenceCount, (sentenceCountFrequency.get(item.uniqueSentenceCount) || 0) + 1);
-    }
-    return pages.map((page, index) => {
+    const results = [];
+    for (const page of pages) {
         const html = page.html || '';
-        const title = titles[index];
-        const metaDescription = metas[index];
+        // 0. Update crawl status based on current robots rules
+        let crawlStatus = page.crawlStatus;
+        if (robots) {
+            const isBlocked = !robots.isAllowed(page.url, 'crawlith') ||
+                (!page.url.endsWith('/') && !robots.isAllowed(page.url + '/', 'crawlith'));
+            if (isBlocked) {
+                crawlStatus = 'blocked_by_robots';
+            }
+        }
+        // 1. Analyze Individual Components
+        const title = analyzeTitle(html);
+        const metaDescription = analyzeMetaDescription(html);
         const h1 = analyzeH1(html, title.value);
-        const content = baseContent[index];
-        const duplicationScore = (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
-        const thinScore = calculateThinContentScore(content, duplicationScore);
+        const content = analyzeContent(html);
         const images = analyzeImageAlts(html);
         const links = analyzeLinks(html, page.url, rootUrl);
         const structuredData = analyzeStructuredData(html);
-        const analysis = {
+        // 2. Accumulate Frequencies for Duplicates
+        if (title.value) {
+            const key = (title.value || '').trim().toLowerCase();
+            titleCounts.set(key, (titleCounts.get(key) || 0) + 1);
+        }
+        if (metaDescription.value) {
+            const key = (metaDescription.value || '').trim().toLowerCase();
+            metaCounts.set(key, (metaCounts.get(key) || 0) + 1);
+        }
+        sentenceCountFrequency.set(content.uniqueSentenceCount, (sentenceCountFrequency.get(content.uniqueSentenceCount) || 0) + 1);
+        // 3. Store Preliminary Result
+        results.push({
             url: page.url,
             status: page.status || 0,
             title,
             metaDescription,
             h1,
             content,
-            thinScore,
+            thinScore: 0, // Calculated in pass 2
             images,
             links,
             structuredData,
-            seoScore: 0,
+            seoScore: 0, // Calculated in pass 2
             meta: {
                 canonical: page.canonical,
                 noindex: page.noindex,
-                nofollow: page.nofollow
+                nofollow: page.nofollow,
+                crawlStatus
+            }
+        });
+    }
+    // 4. Finalize Statuses and Scores (Pass 2)
+    for (const analysis of results) {
+        // Check Title Duplicates
+        if (analysis.title.value) {
+            const key = (analysis.title.value || '').trim().toLowerCase();
+            if ((titleCounts.get(key) || 0) > 1) {
+                analysis.title.status = 'duplicate';
+            }
+        }
+        // Check Meta Duplicates
+        if (analysis.metaDescription.value) {
+            const key = (analysis.metaDescription.value || '').trim().toLowerCase();
+            if ((metaCounts.get(key) || 0) > 1) {
+                analysis.metaDescription.status = 'duplicate';
             }
-        };
+        }
+        // Check Content Duplication
+        const duplicationScore = (sentenceCountFrequency.get(analysis.content.uniqueSentenceCount) || 0) > 1 ? 100 : 0;
+        analysis.thinScore = calculateThinContentScore(analysis.content, duplicationScore);
+        // Calculate Final SEO Score
         analysis.seoScore = scorePageSeo(analysis);
-        return analysis;
-    });
+    }
+    return results;
 }
 function filterPageModules(page, modules) {
     const keepSeo = modules.seo;
@@ -318,22 +312,7 @@ function filterPageModules(page, modules) {
         images: keepAccessibility ? page.images : { totalImages: 0, missingAlt: 0, emptyAlt: 0 }
     };
 }
-async function loadCrawlData(rootUrl, fromCrawl) {
-    // If fromCrawl is provided, we could theoretically load JSON, but
-    // we now default to DB fetching for all operations.
-    if (fromCrawl) {
-        try {
-            const content = await fs.readFile(fromCrawl, 'utf-8');
-            const raw = JSON.parse(content);
-            const pages = parsePages(raw);
-            const graph = graphFromPages(rootUrl, pages, raw);
-            const metrics = calculateMetrics(graph, 5);
-            return { pages, metrics, graph };
-        }
-        catch (_e) {
-            // Fallback downwards if file doesn't exist
-        }
-    }
+async function loadCrawlData(rootUrl) {
     const db = getDb();
     const siteRepo = new SiteRepository(db);
     const snapshotRepo = new SnapshotRepository(db);
@@ -341,96 +320,61 @@ async function loadCrawlData(rootUrl, fromCrawl) {
     const urlObj = new URL(rootUrl);
     const domain = urlObj.hostname.replace('www.', '');
     const site = siteRepo.firstOrCreateSite(domain);
-    const snapshot = snapshotRepo.getLatestSnapshot(site.id, 'completed');
+    let snapshot;
+    const page = pageRepo.getPage(site.id, rootUrl);
+    if (page && page.last_seen_snapshot_id) {
+        snapshot = snapshotRepo.getSnapshot(page.last_seen_snapshot_id);
+    }
     if (!snapshot) {
-        throw new Error(`No completed snapshot found for ${rootUrl} in database.`);
+        snapshot = snapshotRepo.getLatestSnapshot(site.id);
+    }
+    if (!snapshot) {
+        throw new Error(`No crawl data found for ${rootUrl} in database.`);
     }
     const graph = loadGraphFromSnapshot(snapshot.id);
     const metrics = calculateMetrics(graph, 5);
-    // We also need the `pages` array for analysis.
-    // It needs `html` which might not be fully available unless we look up from the DB or Graph.
-    // Wait, the Graph stores Node which doesn't contain HTML since we removed it from memory?
-    // Actually, `loadGraphFromSnapshot` does NOT load actual raw HTML from nodes to save memory.
-    // We need HTML for `analyzeSite` module! So we must fetch it from `pageRepo`.
-    const dbPages = pageRepo.getPagesBySnapshot(snapshot.id);
-    const pages = dbPages.map((p) => ({
-        url: p.normalized_url,
-        status: p.http_status || 0,
-        html: p.html || '',
-        depth: p.depth || 0
-    }));
-    return { pages, metrics, graph };
-}
-function parsePages(raw) {
-    if (Array.isArray(raw.pages)) {
-        return raw.pages.map((page) => {
-            const p = page;
-            return {
-                url: String(p.url || ''),
-                status: Number(p.status || 0),
-                html: typeof p.html === 'string' ? p.html : '',
-                depth: Number(p.depth || 0)
+    // Use iterator to save memory
+    const dbPagesIterator = pageRepo.getPagesIteratorBySnapshot(snapshot.id);
+    // We need to map the DB pages to CrawlPage format lazily
+    const pagesGenerator = function* () {
+        for (const p of dbPagesIterator) {
+            yield {
+                url: p.normalized_url,
+                status: p.http_status || 0,
+                html: p.html || '',
+                depth: p.depth || 0,
+                canonical: p.canonical_url || undefined,
+                noindex: !!p.noindex,
+                nofollow: !!p.nofollow,
+                crawlStatus: graph.nodes.get(p.normalized_url)?.crawlStatus
             };
-        }).filter((page) => Boolean(page.url));
-    }
-    if (Array.isArray(raw.nodes)) {
-        return raw.nodes.map((node) => {
-            const n = node;
-            return {
-                url: String(n.url || ''),
-                status: Number(n.status || 0),
-                html: typeof n.html === 'string' ? n.html : '',
-                depth: Number(n.depth || 0)
-            };
-        }).filter((page) => Boolean(page.url));
-    }
-    return [];
-}
-function graphFromPages(rootUrl, pages, raw) {
-    const graph = new Graph();
-    for (const page of pages) {
-        graph.addNode(page.url, page.depth || 0, page.status || 0);
-    }
-    if (Array.isArray(raw.edges)) {
-        for (const edge of raw.edges) {
-            const e = edge;
-            if (typeof e.source === 'string' && typeof e.target === 'string') {
-                graph.addNode(e.source, 0, 0);
-                graph.addNode(e.target, 0, 0);
-                graph.addEdge(e.source, e.target);
-            }
         }
-        return graph;
-    }
-    for (const page of pages) {
-        if (!page.html)
-            continue;
-        const linkAnalysis = analyzeLinks(page.html, page.url, rootUrl);
-        if (linkAnalysis.internalLinks === 0 && linkAnalysis.externalLinks === 0)
-            continue;
-    }
-    return graph;
+    };
+    return { pages: pagesGenerator(), metrics, graph, snapshotId: snapshot.id, crawledAt: snapshot.created_at };
 }
-async function runLiveCrawl(url, options) {
+async function runLiveCrawl(url, options, context) {
     const snapshotId = await crawl(url, {
-        limit: 1,
+        limit: 1, // Always limit to 1 for single page live analysis
         depth: 0,
         rate: options.rate,
         proxyUrl: options.proxyUrl,
         userAgent: options.userAgent,
         maxRedirects: options.maxRedirects,
-        debug: options.debug
-    });
+        debug: options.debug,
+        snapshotType: 'partial'
+    }, context);
     const graph = loadGraphFromSnapshot(snapshotId);
     const pages = graph.getNodes().map((node) => ({
         url: node.url,
         status: node.status,
         html: node.html || '', // Include HTML
-        depth: node.depth
+        depth: node.depth,
+        crawlStatus: node.crawlStatus
     }));
     return {
         pages,
         metrics: calculateMetrics(graph, 1),
-        graph
+        graph,
+        snapshotId
     };
 }

package/dist/analysis/scoring.js CHANGED Viewed

@@ -1,4 +1,7 @@
 export function scorePageSeo(page) {
+    if (page.meta.crawlStatus === 'blocked_by_robots') {
+        return 0;
+    }
     const titleMeta = (scoreTextStatus(page.title.status) + scoreTextStatus(page.metaDescription.status)) / 2;
     const h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
     const wordQuality = Math.min(100, (page.content.wordCount / 600) * 100) * 0.7 + Math.min(100, page.content.textHtmlRatio * 500) * 0.3;
@@ -33,7 +36,10 @@ export function aggregateSiteScore(metrics, pages) {
     const entropyScore = Math.max(0, 100 - Math.abs(metrics.structuralEntropy - 2) * 25);
     const orphanPenalty = metrics.totalPages === 0 ? 0 : (metrics.orphanPages.length / metrics.totalPages) * 100;
     const authorityEntropyOrphanScore = Math.max(0, Math.min(100, (avgAuthority * 100 * 0.4) + (entropyScore * 0.35) + ((100 - orphanPenalty) * 0.25)));
-    const overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
+    let overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
+    if (pages.some(p => p.meta.crawlStatus === 'blocked_by_robots')) {
+        overallScore = 0;
+    }
     return {
         seoHealthScore: Number(seoHealthScore.toFixed(2)),
         authorityEntropyOrphanScore: Number(authorityEntropyOrphanScore.toFixed(2)),

package/dist/analysis/templates.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export declare const ANALYSIS_LIST_TEMPLATE: string;
2	+ export declare const ANALYSIS_PAGE_TEMPLATE: string;

package/dist/analysis/templates.js ADDED Viewed

@@ -0,0 +1,7 @@
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+export const ANALYSIS_LIST_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_list.html'), 'utf-8');
+export const ANALYSIS_PAGE_TEMPLATE = fs.readFileSync(path.join(__dirname, 'analysis_page.html'), 'utf-8');

package/dist/core/security/ipGuard.d.ts CHANGED Viewed

@@ -1,3 +1,5 @@
+import * as dns from 'dns';
+import { Agent } from 'undici';
 export declare class IPGuard {
     /**
      * Checks if an IP address is internal/private
@@ -7,5 +9,14 @@ export declare class IPGuard {
      * Resolves a hostname and validates all result IPs
      */
     static validateHost(host: string): Promise<boolean>;
+    /**
+     * Custom lookup function for undici that validates the resolved IP.
+     * Prevents DNS Rebinding attacks by checking the IP immediately before connection.
+     */
+    static secureLookup(hostname: string, options: dns.LookupOneOptions | dns.LookupAllOptions, callback: (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => void): void;
+    /**
+     * Returns an undici Agent configured with secure DNS lookup.
+     */
+    static getSecureDispatcher(): Agent;
     private static expandIPv6;
 }