@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Supported heading levels within HTML content.
3
+ */
4
+ export type HeadingLevel = 1 | 2 | 3 | 4 | 5 | 6;
5
+ /**
6
+ * Represents a normalized heading node extracted from the DOM.
7
+ */
8
+ export interface HeadingNode {
9
+ level: HeadingLevel;
10
+ text: string;
11
+ index: number;
12
+ parentIndex?: number;
13
+ }
14
+ /**
15
+ * Represents content statistics for a section under a heading.
16
+ */
17
+ export interface SectionMetrics {
18
+ headingIndex: number;
19
+ headingText: string;
20
+ words: number;
21
+ keywordConcentration: number;
22
+ thin: boolean;
23
+ duplicateRisk: number;
24
+ }
25
+ /**
26
+ * Raw heading analysis generated for a single URL.
27
+ */
28
+ export interface LocalPageAnalysis {
29
+ url: string;
30
+ headingNodes: HeadingNode[];
31
+ sections: SectionMetrics[];
32
+ h1Norm: string;
33
+ h2SetHash: string;
34
+ patternHash: string;
35
+ issues: string[];
36
+ metrics: {
37
+ entropy: number;
38
+ maxDepth: number;
39
+ avgDepth: number;
40
+ headingDensity: number;
41
+ fragmentation: number;
42
+ levelVolatility: number;
43
+ hierarchySkips: number;
44
+ reverseJumps: number;
45
+ missingH1: number;
46
+ multipleH1: number;
47
+ };
48
+ }
49
+ /**
50
+ * Final heading-health payload attached to a page node.
51
+ */
52
+ export interface HeadingHealthPayload {
53
+ score: number;
54
+ status: 'Healthy' | 'Moderate' | 'Poor';
55
+ issues: string[];
56
+ map: HeadingNode[];
57
+ missing_h1: number;
58
+ multiple_h1: number;
59
+ entropy: number;
60
+ max_depth: number;
61
+ avg_depth: number;
62
+ heading_density: number;
63
+ fragmentation: number;
64
+ volatility: number;
65
+ hierarchy_skips: number;
66
+ reverse_jumps: number;
67
+ thin_sections: number;
68
+ duplicate_h1_group: number;
69
+ similar_h1_group: number;
70
+ identical_h2_set_group: number;
71
+ duplicate_pattern_group: number;
72
+ template_risk: number;
73
+ }
74
+ /**
75
+ * Snapshot-level summary emitted by the plugin.
76
+ */
77
+ export interface HeadingHealthSummary {
78
+ avgScore: number;
79
+ evaluatedPages: number;
80
+ totalMissing: number;
81
+ totalMultiple: number;
82
+ totalSkips: number;
83
+ totalReverseJumps: number;
84
+ totalThinSections: number;
85
+ avgEntropy: number;
86
+ poorPages: number;
87
+ }
88
+ /**
89
+ * Calculates token-level Jaccard similarity between two text values.
90
+ */
91
+ export declare function jaccardSimilarity(a: string, b: string): number;
92
+ /**
93
+ * Performs per-page heading extraction and structural scoring signal generation.
94
+ */
95
+ export declare function analyzeHeadingHealth(html: string, fallbackTitle?: string): LocalPageAnalysis;
96
+ /**
97
+ * Enriches section-level duplicate risk by comparing normalized section signatures across pages.
98
+ */
99
+ export declare function enrichDuplicateRisk(pages: LocalPageAnalysis[]): void;
100
+ /**
101
+ * Coordinates heading analysis across graph nodes and builds report-safe payloads.
102
+ */
103
+ export declare class HeadingHealthService {
104
+ /**
105
+ * Builds page-level payloads plus a snapshot summary for every eligible node.
106
+ */
107
+ evaluateNodes(nodes: Array<Record<string, any>>): {
108
+ payloadsByUrl: Map<string, HeadingHealthPayload>;
109
+ summary: HeadingHealthSummary;
110
+ };
111
+ private collectAnalyses;
112
+ private buildBuckets;
113
+ private computeSimilarH1GroupSizes;
114
+ private scoreHealth;
115
+ private computeTemplateRisk;
116
+ }
@@ -0,0 +1,356 @@
1
+ import { createHash } from 'node:crypto';
2
+ const STOPWORDS = new Set(['the', 'and', 'for', 'with', 'from', 'that', 'this', 'your', 'about', 'into', 'over', 'under', 'are', 'was', 'were', 'can', 'has', 'have', 'had', 'you', 'our', 'out', 'all']);
3
+ const THIN_SECTION_WORDS = 80;
4
+ const HEADING_PATTERN = /<h([1-6])\b[^<>]*>([\s\S]*?)<\/h\1>/gi;
5
+ const TITLE_PATTERN = /<title\b[^<>]*>([\s\S]*?)<\/title>/i;
6
+ const normalizeText = (input) => input.replace(/<[^<>]*>/g, ' ').replace(/&nbsp;/g, ' ').replace(/&amp;/g, '&').replace(/\s+/g, ' ').trim();
7
+ const normalizeComparable = (input) => normalizeText(input).toLowerCase();
8
+ const tokenize = (input) => normalizeComparable(input).replace(/[^a-z0-9\s]/g, ' ').split(/\s+/).filter((token) => token.length > 2 && !STOPWORDS.has(token));
9
+ const stableHash = (input) => createHash('sha1').update(input).digest('hex').slice(0, 16);
10
+ /**
11
+ * Calculates token-level Jaccard similarity between two text values.
12
+ */
13
+ export function jaccardSimilarity(a, b) {
14
+ const aSet = new Set(tokenize(a));
15
+ const bSet = new Set(tokenize(b));
16
+ if (!aSet.size || !bSet.size) {
17
+ return 0;
18
+ }
19
+ let intersection = 0;
20
+ for (const token of aSet) {
21
+ if (bSet.has(token)) {
22
+ intersection += 1;
23
+ }
24
+ }
25
+ return intersection / (aSet.size + bSet.size - intersection);
26
+ }
27
+ /**
28
+ * Performs per-page heading extraction and structural scoring signal generation.
29
+ */
30
+ export function analyzeHeadingHealth(html, fallbackTitle) {
31
+ const segments = [];
32
+ for (const match of html.matchAll(HEADING_PATTERN)) {
33
+ const level = Number(match[1]);
34
+ segments.push({
35
+ level,
36
+ text: normalizeText(match[2] || ''),
37
+ start: match.index || 0,
38
+ end: (match.index || 0) + match[0].length
39
+ });
40
+ }
41
+ const headingNodes = [];
42
+ const stack = [];
43
+ segments.forEach((segment, index) => {
44
+ const node = { level: segment.level, text: segment.text, index };
45
+ while (stack.length > 0 && stack[stack.length - 1].level >= node.level) {
46
+ stack.pop();
47
+ }
48
+ if (stack.length > 0) {
49
+ node.parentIndex = stack[stack.length - 1].index;
50
+ }
51
+ stack.push(node);
52
+ headingNodes.push(node);
53
+ });
54
+ const sections = [];
55
+ const pageWords = tokenize(html);
56
+ const frequency = new Map();
57
+ for (const word of pageWords) {
58
+ frequency.set(word, (frequency.get(word) || 0) + 1);
59
+ }
60
+ for (let i = 0; i < segments.length; i += 1) {
61
+ const current = segments[i];
62
+ const next = segments[i + 1];
63
+ const textChunk = html.slice(current.end, next ? next.start : html.length);
64
+ const words = tokenize(textChunk);
65
+ const headingTokens = tokenize(current.text);
66
+ const concentration = headingTokens.reduce((sum, token) => sum + (frequency.get(token) || 0), 0);
67
+ sections.push({
68
+ headingIndex: headingNodes[i]?.index ?? i,
69
+ headingText: current.text,
70
+ words: words.length,
71
+ keywordConcentration: words.length > 0 ? Number((concentration / words.length).toFixed(3)) : 0,
72
+ thin: words.length > 0 && words.length < THIN_SECTION_WORDS,
73
+ duplicateRisk: 0
74
+ });
75
+ }
76
+ const levelCounts = [0, 0, 0, 0, 0, 0];
77
+ headingNodes.forEach((node) => {
78
+ levelCounts[node.level - 1] += 1;
79
+ });
80
+ const entropyScore = Number(calculateEntropy(levelCounts).toFixed(3));
81
+ const missingH1 = levelCounts[0] === 0 ? 1 : 0;
82
+ const multipleH1 = levelCounts[0] > 1 ? 1 : 0;
83
+ let hierarchySkips = 0;
84
+ let reverseJumps = 0;
85
+ let volatilitySum = 0;
86
+ for (let i = 1; i < headingNodes.length; i += 1) {
87
+ const delta = headingNodes[i].level - headingNodes[i - 1].level;
88
+ volatilitySum += Math.abs(delta);
89
+ if (delta > 1) {
90
+ hierarchySkips += 1;
91
+ }
92
+ if (delta < -1) {
93
+ reverseJumps += 1;
94
+ }
95
+ }
96
+ const maxDepth = headingNodes.length ? Math.max(...headingNodes.map((node) => node.level)) : 0;
97
+ const avgDepth = headingNodes.length ? Number((headingNodes.reduce((sum, node) => sum + node.level, 0) / headingNodes.length).toFixed(2)) : 0;
98
+ const headingDensity = pageWords.length ? Number((headingNodes.length / pageWords.length).toFixed(4)) : 0;
99
+ const fragmentation = headingNodes.length ? Number((headingNodes.filter((node) => node.level <= 2).length / headingNodes.length).toFixed(3)) : 0;
100
+ const levelVolatility = headingNodes.length > 1 ? Number((volatilitySum / (headingNodes.length - 1)).toFixed(3)) : 0;
101
+ const h1Nodes = headingNodes.filter((node) => node.level === 1);
102
+ const issues = [];
103
+ if (missingH1)
104
+ issues.push('Missing H1');
105
+ if (multipleH1)
106
+ issues.push('Multiple H1 found');
107
+ if (h1Nodes.some((node) => node.text.length < 6))
108
+ issues.push('Empty or near-empty H1');
109
+ const title = fallbackTitle || getTitleFromHtml(html);
110
+ if (title && h1Nodes[0] && jaccardSimilarity(title, h1Nodes[0].text) < 0.3) {
111
+ issues.push('H1 diverges from <title>');
112
+ }
113
+ if (hierarchySkips > 0)
114
+ issues.push(`${hierarchySkips} hierarchy skips detected`);
115
+ if (reverseJumps > 0)
116
+ issues.push(`${reverseJumps} reverse hierarchy jumps detected`);
117
+ for (const thin of sections.filter((section) => section.thin).slice(0, 2)) {
118
+ issues.push(`Thin section under "${thin.headingText || 'Untitled heading'}"`);
119
+ }
120
+ if (entropyScore > 2.1)
121
+ issues.push('High structural entropy');
122
+ if (fragmentation > 0.65)
123
+ issues.push('Section fragmentation is high');
124
+ const h1Norm = normalizeComparable(h1Nodes[0]?.text || '');
125
+ const h2SetHash = stableHash(headingNodes
126
+ .filter((node) => node.level === 2)
127
+ .map((node) => normalizeComparable(node.text))
128
+ .filter(Boolean)
129
+ .sort()
130
+ .join('|'));
131
+ const patternHash = stableHash(headingNodes.map((node) => node.level).join('>'));
132
+ return {
133
+ url: '',
134
+ headingNodes,
135
+ sections,
136
+ h1Norm,
137
+ h2SetHash,
138
+ patternHash,
139
+ issues,
140
+ metrics: {
141
+ entropy: entropyScore,
142
+ maxDepth,
143
+ avgDepth,
144
+ headingDensity,
145
+ fragmentation,
146
+ levelVolatility,
147
+ hierarchySkips,
148
+ reverseJumps,
149
+ missingH1,
150
+ multipleH1
151
+ }
152
+ };
153
+ }
154
+ /**
155
+ * Enriches section-level duplicate risk by comparing normalized section signatures across pages.
156
+ */
157
+ export function enrichDuplicateRisk(pages) {
158
+ const buckets = new Map();
159
+ for (const page of pages) {
160
+ for (const section of page.sections) {
161
+ const key = stableHash(`${normalizeComparable(section.headingText)}:${section.words}`);
162
+ const bucket = buckets.get(key) || [];
163
+ bucket.push(page.url);
164
+ buckets.set(key, bucket);
165
+ }
166
+ }
167
+ for (const page of pages) {
168
+ for (const section of page.sections) {
169
+ const key = stableHash(`${normalizeComparable(section.headingText)}:${section.words}`);
170
+ const size = (buckets.get(key) || []).length;
171
+ section.duplicateRisk = Number(Math.min(1, (size - 1) / 5).toFixed(3));
172
+ }
173
+ }
174
+ }
175
+ function calculateEntropy(values) {
176
+ const total = values.reduce((a, b) => a + b, 0);
177
+ if (!total) {
178
+ return 0;
179
+ }
180
+ return values.reduce((sum, value) => {
181
+ if (value === 0) {
182
+ return sum;
183
+ }
184
+ const probability = value / total;
185
+ return sum - probability * Math.log2(probability);
186
+ }, 0);
187
+ }
188
+ function getTitleFromHtml(html) {
189
+ const match = html.match(TITLE_PATTERN);
190
+ return match ? normalizeText(match[1]) : '';
191
+ }
192
+ /**
193
+ * Coordinates heading analysis across graph nodes and builds report-safe payloads.
194
+ */
195
+ export class HeadingHealthService {
196
+ /**
197
+ * Builds page-level payloads plus a snapshot summary for every eligible node.
198
+ */
199
+ evaluateNodes(nodes) {
200
+ const analyzedPages = this.collectAnalyses(nodes);
201
+ enrichDuplicateRisk(analyzedPages);
202
+ const exactH1Buckets = this.buildBuckets(analyzedPages, (page) => page.h1Norm);
203
+ const h2SetBuckets = this.buildBuckets(analyzedPages, (page) => page.h2SetHash);
204
+ const patternBuckets = this.buildBuckets(analyzedPages, (page) => page.patternHash);
205
+ const similarH1GroupSizes = this.computeSimilarH1GroupSizes(analyzedPages);
206
+ const payloadsByUrl = new Map();
207
+ let totalScore = 0;
208
+ let totalMissing = 0;
209
+ let totalMultiple = 0;
210
+ let totalSkips = 0;
211
+ let totalReverseJumps = 0;
212
+ let totalThinSections = 0;
213
+ let totalEntropy = 0;
214
+ let poorPages = 0;
215
+ for (const page of analyzedPages) {
216
+ const duplicateH1GroupSize = page.h1Norm ? exactH1Buckets.get(page.h1Norm)?.length || 1 : 0;
217
+ const similarH1GroupSize = similarH1GroupSizes.get(page.url) || 0;
218
+ const identicalH2SetGroupSize = h2SetBuckets.get(page.h2SetHash)?.length || 1;
219
+ const duplicatePatternGroupSize = patternBuckets.get(page.patternHash)?.length || 1;
220
+ const templateRisk = this.computeTemplateRisk(similarH1GroupSize, identicalH2SetGroupSize, duplicatePatternGroupSize);
221
+ const thinSectionCount = page.sections.filter((section) => section.thin).length;
222
+ const health = this.scoreHealth({
223
+ metrics: page.metrics,
224
+ thinSectionCount,
225
+ duplicateH1GroupSize,
226
+ similarH1GroupSize,
227
+ identicalH2SetGroupSize,
228
+ duplicatePatternGroupSize,
229
+ templateRisk,
230
+ issues: page.issues
231
+ });
232
+ if (health.status === 'Poor') {
233
+ poorPages += 1;
234
+ }
235
+ totalScore += health.score;
236
+ totalMissing += page.metrics.missingH1;
237
+ totalMultiple += page.metrics.multipleH1;
238
+ totalSkips += page.metrics.hierarchySkips;
239
+ totalReverseJumps += page.metrics.reverseJumps;
240
+ totalThinSections += thinSectionCount;
241
+ totalEntropy += page.metrics.entropy;
242
+ payloadsByUrl.set(page.url, {
243
+ score: health.score,
244
+ status: health.status,
245
+ issues: health.issues,
246
+ map: page.headingNodes,
247
+ missing_h1: page.metrics.missingH1,
248
+ multiple_h1: page.metrics.multipleH1,
249
+ entropy: page.metrics.entropy,
250
+ max_depth: page.metrics.maxDepth,
251
+ avg_depth: page.metrics.avgDepth,
252
+ heading_density: page.metrics.headingDensity,
253
+ fragmentation: page.metrics.fragmentation,
254
+ volatility: page.metrics.levelVolatility,
255
+ hierarchy_skips: page.metrics.hierarchySkips,
256
+ reverse_jumps: page.metrics.reverseJumps,
257
+ thin_sections: thinSectionCount,
258
+ duplicate_h1_group: duplicateH1GroupSize,
259
+ similar_h1_group: similarH1GroupSize,
260
+ identical_h2_set_group: identicalH2SetGroupSize,
261
+ duplicate_pattern_group: duplicatePatternGroupSize,
262
+ template_risk: templateRisk
263
+ });
264
+ }
265
+ const evaluatedPages = analyzedPages.length;
266
+ const summary = {
267
+ avgScore: evaluatedPages ? Math.round(totalScore / evaluatedPages) : 0,
268
+ evaluatedPages,
269
+ totalMissing,
270
+ totalMultiple,
271
+ totalSkips,
272
+ totalReverseJumps,
273
+ totalThinSections,
274
+ avgEntropy: evaluatedPages ? Number((totalEntropy / evaluatedPages).toFixed(3)) : 0,
275
+ poorPages
276
+ };
277
+ return { payloadsByUrl, summary };
278
+ }
279
+ collectAnalyses(nodes) {
280
+ const analyses = [];
281
+ for (const node of nodes) {
282
+ if (node.status < 200 || node.status >= 300 || !node.html || !node.url) {
283
+ continue;
284
+ }
285
+ const analysis = analyzeHeadingHealth(node.html, node.title || node.rawTitle);
286
+ analysis.url = node.url;
287
+ analyses.push(analysis);
288
+ }
289
+ return analyses;
290
+ }
291
+ buildBuckets(pages, selector) {
292
+ const buckets = new Map();
293
+ for (const page of pages) {
294
+ const key = selector(page);
295
+ if (!key) {
296
+ continue;
297
+ }
298
+ buckets.set(key, [...(buckets.get(key) || []), page.url]);
299
+ }
300
+ return buckets;
301
+ }
302
+ computeSimilarH1GroupSizes(pages) {
303
+ const uniqueH1 = Array.from(new Set(pages.map((page) => page.h1Norm).filter(Boolean)));
304
+ const similarBuckets = new Map();
305
+ for (const h1 of uniqueH1) {
306
+ similarBuckets.set(h1, new Set([h1]));
307
+ }
308
+ for (let i = 0; i < uniqueH1.length; i += 1) {
309
+ for (let j = i + 1; j < uniqueH1.length; j += 1) {
310
+ const a = uniqueH1[i];
311
+ const b = uniqueH1[j];
312
+ if (jaccardSimilarity(a, b) >= 0.7) {
313
+ similarBuckets.get(a)?.add(b);
314
+ similarBuckets.get(b)?.add(a);
315
+ }
316
+ }
317
+ }
318
+ const groupSizes = new Map();
319
+ for (const page of pages) {
320
+ groupSizes.set(page.url, similarBuckets.get(page.h1Norm)?.size || (page.h1Norm ? 1 : 0));
321
+ }
322
+ return groupSizes;
323
+ }
324
+ scoreHealth(input) {
325
+ let score = 100;
326
+ const metrics = input.metrics;
327
+ if (metrics.missingH1)
328
+ score -= 20;
329
+ if (metrics.multipleH1)
330
+ score -= 6;
331
+ score -= metrics.hierarchySkips * 8;
332
+ score -= metrics.reverseJumps * 6;
333
+ score -= Math.round(metrics.entropy * 7);
334
+ score -= Math.round(metrics.fragmentation * 20);
335
+ score -= Math.round(metrics.levelVolatility * 6);
336
+ score -= input.thinSectionCount * 4;
337
+ if (input.duplicateH1GroupSize > 1)
338
+ score -= Math.min(16, (input.duplicateH1GroupSize - 1) * 3);
339
+ if (input.similarH1GroupSize > 1)
340
+ score -= Math.min(8, (input.similarH1GroupSize - 1) * 2);
341
+ if (input.identicalH2SetGroupSize > 1)
342
+ score -= Math.min(10, (input.identicalH2SetGroupSize - 1) * 2);
343
+ if (input.duplicatePatternGroupSize > 1)
344
+ score -= Math.min(12, (input.duplicatePatternGroupSize - 1) * 2);
345
+ score -= Math.round(input.templateRisk * 12);
346
+ score = Math.max(0, Math.min(100, score));
347
+ return {
348
+ score,
349
+ status: score >= 80 ? 'Healthy' : score >= 55 ? 'Moderate' : 'Poor',
350
+ issues: input.issues
351
+ };
352
+ }
353
+ computeTemplateRisk(similar, h2set, pattern) {
354
+ return Number(Math.max(0, Math.min(1, ((similar - 1) * 0.15) + ((h2set - 1) * 0.2) + ((pattern - 1) * 0.2))).toFixed(3));
355
+ }
356
+ }
@@ -3,4 +3,4 @@ export interface ImageAltAnalysis {
3
3
  missingAlt: number;
4
4
  emptyAlt: number;
5
5
  }
6
- export declare function analyzeImageAlts(html: string): ImageAltAnalysis;
6
+ export declare function analyzeImageAlts($: any): ImageAltAnalysis;
@@ -1,10 +1,11 @@
1
1
  import { load } from 'cheerio';
2
- export function analyzeImageAlts(html) {
3
- const $ = load(html);
2
+ export function analyzeImageAlts($) {
3
+ const isString = typeof $ === 'string';
4
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
4
5
  let missingAlt = 0;
5
6
  let emptyAlt = 0;
6
- $('img').each((_idx, el) => {
7
- const alt = $(el).attr('alt');
7
+ cheerioObj('img').each((_idx, el) => {
8
+ const alt = cheerioObj(el).attr('alt');
8
9
  if (alt === undefined) {
9
10
  missingAlt += 1;
10
11
  return;
@@ -13,6 +14,6 @@ export function analyzeImageAlts(html) {
13
14
  emptyAlt += 1;
14
15
  }
15
16
  });
16
- const totalImages = $('img').length;
17
+ const totalImages = cheerioObj('img').length;
17
18
  return { totalImages, missingAlt, emptyAlt };
18
19
  }
@@ -4,4 +4,4 @@ export interface LinkRatioAnalysis {
4
4
  nofollowCount: number;
5
5
  externalRatio: number;
6
6
  }
7
- export declare function analyzeLinks(html: string, pageUrl: string, rootUrl: string): LinkRatioAnalysis;
7
+ export declare function analyzeLinks($: any, pageUrl: string, rootUrl: string): LinkRatioAnalysis;
@@ -1,23 +1,23 @@
1
1
  import { load } from 'cheerio';
2
- import { normalizeUrl } from '../crawler/normalize.js';
3
- export function analyzeLinks(html, pageUrl, rootUrl) {
4
- const $ = load(html);
5
- const rootOrigin = new URL(rootUrl).origin;
2
+ import { normalizeUrl, UrlUtil } from '../crawler/normalize.js';
3
+ export function analyzeLinks($, pageUrl, rootUrl) {
4
+ const isString = typeof $ === 'string';
5
+ const cheerioObj = isString ? load($ || '<html></html>') : $;
6
6
  let internalLinks = 0;
7
7
  let externalLinks = 0;
8
8
  let nofollowCount = 0;
9
- $('a[href]').each((_idx, el) => {
10
- const href = $(el).attr('href');
9
+ cheerioObj('a[href]').each((_idx, el) => {
10
+ const href = cheerioObj(el).attr('href');
11
11
  if (!href)
12
12
  return;
13
13
  const normalized = normalizeUrl(href, pageUrl, { stripQuery: false });
14
14
  if (!normalized)
15
15
  return;
16
- const rel = ($(el).attr('rel') || '').toLowerCase();
16
+ const rel = (cheerioObj(el).attr('rel') || '').toLowerCase();
17
17
  if (rel.includes('nofollow')) {
18
18
  nofollowCount += 1;
19
19
  }
20
- if (new URL(normalized).origin === rootOrigin) {
20
+ if (UrlUtil.isInternal(normalized, rootUrl)) {
21
21
  internalLinks += 1;
22
22
  }
23
23
  else {
@@ -1,26 +1,15 @@
1
- export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
2
- export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
3
- export interface SitegraphNode {
4
- url: string;
5
- depth: number;
6
- inLinks: number;
7
- outLinks: number;
8
- status: number;
9
- discoveredViaSitemap?: boolean;
10
- robotsExcluded?: boolean;
11
- canonicalUrl?: string;
12
- isHomepage?: boolean;
13
- wordCount?: number;
14
- hasStructuredData?: boolean;
1
+ import type { GraphNode, GraphEdge } from '../graph/graph.js';
2
+ export interface ExtendedGraphNode extends GraphNode {
15
3
  pageType?: string;
16
- noindex?: boolean;
17
- duplicateContent?: boolean;
4
+ hasStructuredData?: boolean;
18
5
  isProductOrCommercial?: boolean;
6
+ duplicateContent?: boolean;
7
+ isHomepage?: boolean;
8
+ robotsExcluded?: boolean;
9
+ discoveredViaSitemap?: boolean;
19
10
  }
20
- export interface SitegraphEdge {
21
- source: string;
22
- target: string;
23
- }
11
+ export type OrphanType = 'hard' | 'near' | 'soft' | 'crawl-only';
12
+ export type ImpactLevel = 'low' | 'medium' | 'high' | 'critical';
24
13
  export interface OrphanScoringOptions {
25
14
  enabled: boolean;
26
15
  severityEnabled: boolean;
@@ -28,12 +17,12 @@ export interface OrphanScoringOptions {
28
17
  minInbound: number;
29
18
  rootUrl?: string;
30
19
  }
31
- export type AnnotatedNode = SitegraphNode & {
20
+ export type AnnotatedNode = GraphNode & {
32
21
  orphan: boolean;
33
22
  orphanType?: OrphanType;
34
23
  orphanSeverity?: number;
35
24
  impactLevel?: ImpactLevel;
36
25
  };
37
26
  export declare function mapImpactLevel(score: number): ImpactLevel;
38
- export declare function calculateOrphanSeverity(orphanType: OrphanType, node: SitegraphNode): number;
39
- export declare function annotateOrphans(nodes: SitegraphNode[], edges: SitegraphEdge[], options: OrphanScoringOptions): AnnotatedNode[];
27
+ export declare function calculateOrphanSeverity(orphanType: OrphanType, node: ExtendedGraphNode): number;
28
+ export declare function annotateOrphans(nodes: ExtendedGraphNode[], edges: GraphEdge[], options: OrphanScoringOptions): AnnotatedNode[];
@@ -33,7 +33,8 @@ export function calculateOrphanSeverity(orphanType, node) {
33
33
  score = 90;
34
34
  break;
35
35
  case 'crawl-only':
36
- score = 80;
36
+ // Sitemap-only URLs are less severe if we haven't even tried to crawl them yet.
37
+ score = node.status === 0 ? 50 : 70;
37
38
  break;
38
39
  case 'near':
39
40
  score = node.inLinks <= 1 ? 70 : 60;
@@ -64,12 +65,17 @@ export function calculateOrphanSeverity(orphanType, node) {
64
65
  negativeModifier = Math.min(20, negativeModifier);
65
66
  score += positiveModifier;
66
67
  score -= negativeModifier;
68
+ // Safety: unvisited nodes should not be flagged as high-severity orphans
69
+ // because we haven't confirmed they are real pages or fully explored their context.
70
+ if (node.status === 0) {
71
+ score = Math.min(score, 60);
72
+ }
67
73
  return clampScore(score);
68
74
  }
69
75
  function consolidateInboundByCanonical(nodes) {
70
76
  const canonicalInbound = new Map();
71
77
  for (const node of nodes) {
72
- const canonical = node.canonicalUrl || node.url;
78
+ const canonical = node.canonical || node.url;
73
79
  canonicalInbound.set(canonical, (canonicalInbound.get(canonical) || 0) + node.inLinks);
74
80
  }
75
81
  return canonicalInbound;
@@ -85,7 +91,7 @@ export function annotateOrphans(nodes, edges, options) {
85
91
  if (isHomepage || node.robotsExcluded) {
86
92
  return { ...node, orphan: false };
87
93
  }
88
- const canonical = node.canonicalUrl || node.url;
94
+ const canonical = node.canonical || node.url;
89
95
  const inbound = canonicalInbound.get(canonical) || 0;
90
96
  let orphanType;
91
97
  if (inbound === 0) {
@@ -1,6 +1,12 @@
1
1
  export function scorePageSeo(page) {
2
+ if (page.meta.crawlStatus === 'blocked_by_robots') {
3
+ return 0;
4
+ }
2
5
  const titleMeta = (scoreTextStatus(page.title.status) + scoreTextStatus(page.metaDescription.status)) / 2;
3
- const h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
6
+ let h1 = page.h1.status === 'ok' ? 100 : page.h1.status === 'warning' ? 60 : 10;
7
+ if (page.headingScore !== undefined && page.headingScore !== null) {
8
+ h1 = page.headingScore;
9
+ }
4
10
  const wordQuality = Math.min(100, (page.content.wordCount / 600) * 100) * 0.7 + Math.min(100, page.content.textHtmlRatio * 500) * 0.3;
5
11
  const thin = 100 - page.thinScore;
6
12
  const imageDen = Math.max(1, page.images.totalImages);
@@ -33,7 +39,10 @@ export function aggregateSiteScore(metrics, pages) {
33
39
  const entropyScore = Math.max(0, 100 - Math.abs(metrics.structuralEntropy - 2) * 25);
34
40
  const orphanPenalty = metrics.totalPages === 0 ? 0 : (metrics.orphanPages.length / metrics.totalPages) * 100;
35
41
  const authorityEntropyOrphanScore = Math.max(0, Math.min(100, (avgAuthority * 100 * 0.4) + (entropyScore * 0.35) + ((100 - orphanPenalty) * 0.25)));
36
- const overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
42
+ let overallScore = Number((seoHealthScore * 0.7 + authorityEntropyOrphanScore * 0.3).toFixed(2));
43
+ if (pages.some(p => p.meta.crawlStatus === 'blocked_by_robots')) {
44
+ overallScore = 0;
45
+ }
37
46
  return {
38
47
  seoHealthScore: Number(seoHealthScore.toFixed(2)),
39
48
  authorityEntropyOrphanScore: Number(authorityEntropyOrphanScore.toFixed(2)),
@@ -8,8 +8,12 @@ export interface H1Analysis {
8
8
  count: number;
9
9
  status: 'ok' | 'critical' | 'warning';
10
10
  matchesTitle: boolean;
11
+ value: string | null;
11
12
  }
12
- export declare function analyzeTitle(html: string): TextFieldAnalysis;
13
- export declare function analyzeMetaDescription(html: string): TextFieldAnalysis;
14
- export declare function applyDuplicateStatuses<T extends TextFieldAnalysis>(fields: T[]): T[];
15
- export declare function analyzeH1(html: string, titleValue: string | null): H1Analysis;
13
+ export declare function analyzeTitle($: any): TextFieldAnalysis;
14
+ export declare function analyzeMetaDescription($: any): TextFieldAnalysis;
15
+ export declare function analyzeH1($: any, titleValue: string | null): H1Analysis;
16
+ export declare function applyDuplicateStatuses<T extends {
17
+ value: string | null;
18
+ status: string;
19
+ }>(items: T[]): T[];