@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1 @@
1
/**
 * Computes PageRank/HITS over a snapshot's link graph, persists per-page
 * metrics and cluster data, and finalizes the snapshot with aggregate stats.
 *
 * @param snapshotId - Snapshot to process; logs an error and returns if missing.
 * @param maxDepth - Crawl depth limit forwarded to aggregate metric calculation.
 * @param limitReached - Whether the crawl stopped because it hit its page limit.
 */
export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitReached?: boolean): void;
@@ -0,0 +1,108 @@
1
import { getDb } from '../db/index.js';
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
import { PageRepository } from '../db/repositories/PageRepository.js';
import { computePageRank } from '../graph/pagerank.js';
import { calculateMetrics } from '../graph/metrics.js';
import { computeHITS } from '../scoring/hits.js';
/**
 * Runs post-crawl analysis for a snapshot: computes PageRank and HITS over
 * the stored link graph, persists per-page metrics and cluster summaries in
 * a single transaction, then finalizes the snapshot with aggregate stats
 * (health score, orphan count, thin-content count).
 *
 * @param {number} snapshotId - Snapshot to process; logs and returns if missing.
 * @param {number} maxDepth - Crawl depth limit, forwarded to calculateMetrics.
 * @param {boolean} [limitReached=false] - Whether the crawl hit its page limit.
 */
export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false) {
    const db = getDb();
    const metricsRepo = new MetricsRepository(db);
    const snapshotRepo = new SnapshotRepository(db);
    const pageRepo = new PageRepository(db);
    const snapshot = snapshotRepo.getSnapshot(snapshotId);
    if (!snapshot) {
        console.error(`Snapshot ${snapshotId} not found`);
        return;
    }
    console.log('Loading graph for metrics calculation...');
    const graph = loadGraphFromSnapshot(snapshotId);
    console.log('Computing PageRank...');
    computePageRank(graph);
    console.log('Computing HITS...');
    computeHITS(graph);
    console.log('Updating metrics in DB...');
    const nodes = graph.getNodes();
    const tx = db.transaction(() => {
        for (const node of nodes) {
            const pageId = pageRepo.getIdByUrl(snapshot.site_id, node.url);
            if (!pageId)
                continue;
            // Preserve crawl-time columns (word count, orphan score, ...) that
            // were written earlier; only graph-derived columns are refreshed.
            const existing = metricsRepo.getMetricsForPage(snapshotId, pageId);
            metricsRepo.insertMetrics({
                snapshot_id: snapshotId,
                page_id: pageId,
                authority_score: node.authorityScore ?? null,
                hub_score: node.hubScore ?? null,
                pagerank: node.pageRank ?? null,
                pagerank_score: node.pageRankScore ?? null,
                link_role: node.linkRole ?? null,
                crawl_status: existing?.crawl_status ?? null,
                word_count: existing?.word_count ?? null,
                thin_content_score: existing?.thin_content_score ?? null,
                external_link_ratio: existing?.external_link_ratio ?? null,
                orphan_score: existing?.orphan_score ?? null,
                duplicate_cluster_id: node.duplicateClusterId ?? null,
                duplicate_type: node.duplicateType ?? null,
                is_cluster_primary: node.isClusterPrimary ? 1 : 0
            });
            // Update page-level crawl trap data
            if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
                pageRepo.upsertPage({
                    site_id: snapshot.site_id,
                    normalized_url: node.url,
                    last_seen_snapshot_id: snapshotId,
                    redirect_chain: node.redirectChain ? JSON.stringify(node.redirectChain) : null,
                    bytes_received: node.bytesReceived ?? null,
                    crawl_trap_flag: node.crawlTrapFlag ? 1 : 0,
                    crawl_trap_risk: node.crawlTrapRisk ?? null,
                    trap_type: node.trapType ?? null,
                });
            }
        }
        // Save duplicate clusters
        if (graph.duplicateClusters.length > 0) {
            const clusterStmt = db.prepare(`
        INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
        VALUES (?, ?, ?, ?, ?, ?)
      `);
            for (const cluster of graph.duplicateClusters) {
                clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
            }
        }
        // Save content clusters
        if (graph.contentClusters.length > 0) {
            const contentStmt = db.prepare(`
        INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
        VALUES (?, ?, ?, ?, ?, ?)
      `);
            for (const cluster of graph.contentClusters) {
                contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
            }
        }
    });
    tx();
    console.log('Computing aggregate stats...');
    const metrics = calculateMetrics(graph, maxDepth);
    let totalScore = 0;
    let totalWeight = 0;
    for (const node of nodes) {
        // FIX: use nullish coalescing so a legitimate score of 0 is kept;
        // `||` silently fell through to the next fallback on 0.
        const score = node.authorityScore ?? node.pageRankScore ?? 0;
        // Depth-weighted average: shallow pages count more. Guard against a
        // missing depth so a single undefined value cannot turn the sum NaN.
        const weight = 1 / ((node.depth ?? 0) + 1);
        totalScore += score * weight;
        totalWeight += weight;
    }
    const healthScore = totalWeight > 0 ? (totalScore / totalWeight) * 100 : 0;
    const thinCountRow = db.prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70').get(snapshotId);
    snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
        node_count: metrics.totalPages,
        edge_count: metrics.totalEdges,
        health_score: healthScore,
        orphan_count: metrics.orphanPages.length,
        thin_content_count: thinCountRow.count,
        limit_reached: limitReached ? 1 : 0
    });
    console.log('Metrics calculation complete.');
}
@@ -0,0 +1,7 @@
1
/**
 * Normalizes a URL string based on specific rules: resolves it against a
 * base, enforces http/https, lowercases the host, removes default ports,
 * fragments and tracking parameters (utm_*, fbclid, gclid, msclkid), sorts
 * the surviving query keys, collapses duplicate slashes, trims the trailing
 * slash, and rejects known non-HTML asset extensions (.pdf, .jpg, ...).
 */
export interface NormalizeOptions {
    /** When true, the entire query string is discarded. */
    stripQuery?: boolean;
}
/**
 * @param input - Absolute or relative URL to normalize.
 * @param base - Base URL for resolving `input`; pass '' when already absolute.
 * @param options - See NormalizeOptions.
 * @returns The normalized URL, or null when the URL is unparseable,
 *   non-http(s), or points at a skipped asset extension.
 */
export declare function normalizeUrl(input: string, base: string, options?: NormalizeOptions): string | null;
@@ -0,0 +1,88 @@
1
// Query-string keys that carry tracking state rather than content identity.
const TRACKING_PARAMS = new Set([
    'utm_source',
    'utm_medium',
    'utm_campaign',
    'utm_term',
    'utm_content',
    'fbclid',
    'gclid',
    'msclkid'
]);
// Path extensions that indicate non-HTML assets the crawler should skip.
const SKIP_EXTENSIONS = new Set([
    '.pdf', '.jpg', '.png', '.svg', '.webp', '.gif',
    '.zip', '.xml', '.json', '.mp4'
]);
/**
 * Canonicalizes a URL for crawling: resolves it against `base`, enforces
 * http(s), lowercases the host, drops default ports, fragments and tracking
 * parameters, sorts the remaining query keys, collapses duplicate slashes,
 * trims the trailing slash, and rejects known asset extensions.
 *
 * @param {string} input - Absolute or relative URL.
 * @param {string} base - Base for resolution; '' means `input` is absolute.
 * @param {{stripQuery?: boolean}} [options] - stripQuery drops the whole query.
 * @returns {string|null} Normalized URL, or null when not crawlable.
 */
export function normalizeUrl(input, base, options = {}) {
    try {
        // Resolve against the base only when one was supplied.
        const url = base ? new URL(input, base) : new URL(input);
        // Only web pages are crawlable.
        if (url.protocol !== 'http:' && url.protocol !== 'https:') {
            return null;
        }
        url.hostname = url.hostname.toLowerCase();
        const onDefaultPort =
            (url.protocol === 'http:' && url.port === '80') ||
            (url.protocol === 'https:' && url.port === '443');
        if (onDefaultPort) {
            url.port = '';
        }
        url.hash = '';
        if (options.stripQuery) {
            url.search = '';
        }
        else {
            // Drop tracking parameters, then sort keys for a stable cache key.
            const kept = new URLSearchParams();
            for (const [key, value] of new URLSearchParams(url.search)) {
                if (key.startsWith('utm_') || TRACKING_PARAMS.has(key)) {
                    continue;
                }
                kept.append(key, value);
            }
            kept.sort();
            url.search = kept.toString();
        }
        // Collapse duplicate slashes, then trim a trailing slash (except root).
        let path = url.pathname.replace(/\/+/g, '/');
        if (path.length > 1 && path.endsWith('/')) {
            path = path.slice(0, -1);
        }
        url.pathname = path;
        // Reject known non-HTML asset extensions.
        const dot = url.pathname.lastIndexOf('.');
        if (dot !== -1 && SKIP_EXTENSIONS.has(url.pathname.slice(dot).toLowerCase())) {
            return null;
        }
        return url.toString();
    }
    catch (_e) {
        // Anything unparseable is simply not a crawlable URL.
        return null;
    }
}
@@ -0,0 +1,22 @@
1
/** A single outbound link with its semantic placement weight. */
export interface ParseLink {
    /** Absolute URL with the fragment removed. */
    url: string;
    /** 1.0 body, 0.7 nav/header, 0.4 footer; highest wins on duplicates. */
    weight: number;
}
/** Result of parsing one HTML document. */
export interface ParseResult {
    /** Outbound http(s) links; empty when page-level nofollow is set. */
    links: ParseLink[];
    /** The raw input HTML, passed through for downstream analysis. */
    html: string;
    /** Normalized canonical URL from <link rel="canonical">, if valid. */
    canonical: string | null;
    /** True when the robots meta contains 'noindex' or 'none'. */
    noindex: boolean;
    /** True when the robots meta contains 'nofollow' or 'none'. */
    nofollow: boolean;
    /** sha256 hex digest of the visible body text. */
    contentHash: string;
    /** SimHash of the first 50k chars of body text, stringified. */
    simhash?: string;
    /** unique tokens / total tokens over the sampled text (0 if empty). */
    uniqueTokenRatio?: number;
    /** Heuristic soft-404 likelihood in [0, 1]; 0 for non-200 responses. */
    soft404Score: number;
    /** Names of the heuristics that contributed to soft404Score. */
    soft404Signals: string[];
}
export declare class Parser {
    /**
     * Parses HTML content to extract metadata and links.
     */
    parse(html: string, baseUrl: string, status: number): ParseResult;
}
@@ -0,0 +1,158 @@
1
import * as cheerio from 'cheerio';
import crypto from 'node:crypto';
import { normalizeUrl } from './normalize.js';
import { SimHash } from '../graph/simhash.js';
export class Parser {
    /**
     * Parses HTML content to extract metadata and links.
     *
     * @param {string} html - Raw HTML document.
     * @param {string} baseUrl - URL the document was fetched from; used to
     *   resolve relative hrefs and canonicals.
     * @param {number} status - HTTP status code; soft-404 heuristics only run
     *   for 200 responses.
     * @returns {ParseResult} Links (with placement weights), robots directives,
     *   canonical URL, content hash/simhash and soft-404 signals.
     */
    parse(html, baseUrl, status) {
        const $ = cheerio.load(html);
        // 1. Robots meta: 'none' is shorthand for 'noindex, nofollow'.
        let noindex = false;
        let nofollow = false;
        const robotsMeta = $('meta[name="robots"]').attr('content');
        if (robotsMeta) {
            const directives = robotsMeta.toLowerCase().split(',').map(s => s.trim());
            if (directives.includes('noindex') || directives.includes('none'))
                noindex = true;
            if (directives.includes('nofollow') || directives.includes('none'))
                nofollow = true;
        }
        // 2. Canonical, resolved against the page URL.
        let canonical = null;
        const canonicalLink = $('link[rel="canonical"]').attr('href');
        if (canonicalLink) {
            try {
                const u = new URL(canonicalLink, baseUrl);
                // Normalize minimally (lowercase host, default ports, etc).
                // Query is kept: it can be significant for the canonical target.
                canonical = normalizeUrl(u.toString(), '', { stripQuery: false });
            }
            catch (_e) {
                // Invalid canonical URL, ignore
            }
        }
        // 3. Links: map url -> highest weight seen for that url.
        const links = new Map();
        if (!nofollow) { // Don't extract links if page-level nofollow is set
            $('a').each((_, element) => {
                const href = $(element).attr('href');
                const rel = $(element).attr('rel');
                const isNofollow = rel && rel.toLowerCase().includes('nofollow');
                if (href && !isNofollow) {
                    try {
                        const absoluteUrl = new URL(href, baseUrl);
                        if (absoluteUrl.protocol === 'http:' || absoluteUrl.protocol === 'https:') {
                            absoluteUrl.hash = '';
                            const urlStr = absoluteUrl.toString();
                            // Weight by placement: body 1.0, nav/header 0.7, footer 0.4.
                            let weight = 1.0;
                            const $el = $(element);
                            if ($el.closest('nav').length > 0 || $el.closest('header').length > 0) {
                                weight = 0.7;
                            }
                            else if ($el.closest('footer').length > 0) {
                                weight = 0.4;
                            }
                            else {
                                // Fallback: look for nav/menu/footer hints in the
                                // class/id of the parent and grandparent elements.
                                const parentText = ($el.parent().attr('class') || '') + ($el.parent().attr('id') || '');
                                const grandParentText = ($el.parent().parent().attr('class') || '') + ($el.parent().parent().attr('id') || '');
                                const combinedContext = (parentText + grandParentText).toLowerCase();
                                if (combinedContext.includes('nav') || combinedContext.includes('menu')) {
                                    weight = 0.7;
                                }
                                else if (combinedContext.includes('footer')) {
                                    weight = 0.4;
                                }
                            }
                            // Keep the highest weight when a URL appears several times.
                            const currentMax = links.get(urlStr) || 0;
                            if (weight > currentMax) {
                                links.set(urlStr, weight);
                            }
                        }
                    }
                    catch (_e) {
                        // Invalid URL
                    }
                }
            });
        }
        // 4. Content hash over visible text (non-content elements stripped;
        // links were already extracted above, so mutating the doc is safe).
        $('script').remove();
        $('style').remove();
        $('noscript').remove();
        $('iframe').remove();
        const cleanText = $('body').text().replace(/\s+/g, ' ').trim();
        const contentHash = crypto.createHash('sha256').update(cleanText).digest('hex');
        // 4b. Simhash & token stats (limit to 50k chars for performance).
        const limitedText = cleanText.substring(0, 50000).toLowerCase();
        const tokens = limitedText.split(/\W+/).filter(t => t.length > 0);
        const uniqueTokens = new Set(tokens);
        const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
        const simhash = SimHash.generate(tokens).toString();
        // 5. Soft 404 detection (only meaningful on 200 responses).
        let soft404Score = 0;
        const soft404Signals = [];
        if (status === 200) {
            const title = $('title').text().toLowerCase();
            const h1Text = $('h1').first().text().toLowerCase();
            const bodyText = cleanText.toLowerCase();
            const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];
            // Title carries the strongest signal; only the first match counts.
            for (const pattern of errorPatterns) {
                if (title.includes(pattern)) {
                    soft404Score += 0.4;
                    soft404Signals.push(`title_pattern_${pattern.replace(/\s+/g, '_')}`);
                    break;
                }
            }
            for (const pattern of errorPatterns) {
                if (h1Text.includes(pattern)) {
                    soft404Score += 0.3;
                    soft404Signals.push(`h1_pattern_${pattern.replace(/\s+/g, '_')}`);
                    break;
                }
            }
            if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
                soft404Score += 0.2;
                soft404Signals.push('body_error_phrase');
            }
            // Content length check (word count approximation)
            const words = cleanText.split(/\s+/).filter(w => w.length > 0);
            if (words.length < 50) {
                soft404Score += 0.3;
                soft404Signals.push('very_low_word_count');
            }
            else if (words.length < 150) {
                soft404Score += 0.1;
                soft404Signals.push('low_word_count');
            }
            // Link count check.
            // FIX: only flag 'no_outbound_links' when link extraction actually
            // ran; with page-level nofollow the map is empty by design and the
            // page must not be penalized for it.
            if (!nofollow && links.size === 0) {
                soft404Score += 0.2;
                soft404Signals.push('no_outbound_links');
            }
            // Cap at 1.0
            soft404Score = Math.min(1.0, soft404Score);
        }
        return {
            links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
            html: html, // pass raw HTML for analysis
            canonical,
            noindex,
            nofollow,
            contentHash,
            simhash,
            uniqueTokenRatio,
            soft404Score,
            soft404Signals
        };
    }
}
@@ -0,0 +1,8 @@
1
export declare class Sitemap {
    /**
     * Fetches and parses a sitemap (or sitemap index) to extract URLs.
     * Recursively handles sitemap indexes with loop detection and a hard cap
     * of 50 sitemap fetches per call. Returns normalized, deduplicated page
     * URLs; fetch or parse failures are logged and contribute no URLs.
     */
    fetch(url: string): Promise<string[]>;
    private processSitemap;
}
@@ -0,0 +1,70 @@
1
import { request } from 'undici';
import * as cheerio from 'cheerio';
import { normalizeUrl } from './normalize.js';
export class Sitemap {
    /**
     * Fetches and parses a sitemap (or sitemap index) to extract URLs.
     * Recursively handles sitemap indexes with loop detection and a hard
     * cap on the total number of sitemap fetches.
     *
     * @param {string} url - Sitemap URL to start from.
     * @returns {Promise<string[]>} Deduplicated, normalized page URLs.
     */
    async fetch(url) {
        const seen = new Set();
        const collected = new Set();
        await this.processSitemap(url, seen, collected);
        return [...collected];
    }
    /**
     * Fetches one sitemap document and either recurses into its child
     * sitemaps (index file) or collects its <url><loc> entries.
     */
    async processSitemap(url, visited, urls) {
        if (visited.has(url))
            return;
        visited.add(url);
        // Hard limit on number of sitemaps to fetch to prevent abuse
        if (visited.size > 50)
            return;
        try {
            const res = await request(url, {
                maxRedirections: 3,
                headers: { 'User-Agent': 'crawlith/1.0' },
                headersTimeout: 10000,
                bodyTimeout: 10000
            });
            if (res.statusCode < 200 || res.statusCode >= 300) {
                // Drain the body so the connection can be reused.
                await res.body.dump();
                return;
            }
            const xml = await res.body.text();
            // Basic validation: the payload must at least look like XML.
            if (!xml.trim().startsWith('<'))
                return;
            const $ = cheerio.load(xml, { xmlMode: true });
            // A <sitemap><loc> entry means this is a sitemap index.
            const childSitemaps = $('sitemap > loc')
                .toArray()
                .map((el) => $(el).text().trim())
                .filter((loc) => loc.length > 0);
            if (childSitemaps.length > 0) {
                // Recurse sequentially to avoid a massive concurrency spike.
                for (const childUrl of childSitemaps) {
                    await this.processSitemap(childUrl, visited, urls);
                }
                return;
            }
            // Otherwise it's a URL set: collect each normalized location.
            for (const el of $('url > loc').toArray()) {
                const loc = $(el).text().trim();
                if (!loc)
                    continue;
                const normalized = normalizeUrl(loc, '');
                if (normalized) {
                    urls.add(normalized);
                }
            }
        }
        catch (e) {
            console.warn(`Failed to fetch sitemap ${url}:`, e);
        }
    }
}
@@ -0,0 +1,24 @@
1
/** Categories of crawl traps the detector can flag. */
export type TrapType = 'faceted_navigation' | 'calendar_trap' | 'pagination_loop' | 'session_trap';
/** Outcome of a single trap check. */
export interface TrapResult {
    /** Risk estimate in [0, 1]; 0 means no trap signal for this URL. */
    risk: number;
    /** The matched trap category, or null when no heuristic fired. */
    type: TrapType | null;
}
/**
 * Stateful crawl-trap detector. It accumulates per-path query-parameter
 * combinations and pagination extremes across calls, so call reset()
 * between unrelated crawl sessions.
 */
export declare class TrapDetector {
    /** path (origin+pathname) -> set of distinct sorted query strings seen. */
    private pathCounters;
    /** path -> highest numeric page parameter observed so far. */
    private paginationCounters;
    /** Lowercased query keys treated as session identifiers. */
    private sessionParams;
    /** Distinct param combos per path before flagging faceted navigation. */
    private PARAM_EXPLOSION_THRESHOLD;
    /** Page number above which pagination is treated as a loop. */
    private PAGINATION_THRESHOLD;
    constructor(options?: {
        paramThreshold?: number;
        paginationThreshold?: number;
    });
    /**
     * Checks if a URL represents a potential crawl trap.
     */
    checkTrap(rawUrl: string, _depth: number): TrapResult;
    /**
     * Resets internal state (useful for multi-crawl sessions if needed)
     */
    reset(): void;
}
@@ -0,0 +1,78 @@
1
/**
 * Stateful crawl-trap detector. Tracks per-path query-parameter variety and
 * pagination extremes across calls to spot faceted-navigation explosions,
 * session-id URLs, calendar archives and runaway pagination.
 */
export class TrapDetector {
    // path (origin + pathname) -> set of distinct sorted query strings seen
    pathCounters = new Map();
    // path -> highest numeric page parameter observed so far
    paginationCounters = new Map();
    // lowercased query keys treated as session identifiers
    sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);
    // Configurable thresholds
    PARAM_EXPLOSION_THRESHOLD = 30;
    PAGINATION_THRESHOLD = 50;
    constructor(options = {}) {
        // FIX: nullish coalescing so any explicitly provided threshold is
        // honored; the previous truthiness check silently ignored 0.
        this.PARAM_EXPLOSION_THRESHOLD = options.paramThreshold ?? this.PARAM_EXPLOSION_THRESHOLD;
        this.PAGINATION_THRESHOLD = options.paginationThreshold ?? this.PAGINATION_THRESHOLD;
    }
    /**
     * Checks if a URL represents a potential crawl trap.
     *
     * @param {string} rawUrl - Absolute URL to inspect.
     * @param {number} _depth - Crawl depth (currently unused).
     * @returns {{risk: number, type: string|null}} Highest risk in [0, 1] and
     *   the matching trap type (null when no heuristic fired).
     */
    checkTrap(rawUrl, _depth) {
        let risk = 0;
        let type = null;
        try {
            const u = new URL(rawUrl);
            const params = new URLSearchParams(u.search);
            const pathname = u.pathname;
            const pathKey = `${u.origin}${pathname}`;
            // 1. Session IDs / Tracking Parameters
            for (const [key] of params) {
                if (this.sessionParams.has(key.toLowerCase()) || key.toLowerCase().includes('session')) {
                    risk = Math.max(risk, 0.9);
                    type = 'session_trap';
                }
            }
            // 2. Calendar Pattern Detection
            // Matches /2023/12/01, /2023-12-01, /12-01-2023 etc.
            // FIX: the date may end the path — this package's normalizeUrl
            // strips trailing slashes, so accept '/' OR end-of-string after
            // the date instead of requiring a trailing '/'.
            const calendarRegex = /\/\d{4}[-/]\d{2}[-/]\d{2}(?:\/|$)|\/\d{2}[-/]\d{2}[-/]\d{4}(?:\/|$)/;
            if (calendarRegex.test(pathname)) {
                risk = Math.max(risk, 0.7);
                type = 'calendar_trap';
            }
            // 3. Pagination Loop
            const pageParam = params.get('page') || params.get('p') || params.get('pg');
            if (pageParam && /^\d+$/.test(pageParam)) {
                const pageNum = parseInt(pageParam, 10);
                const currentMaxPage = this.paginationCounters.get(pathKey) || 0;
                if (pageNum > currentMaxPage) {
                    this.paginationCounters.set(pathKey, pageNum);
                }
                if (pageNum > this.PAGINATION_THRESHOLD) {
                    risk = Math.max(risk, 0.85);
                    type = 'pagination_loop';
                }
            }
            // 4. Infinite Parameter Explosion (Faceted Navigation)
            if (params.size > 0) {
                const paramSet = this.pathCounters.get(pathKey) || new Set();
                params.sort();
                paramSet.add(params.toString());
                this.pathCounters.set(pathKey, paramSet);
                if (paramSet.size > this.PARAM_EXPLOSION_THRESHOLD) {
                    risk = Math.max(risk, 0.95);
                    if (!type)
                        type = 'faceted_navigation';
                }
            }
        }
        catch (_e) {
            // Invalid URL: report no risk rather than throwing mid-crawl.
        }
        return { risk, type };
    }
    /**
     * Resets internal state (useful for multi-crawl sessions if needed)
     */
    reset() {
        this.pathCounters.clear();
        this.paginationCounters.clear();
    }
}
@@ -0,0 +1,2 @@
1
import { Graph } from '../graph/graph.js';
/**
 * Rebuilds the in-memory link Graph for a stored snapshot: nodes with their
 * page/metric data, edges, and duplicate/content cluster summaries.
 */
export declare function loadGraphFromSnapshot(snapshotId: number): Graph;
@@ -0,0 +1,96 @@
1
import { getDb } from './index.js';
import { PageRepository } from './repositories/PageRepository.js';
import { EdgeRepository } from './repositories/EdgeRepository.js';
import { MetricsRepository } from './repositories/MetricsRepository.js';
import { SnapshotRepository } from './repositories/SnapshotRepository.js';
import { Graph } from '../graph/graph.js';
/**
 * Rebuilds the in-memory link Graph for a stored snapshot: one node per
 * page (augmented with page columns and metric rows), one edge per stored
 * edge, plus the snapshot's duplicate- and content-cluster summaries.
 *
 * @param {number} snapshotId - Snapshot whose pages/edges/metrics to load.
 * @returns {Graph} The reconstructed graph.
 */
export function loadGraphFromSnapshot(snapshotId) {
    const db = getDb();
    const pageRepo = new PageRepository(db);
    const edgeRepo = new EdgeRepository(db);
    const metricsRepo = new MetricsRepository(db);
    const snapshotRepo = new SnapshotRepository(db);
    const pages = pageRepo.getPagesBySnapshot(snapshotId);
    const snapshot = snapshotRepo.getSnapshot(snapshotId);
    // Index metric rows by page id for O(1) lookup while building nodes.
    const metricsByPage = new Map(metricsRepo.getMetrics(snapshotId).map((row) => [row.page_id, row]));
    const graph = new Graph();
    if (snapshot) {
        graph.limitReached = Boolean(snapshot.limit_reached);
    }
    const urlById = new Map();
    for (const page of pages) {
        urlById.set(page.id, page.normalized_url);
        graph.addNode(page.normalized_url, page.depth, page.http_status || 0);
        const metric = metricsByPage.get(page.id);
        // 'new' = first seen in this snapshot; otherwise derive from the
        // recorded crawl status (cached -> unchanged, fetched -> changed).
        let incrementalStatus;
        if (page.first_seen_snapshot_id === snapshotId) {
            incrementalStatus = 'new';
        }
        else if (metric?.crawl_status === 'cached') {
            incrementalStatus = 'unchanged';
        }
        else if (metric?.crawl_status === 'fetched') {
            incrementalStatus = 'changed';
        }
        graph.updateNodeData(page.normalized_url, {
            canonical: page.canonical_url || undefined,
            contentHash: page.content_hash || undefined,
            simhash: page.simhash || undefined,
            etag: page.etag || undefined,
            lastModified: page.last_modified || undefined,
            html: page.html || undefined,
            soft404Score: page.soft404_score || undefined,
            noindex: Boolean(page.noindex),
            nofollow: Boolean(page.nofollow),
            incrementalStatus,
            securityError: page.security_error || undefined,
            retries: page.retries || undefined,
            bytesReceived: page.bytes_received || undefined,
            redirectChain: page.redirect_chain ? JSON.parse(page.redirect_chain) : undefined,
            crawlTrapFlag: Boolean(page.crawl_trap_flag),
            crawlTrapRisk: page.crawl_trap_risk || undefined,
            trapType: page.trap_type || undefined,
            // Metrics
            pageRank: metric?.pagerank ?? undefined,
            pageRankScore: metric?.pagerank_score ?? metric?.pagerank ?? undefined,
            authorityScore: metric?.authority_score ?? undefined,
            hubScore: metric?.hub_score ?? undefined,
            linkRole: metric?.link_role ?? undefined,
            // Duplicate info
            duplicateClusterId: metric?.duplicate_cluster_id ?? undefined,
            duplicateType: metric?.duplicate_type ?? undefined,
            isClusterPrimary: metric?.is_cluster_primary ? true : undefined,
        });
    }
    // Edges reference pages by id; skip any edge whose endpoint is missing.
    for (const edge of edgeRepo.getEdgesBySnapshot(snapshotId)) {
        const sourceUrl = urlById.get(edge.source_page_id);
        const targetUrl = urlById.get(edge.target_page_id);
        if (sourceUrl && targetUrl) {
            graph.addEdge(sourceUrl, targetUrl, edge.weight || 1.0);
        }
    }
    // Load duplicate clusters
    graph.duplicateClusters = db
        .prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?')
        .all(snapshotId)
        .map((row) => ({
            id: row.id,
            type: row.type,
            size: row.size,
            representative: row.representative,
            severity: row.severity
        }));
    // Load content clusters
    graph.contentClusters = db
        .prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?')
        .all(snapshotId)
        .map((row) => ({
            id: row.id,
            count: row.count,
            primaryUrl: row.primary_url,
            risk: row.risk,
            sharedPathPrefix: row.shared_path_prefix || undefined
        }));
    return graph;
}
@@ -0,0 +1,4 @@
1
import Database from 'better-sqlite3';
/** Returns the filesystem path of the SQLite database file. */
export declare function getDbPath(): string;
/** Returns the shared better-sqlite3 connection (presumably opened lazily on first use — confirm in implementation). */
export declare function getDb(): Database.Database;
/** Closes the shared database connection. */
export declare function closeDb(): void;