npm - @crawlith/core - Version diff - 0.1.0 → 0.1.1 - Mend

@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131)

package/CHANGELOG.md +6 -0
package/dist/analysis/analysis_list.html +35 -0
package/dist/analysis/analysis_page.html +123 -0
package/dist/analysis/analyze.d.ts +17 -3
package/dist/analysis/analyze.js +192 -248
package/dist/analysis/scoring.js +7 -1
package/dist/analysis/templates.d.ts +2 -0
package/dist/analysis/templates.js +7 -0
package/dist/core/security/ipGuard.d.ts +11 -0
package/dist/core/security/ipGuard.js +71 -3
package/dist/crawler/crawl.d.ts +4 -22
package/dist/crawler/crawl.js +4 -335
package/dist/crawler/crawler.d.ts +75 -0
package/dist/crawler/crawler.js +518 -0
package/dist/crawler/extract.d.ts +4 -1
package/dist/crawler/extract.js +7 -2
package/dist/crawler/fetcher.d.ts +1 -0
package/dist/crawler/fetcher.js +20 -5
package/dist/crawler/metricsRunner.d.ts +3 -1
package/dist/crawler/metricsRunner.js +55 -46
package/dist/crawler/sitemap.d.ts +3 -0
package/dist/crawler/sitemap.js +5 -1
package/dist/db/graphLoader.js +32 -3
package/dist/db/index.d.ts +3 -0
package/dist/db/index.js +4 -0
package/dist/db/repositories/EdgeRepository.d.ts +8 -0
package/dist/db/repositories/EdgeRepository.js +13 -0
package/dist/db/repositories/MetricsRepository.d.ts +3 -0
package/dist/db/repositories/MetricsRepository.js +14 -1
package/dist/db/repositories/PageRepository.d.ts +11 -0
package/dist/db/repositories/PageRepository.js +112 -19
package/dist/db/repositories/SiteRepository.d.ts +3 -0
package/dist/db/repositories/SiteRepository.js +9 -0
package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
package/dist/db/repositories/SnapshotRepository.js +23 -2
package/dist/events.d.ts +48 -0
package/dist/events.js +1 -0
package/dist/graph/cluster.js +62 -14
package/dist/graph/duplicate.js +242 -191
package/dist/graph/graph.d.ts +16 -0
package/dist/graph/graph.js +17 -4
package/dist/graph/metrics.js +12 -0
package/dist/graph/pagerank.js +2 -0
package/dist/graph/simhash.d.ts +6 -0
package/dist/graph/simhash.js +14 -0
package/dist/index.d.ts +5 -2
package/dist/index.js +5 -2
package/dist/lock/hashKey.js +1 -1
package/dist/lock/lockManager.d.ts +4 -1
package/dist/lock/lockManager.js +23 -13
package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
package/dist/report/crawlExport.d.ts +3 -0
package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
package/dist/report/crawl_template.d.ts +1 -0
package/dist/report/crawl_template.js +7 -0
package/dist/report/html.js +15 -216
package/dist/scoring/health.d.ts +50 -0
package/dist/scoring/health.js +170 -0
package/dist/scoring/hits.d.ts +1 -0
package/dist/scoring/hits.js +64 -44
package/dist/scoring/orphanSeverity.d.ts +5 -5
package/package.json +3 -3
package/scripts/copy-assets.js +37 -0
package/src/analysis/analysis_list.html +35 -0
package/src/analysis/analysis_page.html +123 -0
package/src/analysis/analyze.ts +218 -261
package/src/analysis/scoring.ts +8 -1
package/src/analysis/templates.ts +9 -0
package/src/core/security/ipGuard.ts +82 -3
package/src/crawler/crawl.ts +6 -379
package/src/crawler/crawler.ts +601 -0
package/src/crawler/extract.ts +7 -2
package/src/crawler/fetcher.ts +24 -6
package/src/crawler/metricsRunner.ts +60 -47
package/src/crawler/sitemap.ts +4 -1
package/src/db/graphLoader.ts +33 -3
package/src/db/index.ts +5 -0
package/src/db/repositories/EdgeRepository.ts +14 -0
package/src/db/repositories/MetricsRepository.ts +15 -1
package/src/db/repositories/PageRepository.ts +119 -19
package/src/db/repositories/SiteRepository.ts +11 -0
package/src/db/repositories/SnapshotRepository.ts +28 -3
package/src/events.ts +16 -0
package/src/graph/cluster.ts +69 -15
package/src/graph/duplicate.ts +249 -185
package/src/graph/graph.ts +24 -4
package/src/graph/metrics.ts +15 -0
package/src/graph/pagerank.ts +1 -0
package/src/graph/simhash.ts +15 -0
package/src/index.ts +5 -2
package/src/lock/hashKey.ts +1 -1
package/src/lock/lockManager.ts +21 -13
package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
package/src/report/crawl_template.ts +9 -0
package/src/report/html.ts +17 -217
package/src/scoring/health.ts +241 -0
package/src/scoring/hits.ts +67 -45
package/src/scoring/orphanSeverity.ts +8 -8
package/tests/analysis.unit.test.ts +44 -0
package/tests/analyze.integration.test.ts +88 -53
package/tests/analyze_markdown.test.ts +98 -0
package/tests/audit/audit.test.ts +101 -0
package/tests/audit/scoring.test.ts +25 -25
package/tests/audit/transport.test.ts +0 -1
package/tests/clustering_risk.test.ts +118 -0
package/tests/crawler.test.ts +19 -13
package/tests/db/index.test.ts +134 -0
package/tests/db/repositories.test.ts +115 -0
package/tests/db_repos.test.ts +72 -0
package/tests/duplicate.test.ts +2 -2
package/tests/extract.test.ts +86 -0
package/tests/fetcher.test.ts +5 -1
package/tests/fetcher_safety.test.ts +9 -3
package/tests/graph/graph.test.ts +100 -0
package/tests/graphLoader.test.ts +124 -0
package/tests/html_report.test.ts +52 -51
package/tests/ipGuard.test.ts +73 -0
package/tests/lock/lockManager.test.ts +77 -17
package/tests/normalize.test.ts +6 -19
package/tests/orphanSeverity.test.ts +9 -9
package/tests/redirect_safety.test.ts +5 -1
package/tests/renderAnalysisCsv.test.ts +183 -0
package/tests/safety.test.ts +12 -0
package/tests/scope.test.ts +18 -0
package/tests/scoring.test.ts +25 -24
package/tests/sitemap.test.ts +13 -1
package/tests/ssrf_fix.test.ts +69 -0
package/tests/visualization_data.test.ts +10 -10
package/dist/report/sitegraphExport.d.ts +0 -3
package/dist/report/sitegraph_template.d.ts +0 -1

package/tests/clustering_risk.test.ts ADDED Viewed

@@ -0,0 +1,118 @@
+import { describe, it, expect, beforeEach } from 'vitest';
+import { Graph } from '../src/graph/graph.js';
+import { detectContentClusters } from '../src/graph/cluster.js';
+describe('Cluster Risk Heuristic', () => {
+    let graph: Graph;
+    beforeEach(() => {
+        graph = new Graph();
+    });
+    it('should assign HIGH risk to clusters with identical titles', () => {
+        const html = '<html><head><title>Duplicate Title</title></head><body>Content</body></html>';
+        const h = 0b101010n.toString();
+        graph.addNode('https://example.com/p1', 0, 200);
+        graph.addNode('https://example.com/p2', 0, 200);
+        graph.addNode('https://example.com/p3', 0, 200);
+        graph.updateNodeData('https://example.com/p1', { simhash: h, html });
+        graph.updateNodeData('https://example.com/p2', { simhash: h, html });
+        graph.updateNodeData('https://example.com/p3', { simhash: h, html });
+        const clusters = detectContentClusters(graph, 2, 2);
+        expect(clusters.length).toBe(1);
+        expect(clusters[0].risk).toBe('high');
+    });
+    it('should assign HIGH risk to clusters with identical H1s', () => {
+        const h = 0b101010n.toString();
+        graph.addNode('https://example.com/p1', 0, 200);
+        graph.addNode('https://example.com/p2', 0, 200);
+        graph.addNode('https://example.com/p3', 0, 200);
+        // Different titles, same H1
+        graph.updateNodeData('https://example.com/p1', {
+            simhash: h,
+            html: '<html><head><title>Page 1</title></head><body><h1>Duplicate Header</h1></body></html>'
+        });
+        graph.updateNodeData('https://example.com/p2', {
+            simhash: h,
+            html: '<html><head><title>Page 2</title></head><body><h1>Duplicate Header</h1></body></html>'
+        });
+        graph.updateNodeData('https://example.com/p3', {
+            simhash: h,
+            html: '<html><head><title>Page 3</title></head><body><h1>Duplicate Header</h1></body></html>'
+        });
+        const clusters = detectContentClusters(graph, 2, 2);
+        expect(clusters.length).toBe(1);
+        expect(clusters[0].risk).toBe('high');
+    });
+    it('should assign LOW risk to small clusters with unique titles and H1s', () => {
+        const h = 0b101010n.toString();
+        graph.addNode('https://example.com/p1', 0, 200);
+        graph.addNode('https://example.com/p2', 0, 200);
+        graph.addNode('https://example.com/p3', 0, 200);
+        graph.updateNodeData('https://example.com/p1', {
+            simhash: h,
+            html: '<html><head><title>Page 1</title></head><body><h1>Header 1</h1></body></html>'
+        });
+        graph.updateNodeData('https://example.com/p2', {
+            simhash: h,
+            html: '<html><head><title>Page 2</title></head><body><h1>Header 2</h1></body></html>'
+        });
+        graph.updateNodeData('https://example.com/p3', {
+            simhash: h,
+            html: '<html><head><title>Page 3</title></head><body><h1>Header 3</h1></body></html>'
+        });
+        const clusters = detectContentClusters(graph, 2, 2);
+        expect(clusters.length).toBe(1);
+        expect(clusters[0].risk).toBe('low');
+    });
+    it('should assign MEDIUM risk to large clusters even with unique titles', () => {
+        const h = 0b101010n.toString();
+        // 12 nodes, all unique titles
+        for (let i = 0; i < 12; i++) {
+            const url = `https://example.com/p${i}`;
+            graph.addNode(url, 0, 200);
+            graph.updateNodeData(url, {
+                simhash: h,
+                html: `<html><head><title>Page ${i}</title></head><body><h1>Header ${i}</h1></body></html>`
+            });
+        }
+        const clusters = detectContentClusters(graph, 2, 2);
+        expect(clusters.length).toBe(1);
+        expect(clusters[0].risk).toBe('medium');
+    });
+    it('should handle missing HTML gracefully', () => {
+         const h = 0b101010n.toString();
+        graph.addNode('https://example.com/p1', 0, 200);
+        graph.addNode('https://example.com/p2', 0, 200);
+        // No HTML provided
+        graph.updateNodeData('https://example.com/p1', { simhash: h });
+        graph.updateNodeData('https://example.com/p2', { simhash: h });
+        const clusters = detectContentClusters(graph, 2, 2);
+        expect(clusters.length).toBe(1);
+        // Fallback to size based? 2 nodes -> low risk
+        expect(clusters[0].risk).toBe('low');
+    });
+});

package/tests/crawler.test.ts CHANGED Viewed

@@ -1,16 +1,22 @@
-import { test, expect, beforeEach, afterEach } from 'vitest';
+import { test, expect, beforeEach, afterEach, vi } from 'vitest';
 import { crawl } from '../src/crawler/crawl.js';
 import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
 import { closeDb } from '../src/db/index.js';
 import { MockAgent, setGlobalDispatcher } from 'undici';
+import { IPGuard } from '../src/core/security/ipGuard.js';
+import { EngineContext } from '../src/events.js';
 let mockAgent: MockAgent;
+const mockContext: EngineContext = { emit: vi.fn() };
 beforeEach(() => {
   process.env.CRAWLITH_DB_PATH = ':memory:';
   mockAgent = new MockAgent();
   mockAgent.disableNetConnect();
   setGlobalDispatcher(mockAgent);
+  // IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
+  vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
 });
 afterEach(() => {
@@ -68,7 +74,7 @@ test('crawler should crawl and build graph', async () => {
     depth: 2,
     ignoreRobots: false,
     rate: 1000
-  });
+  }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   const nodes = graph.getNodes();
@@ -110,7 +116,7 @@ test('hard page limit', async () => {
     depth: 5,
     ignoreRobots: true,
     rate: 1000
-  });
+  }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   // Should have visited root + 1 other page (total 2 nodes with status > 0)
@@ -138,7 +144,7 @@ test('hard depth cap', async () => {
     depth: 20, // requested 20, but internal hard cap is 10
     ignoreRobots: true,
     rate: 1000
-  });
+  }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   const crawledNodes = graph.getNodes().filter(n => n.status > 0);
@@ -172,7 +178,7 @@ test('parameter explosion control', async () => {
     stripQuery: false,
     detectTraps: true,
     rate: 1000
-  });
+  }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   // Should only crawl 5 variations + root
@@ -205,7 +211,7 @@ test('redirect safety', async () => {
     depth: 5,
     ignoreRobots: true,
     rate: 1000
-  });
+  }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   const destNode = graph.nodes.get('https://redirect.com/dest');
@@ -223,7 +229,7 @@ test('redirect safety', async () => {
   clientLoop.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
   // We might mock /a again if it retries, but it shouldn't infinitely loop
-  const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
+  const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
   const graphLoop = loadGraphFromSnapshot(snapshotIdLoop);
   // It should eventually stop
   expect(graphLoop.getNodes().length).toBeGreaterThan(0);
@@ -246,7 +252,7 @@ test('mime check', async () => {
     <html><a href="/data">Data</a></html>
   `, { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
+  const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   // /data should be in graph
@@ -267,7 +273,7 @@ test('self-link guard', async () => {
   client.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 });
+  const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   const edges = graph.getEdges();
@@ -288,7 +294,7 @@ test('limit warning', async () => {
   client.intercept({ path: '/1', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 });
+  const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   expect(graph.limitReached).toBe(true);
@@ -315,7 +321,7 @@ test('seeds from sitemap', async () => {
     ignoreRobots: true,
     sitemap: 'true',
     rate: 1000
-  });
+  }, mockContext);
   const graph = loadGraphFromSnapshot(snapshotId);
   const page1 = graph.nodes.get('https://sitemap-seed.com/page1');
@@ -332,7 +338,7 @@ test('incremental crawl uses etags', async () => {
     headers: { 'content-type': 'text/html', 'etag': '"v1"' }
   });
-  const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 });
+  const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 }, mockContext);
   const graph1 = loadGraphFromSnapshot(snapshotId1);
   const node1 = graph1.nodes.get('https://incremental.com/');
   expect(node1?.etag).toBe('"v1"');
@@ -350,7 +356,7 @@ test('incremental crawl uses etags', async () => {
     ignoreRobots: true,
     previousGraph: graph1,
     rate: 1000
-  });
+  }, mockContext);
   const graph2 = loadGraphFromSnapshot(snapshotId2);
   const node2 = graph2.nodes.get('https://incremental.com/');

package/tests/db/index.test.ts ADDED Viewed

@@ -0,0 +1,134 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { getDbPath, getDb, closeDb } from '../../src/db/index.js';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+vi.mock('node:fs');
+vi.mock('node:os');
+vi.mock('better-sqlite3', () => {
+  return {
+    default: vi.fn(function () {
+      return {
+        pragma: vi.fn().mockReturnValue('ok'),
+        prepare: vi.fn().mockReturnValue({
+          run: vi.fn(),
+          get: vi.fn(),
+          iterate: vi.fn(),
+          all: vi.fn()
+        }),
+        exec: vi.fn(),
+        close: vi.fn(),
+        transaction: vi.fn((fn) => fn),
+      };
+    }),
+  };
+});
+vi.mock('../../src/db/schema.js', () => ({
+  initSchema: vi.fn(),
+}));
+describe('DB Index', () => {
+  const originalEnv = process.env;
+  beforeEach(() => {
+    vi.resetAllMocks();
+    closeDb();
+    process.env = { ...originalEnv };
+    // Default mock behaviors
+    vi.mocked(os.homedir).mockReturnValue('/home/user');
+    vi.mocked(fs.existsSync).mockReturnValue(false);
+    vi.mocked(fs.mkdirSync).mockImplementation(() => undefined as any);
+    vi.mocked(fs.chmodSync).mockImplementation(() => undefined);
+  });
+  afterEach(() => {
+    process.env = originalEnv;
+    closeDb();
+  });
+  describe('getDbPath', () => {
+    it('should return :memory: in test environment', () => {
+      process.env.NODE_ENV = 'test';
+      expect(getDbPath()).toBe(':memory:');
+    });
+    it('should return custom path if CRAWLITH_DB_PATH is set', () => {
+      process.env.NODE_ENV = 'production';
+      process.env.CRAWLITH_DB_PATH = '/custom/path/db.sqlite';
+      expect(getDbPath()).toBe('/custom/path/db.sqlite');
+    });
+    it('should return default path in home dir if no env var', () => {
+      process.env.NODE_ENV = 'production';
+      delete process.env.CRAWLITH_DB_PATH;
+      const expectedPath = path.join('/home/user', '.crawlith', 'crawlith.db');
+      expect(getDbPath()).toBe(expectedPath);
+      expect(fs.mkdirSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), { recursive: true });
+      expect(fs.chmodSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), 0o700);
+    });
+    it('should not create dir if it exists', () => {
+      process.env.NODE_ENV = 'production';
+      vi.mocked(fs.existsSync).mockReturnValue(true);
+      getDbPath();
+      expect(fs.mkdirSync).not.toHaveBeenCalled();
+    });
+  });
+  describe('getDb', () => {
+     it('should create a new database instance', () => {
+       process.env.NODE_ENV = 'production';
+       const db = getDb();
+       expect(db).toBeDefined();
+       // Check if pragma was called
+       expect(db.pragma).toHaveBeenCalledWith('journal_mode = WAL');
+     });
+     it('should return existing instance if called twice', () => {
+        process.env.NODE_ENV = 'production';
+        const db1 = getDb();
+        const db2 = getDb();
+        expect(db1).toBe(db2);
+     });
+     it('should handle permission errors gracefully', () => {
+        process.env.NODE_ENV = 'production';
+        // Avoid getDbPath throwing
+        vi.mocked(fs.existsSync).mockReturnValue(true);
+        vi.mocked(fs.chmodSync).mockImplementation((path) => {
+           if (path.toString().endsWith('crawlith.db')) {
+             throw new Error('EPERM');
+           }
+        });
+        expect(() => getDb()).not.toThrow();
+     });
+     it('should warn if integrity check fails', async () => {
+         const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
+         process.env.NODE_ENV = 'production';
+         vi.mocked(fs.existsSync).mockReturnValue(true);
+         const MockDatabase = (await import('better-sqlite3')).default;
+         vi.mocked(MockDatabase).mockImplementationOnce(function() {
+            return {
+               pragma: vi.fn().mockReturnValue('corrupt'),
+               prepare: vi.fn(),
+               exec: vi.fn(),
+               close: vi.fn(),
+               transaction: vi.fn(),
+            } as any;
+         });
+         getDb();
+         expect(warnSpy).toHaveBeenCalledWith('Database integrity check failed:', 'corrupt');
+     });
+  });
+});

package/tests/db/repositories.test.ts ADDED Viewed

@@ -0,0 +1,115 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import Database from 'better-sqlite3';
+import { PageRepository } from '../../src/db/repositories/PageRepository.js';
+import { initSchema } from '../../src/db/schema.js';
+describe('PageRepository', () => {
+  let db: Database.Database;
+  let repo: PageRepository;
+  beforeEach(() => {
+    db = new Database(':memory:');
+    initSchema(db);
+    repo = new PageRepository(db);
+    // Seed required tables (sites, snapshots)
+    db.prepare("INSERT INTO sites (domain) VALUES ('example.com')").run();
+    db.prepare("INSERT INTO snapshots (site_id, type) VALUES (1, 'full')").run();
+  });
+  afterEach(() => {
+    db.close();
+  });
+  it('should get pages by URLs in chunks', () => {
+    const urls: string[] = [];
+    const siteId = 1;
+    const snapshotId = 1;
+    // Create 1000 pages (chunk size is 900)
+    const insertStmt = db.prepare(`
+      INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
+      VALUES (?, ?, ?, ?)
+    `);
+    const tx = db.transaction(() => {
+      for (let i = 0; i < 1000; i++) {
+        const url = `http://example.com/page${i}`;
+        urls.push(url);
+        insertStmt.run(siteId, url, snapshotId, snapshotId);
+      }
+    });
+    tx();
+    // Fetch pages
+    const pages = repo.getPagesByUrls(siteId, urls);
+    expect(pages).toHaveLength(1000);
+    expect(pages[0].normalized_url).toBe('http://example.com/page0');
+    expect(pages[999].normalized_url).toBe('http://example.com/page999');
+  });
+  it('should return empty array for empty URL list', () => {
+    const pages = repo.getPagesByUrls(1, []);
+    expect(pages).toEqual([]);
+  });
+  it('should iterate over pages by snapshot', () => {
+    const siteId = 1;
+    const snapshotId = 1;
+    const insertStmt = db.prepare(`
+      INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
+      VALUES (?, ?, ?, ?)
+    `);
+    db.transaction(() => {
+      insertStmt.run(siteId, 'http://example.com/1', snapshotId, snapshotId);
+      insertStmt.run(siteId, 'http://example.com/2', snapshotId, snapshotId);
+      insertStmt.run(siteId, 'http://example.com/3', snapshotId, snapshotId);
+    })();
+    const iterator = repo.getPagesIteratorBySnapshot(snapshotId);
+    const pages = Array.from(iterator);
+    expect(pages).toHaveLength(3);
+    expect(pages.map(p => p.normalized_url).sort()).toEqual([
+      'http://example.com/1',
+      'http://example.com/2',
+      'http://example.com/3'
+    ]);
+  });
+  it('should upsert and get ID', () => {
+    const pageData = {
+      site_id: 1,
+      normalized_url: 'http://example.com/new',
+      last_seen_snapshot_id: 1,
+      http_status: 200,
+    };
+    const id = repo.upsertAndGetId(pageData);
+    expect(id).toBeGreaterThan(0);
+    const sameId = repo.upsertAndGetId({ ...pageData, http_status: 404 });
+    expect(sameId).toBe(id);
+    const page = repo.getPage(1, 'http://example.com/new');
+    expect(page?.http_status).toBe(404);
+  });
+  it('should get ID by URL', () => {
+    const pageData = {
+      site_id: 1,
+      normalized_url: 'http://example.com/id-test',
+      last_seen_snapshot_id: 1,
+    };
+    repo.upsertPage(pageData);
+    const id = repo.getIdByUrl(1, 'http://example.com/id-test');
+    expect(id).toBeDefined();
+    expect(id).toBeGreaterThan(0);
+    const missingId = repo.getIdByUrl(1, 'http://example.com/missing');
+    expect(missingId).toBeUndefined();
+  });
+});

package/tests/db_repos.test.ts ADDED Viewed

@@ -0,0 +1,72 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import Database from 'better-sqlite3';
+import { initSchema } from '../src/db/schema.js';
+import { SiteRepository } from '../src/db/repositories/SiteRepository.js';
+import { SnapshotRepository } from '../src/db/repositories/SnapshotRepository.js';
+describe('SiteRepository & SnapshotRepository', () => {
+  let db: Database.Database;
+  let siteRepo: SiteRepository;
+  let snapshotRepo: SnapshotRepository;
+  beforeEach(() => {
+    db = new Database(':memory:');
+    initSchema(db);
+    siteRepo = new SiteRepository(db);
+    snapshotRepo = new SnapshotRepository(db);
+  });
+  afterEach(() => {
+    db.close();
+  });
+  it('getAllSites should return all sites ordered by domain', () => {
+    siteRepo.createSite('b.com');
+    siteRepo.createSite('a.com');
+    siteRepo.createSite('c.com');
+    const sites = siteRepo.getAllSites();
+    expect(sites).toHaveLength(3);
+    expect(sites[0].domain).toBe('a.com');
+    expect(sites[1].domain).toBe('b.com');
+    expect(sites[2].domain).toBe('c.com');
+  });
+  it('getSnapshotCount should return correct count', () => {
+    const siteId = siteRepo.createSite('test.com');
+    expect(snapshotRepo.getSnapshotCount(siteId)).toBe(0);
+    snapshotRepo.createSnapshot(siteId, 'full');
+    expect(snapshotRepo.getSnapshotCount(siteId)).toBe(1);
+    snapshotRepo.createSnapshot(siteId, 'partial');
+    expect(snapshotRepo.getSnapshotCount(siteId)).toBe(2);
+  });
+  it('getLatestSnapshot should return the latest snapshot', () => {
+    const siteId = siteRepo.createSite('test.com');
+    // First snapshot
+    snapshotRepo.createSnapshot(siteId, 'full', 'completed');
+    // Wait a tiny bit to ensure timestamp diff if needed, but synchronous execution usually implies order
+    // Second snapshot
+    const secondId = snapshotRepo.createSnapshot(siteId, 'full', 'running');
+    const latest = snapshotRepo.getLatestSnapshot(siteId);
+    expect(latest).toBeDefined();
+    expect(latest?.id).toBe(secondId);
+    expect(latest?.status).toBe('running');
+  });
+  it('getLatestSnapshot with status filter', () => {
+    const siteId = siteRepo.createSite('test.com');
+    const firstId = snapshotRepo.createSnapshot(siteId, 'full', 'completed');
+    snapshotRepo.createSnapshot(siteId, 'full', 'running');
+    const latestCompleted = snapshotRepo.getLatestSnapshot(siteId, 'completed');
+    expect(latestCompleted).toBeDefined();
+    expect(latestCompleted?.id).toBe(firstId);
+  });
+});

package/tests/duplicate.test.ts CHANGED Viewed

@@ -89,7 +89,7 @@ describe('Duplicate Detection', () => {
         graph.updateNodeData('https://example.com/b', { contentHash: 'h1' });
         // Add edge pointing to B
-        graph.edges.set('https://example.com/source|https://example.com/b', 1);
+        graph.addEdge('https://example.com/source', 'https://example.com/b', 1);
         // Force A to be the representative by giving it higher inLinks manually, though it's determined dynamically
         graph.nodes.get('https://example.com/a')!.inLinks = 10;
@@ -105,6 +105,6 @@ describe('Duplicate Detection', () => {
         expect(b.collapseInto).toBe('https://example.com/a');
         // Check edge transfer
-        expect(graph.edges.has('https://example.com/source|https://example.com/a')).toBe(true);
+        expect(graph.edges.has(Graph.getEdgeKey('https://example.com/source', 'https://example.com/a'))).toBe(true);
     });
 });

package/tests/extract.test.ts ADDED Viewed

@@ -0,0 +1,86 @@
+import { extractLinks } from '../src/crawler/extract.js';
+import { test, expect, describe, vi, afterEach } from 'vitest';
+import * as cheerio from 'cheerio';
+// Mock cheerio.load to allow us to simulate errors
+vi.mock('cheerio', async (importOriginal) => {
+    const mod = await importOriginal<any>();
+    return {
+        ...mod,
+        load: vi.fn((...args: any[]) => mod.load(...args))
+    };
+});
+describe('extractLinks', () => {
+    afterEach(() => {
+        vi.restoreAllMocks();
+    });
+    test('should extract links correctly', () => {
+        const html = `
+      <html>
+        <body>
+          <a href="/foo">Foo</a>
+          <a href="bar">Bar</a>
+          <a href="https://other.com/baz">Baz</a>
+          <a href="#top">Top</a>
+        </body>
+      </html>
+    `;
+        const links = extractLinks(html, 'https://example.com/page/');
+        expect(links).toContain('https://example.com/foo');
+        expect(links).toContain('https://example.com/page/bar');
+        expect(links).toContain('https://other.com/baz');
+        expect(links).not.toContain('https://example.com/page/#top');
+        expect(links).toContain('https://example.com/page/');
+    });
+    test('should handle cheerio errors gracefully', () => {
+        const error = new Error('Cheerio error');
+        // Mock cheerio.load to throw an error
+        vi.mocked(cheerio.load).mockImplementationOnce(() => {
+            throw error;
+        });
+        const links = extractLinks('<html></html>', 'https://example.com');
+        expect(links).toEqual([]);
+        // No console error expected
+    });
+    test('should handle non-Error exceptions gracefully', () => {
+        const error = 'String error'; // Simulate a thrown string
+        vi.mocked(cheerio.load).mockImplementationOnce(() => {
+            throw error;
+        });
+        const links = extractLinks('<html></html>', 'https://example.com');
+        expect(links).toEqual([]);
+        // No console error expected
+    });
+    test('should ignore invalid URLs that cause URL constructor to throw', () => {
+        const html = '<a href="http://[">Invalid</a>';
+        const links = extractLinks(html, 'https://example.com');
+        expect(links).toEqual([]);
+    });
+    test('should ignore non-http protocols', () => {
+        const html = `
+            <a href="mailto:test@example.com">Mail</a>
+            <a href="javascript:void(0)">JS</a>
+            <a href="ftp://example.com/file">FTP</a>
+        `;
+        const links = extractLinks(html, 'https://example.com');
+        expect(links).toEqual([]);
+    });
+    test('should ignore links without href', () => {
+        const html = '<a>No Href</a>';
+        const links = extractLinks(html, 'https://example.com');
+        expect(links).toEqual([]);
+    });
+});