npm - @crawlith/core - Versions diffs - 0.1.1 → 0.1.2 - Mend

@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237) hide show

package/LICENSE +201 -0
package/README.md +70 -0
package/dist/analysis/analyze.d.ts +29 -8
package/dist/analysis/analyze.js +325 -221
package/dist/analysis/clustering.d.ts +23 -0
package/dist/analysis/clustering.js +206 -0
package/dist/analysis/content.d.ts +1 -1
package/dist/analysis/content.js +11 -5
package/dist/analysis/duplicate.d.ts +34 -0
package/dist/analysis/duplicate.js +305 -0
package/dist/analysis/heading.d.ts +116 -0
package/dist/analysis/heading.js +356 -0
package/dist/analysis/images.d.ts +1 -1
package/dist/analysis/images.js +6 -5
package/dist/analysis/links.d.ts +1 -1
package/dist/analysis/links.js +8 -8
package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
package/dist/analysis/scoring.js +4 -1
package/dist/analysis/seo.d.ts +8 -4
package/dist/analysis/seo.js +41 -30
package/dist/analysis/soft404.d.ts +17 -0
package/dist/analysis/soft404.js +62 -0
package/dist/analysis/structuredData.d.ts +1 -1
package/dist/analysis/structuredData.js +5 -4
package/dist/application/index.d.ts +2 -0
package/dist/application/index.js +2 -0
package/dist/application/usecase.d.ts +3 -0
package/dist/application/usecase.js +1 -0
package/dist/application/usecases.d.ts +114 -0
package/dist/application/usecases.js +201 -0
package/dist/audit/index.js +1 -1
package/dist/audit/transport.d.ts +1 -1
package/dist/audit/transport.js +5 -4
package/dist/audit/types.d.ts +1 -0
package/dist/constants.d.ts +17 -0
package/dist/constants.js +23 -0
package/dist/core/scope/scopeManager.js +3 -0
package/dist/crawler/crawl.d.ts +2 -2
package/dist/crawler/crawler.d.ts +17 -5
package/dist/crawler/crawler.js +259 -94
package/dist/crawler/fetcher.d.ts +1 -1
package/dist/crawler/fetcher.js +6 -6
package/dist/crawler/metricsRunner.d.ts +21 -1
package/dist/crawler/metricsRunner.js +181 -60
package/dist/crawler/normalize.d.ts +41 -0
package/dist/crawler/normalize.js +119 -3
package/dist/crawler/parser.d.ts +1 -3
package/dist/crawler/parser.js +2 -49
package/dist/crawler/resolver.d.ts +11 -0
package/dist/crawler/resolver.js +67 -0
package/dist/crawler/sitemap.d.ts +4 -1
package/dist/crawler/sitemap.js +24 -18
package/dist/crawler/trap.d.ts +5 -1
package/dist/crawler/trap.js +23 -2
package/dist/db/CrawlithDB.d.ts +110 -0
package/dist/db/CrawlithDB.js +500 -0
package/dist/db/graphLoader.js +15 -32
package/dist/db/index.d.ts +9 -1
package/dist/db/index.js +39 -31
package/dist/db/migrations.d.ts +2 -0
package/dist/db/{schema.js → migrations.js} +90 -43
package/dist/db/pluginRegistry.d.ts +9 -0
package/dist/db/pluginRegistry.js +19 -0
package/dist/db/repositories/EdgeRepository.d.ts +5 -0
package/dist/db/repositories/EdgeRepository.js +7 -0
package/dist/db/repositories/MetricsRepository.d.ts +13 -8
package/dist/db/repositories/MetricsRepository.js +14 -6
package/dist/db/repositories/PageRepository.d.ts +5 -3
package/dist/db/repositories/PageRepository.js +68 -17
package/dist/db/repositories/SiteRepository.d.ts +6 -0
package/dist/db/repositories/SiteRepository.js +4 -0
package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
package/dist/db/repositories/SnapshotRepository.js +48 -10
package/dist/db/reset.d.ts +9 -0
package/dist/db/reset.js +32 -0
package/dist/db/statements.d.ts +12 -0
package/dist/db/statements.js +40 -0
package/dist/diff/compare.d.ts +0 -5
package/dist/diff/compare.js +0 -12
package/dist/diff/service.d.ts +16 -0
package/dist/diff/service.js +41 -0
package/dist/domain/index.d.ts +4 -0
package/dist/domain/index.js +4 -0
package/dist/events.d.ts +8 -0
package/dist/graph/graph.d.ts +20 -42
package/dist/graph/graph.js +12 -16
package/dist/graph/hits.d.ts +23 -0
package/dist/graph/hits.js +111 -0
package/dist/graph/metrics.d.ts +0 -4
package/dist/graph/metrics.js +19 -15
package/dist/graph/pagerank.d.ts +17 -4
package/dist/graph/pagerank.js +126 -93
package/dist/index.d.ts +27 -9
package/dist/index.js +27 -9
package/dist/lock/lockManager.d.ts +1 -0
package/dist/lock/lockManager.js +15 -0
package/dist/plugin-system/plugin-cli.d.ts +10 -0
package/dist/plugin-system/plugin-cli.js +31 -0
package/dist/plugin-system/plugin-config.d.ts +16 -0
package/dist/plugin-system/plugin-config.js +36 -0
package/dist/plugin-system/plugin-loader.d.ts +17 -0
package/dist/plugin-system/plugin-loader.js +122 -0
package/dist/plugin-system/plugin-registry.d.ts +25 -0
package/dist/plugin-system/plugin-registry.js +167 -0
package/dist/plugin-system/plugin-types.d.ts +205 -0
package/dist/plugin-system/plugin-types.js +1 -0
package/dist/ports/index.d.ts +9 -0
package/dist/ports/index.js +1 -0
package/dist/report/export.d.ts +3 -0
package/dist/report/export.js +81 -0
package/dist/report/insight.d.ts +27 -0
package/dist/report/insight.js +103 -0
package/dist/scoring/health.d.ts +17 -11
package/dist/scoring/health.js +183 -140
package/dist/utils/chalk.d.ts +6 -0
package/dist/utils/chalk.js +41 -0
package/dist/utils/secureConfig.d.ts +23 -0
package/dist/utils/secureConfig.js +128 -0
package/package.json +10 -4
package/CHANGELOG.md +0 -13
package/dist/db/schema.d.ts +0 -2
package/dist/graph/cluster.d.ts +0 -6
package/dist/graph/cluster.js +0 -221
package/dist/graph/duplicate.d.ts +0 -10
package/dist/graph/duplicate.js +0 -302
package/dist/scoring/hits.d.ts +0 -10
package/dist/scoring/hits.js +0 -131
package/scripts/copy-assets.js +0 -37
package/src/analysis/analysis_list.html +0 -35
package/src/analysis/analysis_page.html +0 -123
package/src/analysis/analyze.ts +0 -505
package/src/analysis/content.ts +0 -62
package/src/analysis/images.ts +0 -28
package/src/analysis/links.ts +0 -41
package/src/analysis/scoring.ts +0 -66
package/src/analysis/seo.ts +0 -82
package/src/analysis/structuredData.ts +0 -62
package/src/analysis/templates.ts +0 -9
package/src/audit/dns.ts +0 -49
package/src/audit/headers.ts +0 -98
package/src/audit/index.ts +0 -66
package/src/audit/scoring.ts +0 -232
package/src/audit/transport.ts +0 -258
package/src/audit/types.ts +0 -102
package/src/core/network/proxyAdapter.ts +0 -21
package/src/core/network/rateLimiter.ts +0 -39
package/src/core/network/redirectController.ts +0 -47
package/src/core/network/responseLimiter.ts +0 -34
package/src/core/network/retryPolicy.ts +0 -57
package/src/core/scope/domainFilter.ts +0 -45
package/src/core/scope/scopeManager.ts +0 -52
package/src/core/scope/subdomainPolicy.ts +0 -39
package/src/core/security/ipGuard.ts +0 -171
package/src/crawler/crawl.ts +0 -9
package/src/crawler/crawler.ts +0 -601
package/src/crawler/extract.ts +0 -39
package/src/crawler/fetcher.ts +0 -251
package/src/crawler/metricsRunner.ts +0 -137
package/src/crawler/normalize.ts +0 -108
package/src/crawler/parser.ts +0 -190
package/src/crawler/sitemap.ts +0 -76
package/src/crawler/trap.ts +0 -96
package/src/db/graphLoader.ts +0 -135
package/src/db/index.ts +0 -75
package/src/db/repositories/EdgeRepository.ts +0 -43
package/src/db/repositories/MetricsRepository.ts +0 -63
package/src/db/repositories/PageRepository.ts +0 -228
package/src/db/repositories/SiteRepository.ts +0 -43
package/src/db/repositories/SnapshotRepository.ts +0 -99
package/src/db/schema.ts +0 -177
package/src/diff/compare.ts +0 -84
package/src/events.ts +0 -16
package/src/graph/cluster.ts +0 -246
package/src/graph/duplicate.ts +0 -350
package/src/graph/graph.ts +0 -192
package/src/graph/metrics.ts +0 -125
package/src/graph/pagerank.ts +0 -126
package/src/graph/simhash.ts +0 -76
package/src/index.ts +0 -33
package/src/lock/hashKey.ts +0 -51
package/src/lock/lockManager.ts +0 -132
package/src/lock/pidCheck.ts +0 -13
package/src/report/crawl.html +0 -879
package/src/report/crawlExport.ts +0 -58
package/src/report/crawl_template.ts +0 -9
package/src/report/html.ts +0 -27
package/src/scoring/health.ts +0 -241
package/src/scoring/hits.ts +0 -153
package/src/scoring/orphanSeverity.ts +0 -176
package/src/utils/version.ts +0 -18
package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
package/tests/analysis.unit.test.ts +0 -142
package/tests/analyze.integration.test.ts +0 -133
package/tests/analyze_markdown.test.ts +0 -98
package/tests/audit/audit.test.ts +0 -101
package/tests/audit/dns.test.ts +0 -31
package/tests/audit/headers.test.ts +0 -45
package/tests/audit/scoring.test.ts +0 -133
package/tests/audit/security.test.ts +0 -12
package/tests/audit/transport.test.ts +0 -111
package/tests/clustering.test.ts +0 -118
package/tests/clustering_risk.test.ts +0 -118
package/tests/crawler.test.ts +0 -364
package/tests/db/index.test.ts +0 -134
package/tests/db/repositories.test.ts +0 -115
package/tests/db.test.ts +0 -159
package/tests/db_repos.test.ts +0 -72
package/tests/diff.test.ts +0 -67
package/tests/duplicate.test.ts +0 -110
package/tests/extract.test.ts +0 -86
package/tests/fetcher.test.ts +0 -110
package/tests/fetcher_safety.test.ts +0 -91
package/tests/fixtures/analyze-crawl.json +0 -26
package/tests/graph/graph.test.ts +0 -100
package/tests/graphLoader.test.ts +0 -124
package/tests/hits.test.ts +0 -134
package/tests/html_report.test.ts +0 -59
package/tests/ipGuard.test.ts +0 -73
package/tests/lock/lockManager.test.ts +0 -198
package/tests/metrics.test.ts +0 -196
package/tests/normalize.test.ts +0 -88
package/tests/orphanSeverity.test.ts +0 -160
package/tests/pagerank.test.ts +0 -98
package/tests/parser.test.ts +0 -117
package/tests/proxy_safety.test.ts +0 -57
package/tests/redirect_safety.test.ts +0 -77
package/tests/renderAnalysisCsv.test.ts +0 -183
package/tests/safety.test.ts +0 -126
package/tests/scope.test.ts +0 -84
package/tests/scoring.test.ts +0 -60
package/tests/sitemap.test.ts +0 -100
package/tests/soft404.test.ts +0 -41
package/tests/ssrf_fix.test.ts +0 -69
package/tests/trap.test.ts +0 -39
package/tests/visualization_data.test.ts +0 -46
package/tsconfig.json +0 -11

package/tests/crawler.test.ts DELETED Viewed

@@ -1,364 +0,0 @@
-import { test, expect, beforeEach, afterEach, vi } from 'vitest';
-import { crawl } from '../src/crawler/crawl.js';
-import { loadGraphFromSnapshot } from '../src/db/graphLoader.js';
-import { closeDb } from '../src/db/index.js';
-import { MockAgent, setGlobalDispatcher } from 'undici';
-import { IPGuard } from '../src/core/security/ipGuard.js';
-import { EngineContext } from '../src/events.js';
-let mockAgent: MockAgent;
-const mockContext: EngineContext = { emit: vi.fn() };
-beforeEach(() => {
-  process.env.CRAWLITH_DB_PATH = ':memory:';
-  mockAgent = new MockAgent();
-  mockAgent.disableNetConnect();
-  setGlobalDispatcher(mockAgent);
-  // IPGuard.getSecureDispatcher must return the mockAgent so Fetcher uses it
-  vi.spyOn(IPGuard, 'getSecureDispatcher').mockReturnValue(mockAgent as any);
-});
-afterEach(() => {
-  closeDb();
-});
-test('crawler should crawl and build graph', async () => {
-  const client = mockAgent.get('https://example.com');
-  // Root
-  client.intercept({
-    path: '/',
-    method: 'GET'
-  }).reply(200, `
-        <html><body>
-            <a href="/page1">Page 1</a>
-            <a href="/page2">Page 2</a>
-        </body></html>
-    `, {
-    headers: { 'content-type': 'text/html' }
-  });
-  // Page 1
-  client.intercept({
-    path: '/page1',
-    method: 'GET'
-  }).reply(200, `
-        <html><body>
-            <a href="/page2">Page 2</a>
-        </body></html>
-    `, {
-    headers: { 'content-type': 'text/html' }
-  });
-  // Page 2
-  client.intercept({
-    path: '/page2',
-    method: 'GET'
-  }).reply(200, `
-        <html><body>
-            <a href="/">Home</a>
-        </body></html>
-    `, {
-    headers: { 'content-type': 'text/html' }
-  });
-  // Robots.txt
-  client.intercept({
-    path: '/robots.txt',
-    method: 'GET'
-  }).reply(404, 'Not Found');
-  const snapshotId = await crawl('https://example.com', {
-    limit: 10,
-    depth: 2,
-    ignoreRobots: false,
-    rate: 1000
-  }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  const nodes = graph.getNodes();
-  expect(nodes.length).toBe(3);
-  const root = graph.nodes.get('https://example.com/');
-  expect(root).toBeDefined();
-  expect(root?.depth).toBe(0);
-  expect(root?.outLinks).toBe(2);
-  const page1 = graph.nodes.get('https://example.com/page1');
-  expect(page1).toBeDefined();
-  expect(page1?.depth).toBe(1);
-  expect(page1?.inLinks).toBe(1);
-  const page2 = graph.nodes.get('https://example.com/page2');
-  expect(page2).toBeDefined();
-  expect(page2?.inLinks).toBe(2);
-});
-test('hard page limit', async () => {
-  const client = mockAgent.get('https://limit.com');
-  // Robots
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  // Root links to 1, 2, 3
-  client.intercept({ path: '/', method: 'GET' }).reply(200, `
-    <html><a href="/1">1</a><a href="/2">2</a><a href="/3">3</a></html>
-  `, { headers: { 'content-type': 'text/html' } });
-  // 1, 2, 3 return html
-  client.intercept({ path: '/1', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
-  client.intercept({ path: '/2', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
-  client.intercept({ path: '/3', method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://limit.com', {
-    limit: 2, // root + 1 page
-    depth: 5,
-    ignoreRobots: true,
-    rate: 1000
-  }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  // Should have visited root + 1 other page (total 2 nodes with status > 0)
-  const crawledNodes = graph.getNodes().filter(n => n.status > 0);
-  expect(crawledNodes.length).toBeLessThanOrEqual(2);
-});
-test('hard depth cap', async () => {
-  const client = mockAgent.get('https://depth.com');
-  // Robots
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  // Chain of 12 pages
-  for (let i = 0; i < 12; i++) {
-    const path = i === 0 ? '/' : `/p${i}`;
-    const nextPath = `/p${i + 1}`;
-    client.intercept({ path, method: 'GET' }).reply(200, `
-      <html><a href="${nextPath}">Next</a></html>
-    `, { headers: { 'content-type': 'text/html' } });
-  }
-  const snapshotId = await crawl('https://depth.com', {
-    limit: 100,
-    depth: 20, // requested 20, but internal hard cap is 10
-    ignoreRobots: true,
-    rate: 1000
-  }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  const crawledNodes = graph.getNodes().filter(n => n.status > 0);
-  const maxCrawledDepth = crawledNodes.reduce((max, n) => Math.max(max, n.depth), 0);
-  expect(maxCrawledDepth).toBeLessThanOrEqual(10);
-});
-test('parameter explosion control', async () => {
-  const client = mockAgent.get('https://params.com');
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  // Root links to many variations
-  let links = '';
-  for (let i = 0; i < 10; i++) {
-    links += `<a href="/search?q=${i}">q${i}</a>`;
-  }
-  client.intercept({ path: '/', method: 'GET' }).reply(200, `
-    <html>${links}</html>
-  `, { headers: { 'content-type': 'text/html' } });
-  // Intercept all variations
-  for (let i = 0; i < 40; i++) {
-    client.intercept({ path: `/search?q=${i}`, method: 'GET' }).reply(200, '<html></html>', { headers: { 'content-type': 'text/html' } });
-  }
-  const snapshotId = await crawl('https://params.com', {
-    limit: 100,
-    depth: 5,
-    ignoreRobots: true,
-    stripQuery: false,
-    detectTraps: true,
-    rate: 1000
-  }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  // Should only crawl 5 variations + root
-  const nodes = graph.getNodes();
-  // Filter nodes that match /search pathname
-  const searchNodes = nodes.filter(n => n.url.includes('/search') && n.status > 0);
-  expect(searchNodes.length).toBeLessThanOrEqual(31);
-});
-test('redirect safety', async () => {
-  const client = mockAgent.get('https://redirect.com');
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  // Root -> /redir1
-  client.intercept({ path: '/', method: 'GET' }).reply(200, `
-    <html><a href="/redir1">Go</a></html>
-  `, { headers: { 'content-type': 'text/html' } });
-  // /redir1 -> 301 -> /dest
-  client.intercept({ path: '/redir1', method: 'GET' }).reply(301, '', {
-    headers: { 'location': '/dest' }
-  });
-  // /dest -> 200
-  client.intercept({ path: '/dest', method: 'GET' }).reply(200, '<html>Success</html>', { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://redirect.com', {
-    limit: 10,
-    depth: 5,
-    ignoreRobots: true,
-    rate: 1000
-  }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  const destNode = graph.nodes.get('https://redirect.com/dest');
-  expect(destNode).toBeDefined();
-  expect(destNode?.status).toBe(200);
-  // Redirect loop: A -> B -> A
-  const clientLoop = mockAgent.get('https://loop.com');
-  clientLoop.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  clientLoop.intercept({ path: '/', method: 'GET' }).reply(200, `
-    <html><a href="/a">Loop</a></html>
-  `, { headers: { 'content-type': 'text/html' } });
-  clientLoop.intercept({ path: '/a', method: 'GET' }).reply(301, '', { headers: { location: '/b' } });
-  clientLoop.intercept({ path: '/b', method: 'GET' }).reply(301, '', { headers: { location: '/a' } });
-  // We might mock /a again if it retries, but it shouldn't infinitely loop
-  const snapshotIdLoop = await crawl('https://loop.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
-  const graphLoop = loadGraphFromSnapshot(snapshotIdLoop);
-  // It should eventually stop
-  expect(graphLoop.getNodes().length).toBeGreaterThan(0);
-});
-test('mime check', async () => {
-  const client = mockAgent.get('https://mime.com');
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  client.intercept({ path: '/', method: 'GET' }).reply(200, `
-    <html><a href="/image.png">Img</a></html>
-  `, { headers: { 'content-type': 'text/html' } });
-  client.intercept({ path: '/data', method: 'GET' }).reply(200, `
-    <html><a href="/hidden">Hidden</a></html>
-  `, { headers: { 'content-type': 'application/json' } });
-  // Root links to /data
-  client.intercept({ path: '/start', method: 'GET' }).reply(200, `
-    <html><a href="/data">Data</a></html>
-  `, { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://mime.com/start', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  // /data should be in graph
-  const dataNode = graph.nodes.get('https://mime.com/data');
-  expect(dataNode).toBeDefined();
-  // But we should NOT have parsed it, so /hidden should NOT be in graph
-  const hiddenNode = graph.nodes.get('https://mime.com/hidden');
-  expect(hiddenNode).toBeUndefined();
-});
-test('self-link guard', async () => {
-  const client = mockAgent.get('https://self.com');
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  client.intercept({ path: '/', method: 'GET' }).reply(200, `
-    <html><a href="/">Self</a><a href="/other">Other</a></html>
-  `, { headers: { 'content-type': 'text/html' } });
-  client.intercept({ path: '/other', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://self.com', { limit: 10, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  const edges = graph.getEdges();
-  const selfEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/');
-  expect(selfEdge).toBeUndefined();
-  const otherEdge = edges.find(e => e.source === 'https://self.com/' && e.target === 'https://self.com/other');
-  expect(otherEdge).toBeDefined();
-});
-test('limit warning', async () => {
-  const client = mockAgent.get('https://warn.com');
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  client.intercept({ path: '/', method: 'GET' }).reply(200, `
-    <html><a href="/1">1</a><a href="/2">2</a></html>
-  `, { headers: { 'content-type': 'text/html' } });
-  client.intercept({ path: '/1', method: 'GET' }).reply(200, '', { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://warn.com', { limit: 2, depth: 5, ignoreRobots: true, rate: 1000 }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  expect(graph.limitReached).toBe(true);
-});
-test('seeds from sitemap', async () => {
-  const client = mockAgent.get('https://sitemap-seed.com');
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  // Sitemap
-  client.intercept({ path: '/sitemap.xml', method: 'GET' }).reply(200, `
-    <urlset><url><loc>https://sitemap-seed.com/page1</loc></url></urlset>
-  `);
-  // Root
-  client.intercept({ path: '/', method: 'GET' }).reply(200, '<html>Root</html>', { headers: { 'content-type': 'text/html' } });
-  // Page 1
-  client.intercept({ path: '/page1', method: 'GET' }).reply(200, '<html>Page 1</html>', { headers: { 'content-type': 'text/html' } });
-  const snapshotId = await crawl('https://sitemap-seed.com', {
-    limit: 10,
-    depth: 5,
-    ignoreRobots: true,
-    sitemap: 'true',
-    rate: 1000
-  }, mockContext);
-  const graph = loadGraphFromSnapshot(snapshotId);
-  const page1 = graph.nodes.get('https://sitemap-seed.com/page1');
-  expect(page1).toBeDefined();
-  expect(page1?.status).toBe(200);
-});
-test('incremental crawl uses etags', async () => {
-  const client = mockAgent.get('https://incremental.com');
-  client.intercept({ path: '/robots.txt', method: 'GET' }).reply(404, '');
-  // First crawl setup
-  client.intercept({ path: '/', method: 'GET' }).reply(200, 'Original', {
-    headers: { 'content-type': 'text/html', 'etag': '"v1"' }
-  });
-  const snapshotId1 = await crawl('https://incremental.com', { limit: 10, depth: 1, ignoreRobots: true, rate: 1000 }, mockContext);
-  const graph1 = loadGraphFromSnapshot(snapshotId1);
-  const node1 = graph1.nodes.get('https://incremental.com/');
-  expect(node1?.etag).toBe('"v1"');
-  // Second crawl setup
-  client.intercept({
-    path: '/',
-    method: 'GET',
-    headers: { 'If-None-Match': '"v1"' }
-  }).reply(304, '', { headers: { 'etag': '"v1"' } });
-  const snapshotId2 = await crawl('https://incremental.com', {
-    limit: 10,
-    depth: 1,
-    ignoreRobots: true,
-    previousGraph: graph1,
-    rate: 1000
-  }, mockContext);
-  const graph2 = loadGraphFromSnapshot(snapshotId2);
-  const node2 = graph2.nodes.get('https://incremental.com/');
-  expect(node2?.incrementalStatus).toBe('unchanged');
-});

package/tests/db/index.test.ts DELETED Viewed

@@ -1,134 +0,0 @@
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-import { getDbPath, getDb, closeDb } from '../../src/db/index.js';
-import fs from 'node:fs';
-import os from 'node:os';
-import path from 'node:path';
-vi.mock('node:fs');
-vi.mock('node:os');
-vi.mock('better-sqlite3', () => {
-  return {
-    default: vi.fn(function () {
-      return {
-        pragma: vi.fn().mockReturnValue('ok'),
-        prepare: vi.fn().mockReturnValue({
-          run: vi.fn(),
-          get: vi.fn(),
-          iterate: vi.fn(),
-          all: vi.fn()
-        }),
-        exec: vi.fn(),
-        close: vi.fn(),
-        transaction: vi.fn((fn) => fn),
-      };
-    }),
-  };
-});
-vi.mock('../../src/db/schema.js', () => ({
-  initSchema: vi.fn(),
-}));
-describe('DB Index', () => {
-  const originalEnv = process.env;
-  beforeEach(() => {
-    vi.resetAllMocks();
-    closeDb();
-    process.env = { ...originalEnv };
-    // Default mock behaviors
-    vi.mocked(os.homedir).mockReturnValue('/home/user');
-    vi.mocked(fs.existsSync).mockReturnValue(false);
-    vi.mocked(fs.mkdirSync).mockImplementation(() => undefined as any);
-    vi.mocked(fs.chmodSync).mockImplementation(() => undefined);
-  });
-  afterEach(() => {
-    process.env = originalEnv;
-    closeDb();
-  });
-  describe('getDbPath', () => {
-    it('should return :memory: in test environment', () => {
-      process.env.NODE_ENV = 'test';
-      expect(getDbPath()).toBe(':memory:');
-    });
-    it('should return custom path if CRAWLITH_DB_PATH is set', () => {
-      process.env.NODE_ENV = 'production';
-      process.env.CRAWLITH_DB_PATH = '/custom/path/db.sqlite';
-      expect(getDbPath()).toBe('/custom/path/db.sqlite');
-    });
-    it('should return default path in home dir if no env var', () => {
-      process.env.NODE_ENV = 'production';
-      delete process.env.CRAWLITH_DB_PATH;
-      const expectedPath = path.join('/home/user', '.crawlith', 'crawlith.db');
-      expect(getDbPath()).toBe(expectedPath);
-      expect(fs.mkdirSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), { recursive: true });
-      expect(fs.chmodSync).toHaveBeenCalledWith(path.join('/home/user', '.crawlith'), 0o700);
-    });
-    it('should not create dir if it exists', () => {
-      process.env.NODE_ENV = 'production';
-      vi.mocked(fs.existsSync).mockReturnValue(true);
-      getDbPath();
-      expect(fs.mkdirSync).not.toHaveBeenCalled();
-    });
-  });
-  describe('getDb', () => {
-     it('should create a new database instance', () => {
-       process.env.NODE_ENV = 'production';
-       const db = getDb();
-       expect(db).toBeDefined();
-       // Check if pragma was called
-       expect(db.pragma).toHaveBeenCalledWith('journal_mode = WAL');
-     });
-     it('should return existing instance if called twice', () => {
-        process.env.NODE_ENV = 'production';
-        const db1 = getDb();
-        const db2 = getDb();
-        expect(db1).toBe(db2);
-     });
-     it('should handle permission errors gracefully', () => {
-        process.env.NODE_ENV = 'production';
-        // Avoid getDbPath throwing
-        vi.mocked(fs.existsSync).mockReturnValue(true);
-        vi.mocked(fs.chmodSync).mockImplementation((path) => {
-           if (path.toString().endsWith('crawlith.db')) {
-             throw new Error('EPERM');
-           }
-        });
-        expect(() => getDb()).not.toThrow();
-     });
-     it('should warn if integrity check fails', async () => {
-         const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
-         process.env.NODE_ENV = 'production';
-         vi.mocked(fs.existsSync).mockReturnValue(true);
-         const MockDatabase = (await import('better-sqlite3')).default;
-         vi.mocked(MockDatabase).mockImplementationOnce(function() {
-            return {
-               pragma: vi.fn().mockReturnValue('corrupt'),
-               prepare: vi.fn(),
-               exec: vi.fn(),
-               close: vi.fn(),
-               transaction: vi.fn(),
-            } as any;
-         });
-         getDb();
-         expect(warnSpy).toHaveBeenCalledWith('Database integrity check failed:', 'corrupt');
-     });
-  });
-});

package/tests/db/repositories.test.ts DELETED Viewed

@@ -1,115 +0,0 @@
-import { describe, it, expect, beforeEach, afterEach } from 'vitest';
-import Database from 'better-sqlite3';
-import { PageRepository } from '../../src/db/repositories/PageRepository.js';
-import { initSchema } from '../../src/db/schema.js';
-describe('PageRepository', () => {
-  let db: Database.Database;
-  let repo: PageRepository;
-  beforeEach(() => {
-    db = new Database(':memory:');
-    initSchema(db);
-    repo = new PageRepository(db);
-    // Seed required tables (sites, snapshots)
-    db.prepare("INSERT INTO sites (domain) VALUES ('example.com')").run();
-    db.prepare("INSERT INTO snapshots (site_id, type) VALUES (1, 'full')").run();
-  });
-  afterEach(() => {
-    db.close();
-  });
-  it('should get pages by URLs in chunks', () => {
-    const urls: string[] = [];
-    const siteId = 1;
-    const snapshotId = 1;
-    // Create 1000 pages (chunk size is 900)
-    const insertStmt = db.prepare(`
-      INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
-      VALUES (?, ?, ?, ?)
-    `);
-    const tx = db.transaction(() => {
-      for (let i = 0; i < 1000; i++) {
-        const url = `http://example.com/page${i}`;
-        urls.push(url);
-        insertStmt.run(siteId, url, snapshotId, snapshotId);
-      }
-    });
-    tx();
-    // Fetch pages
-    const pages = repo.getPagesByUrls(siteId, urls);
-    expect(pages).toHaveLength(1000);
-    expect(pages[0].normalized_url).toBe('http://example.com/page0');
-    expect(pages[999].normalized_url).toBe('http://example.com/page999');
-  });
-  it('should return empty array for empty URL list', () => {
-    const pages = repo.getPagesByUrls(1, []);
-    expect(pages).toEqual([]);
-  });
-  it('should iterate over pages by snapshot', () => {
-    const siteId = 1;
-    const snapshotId = 1;
-    const insertStmt = db.prepare(`
-      INSERT INTO pages (site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id)
-      VALUES (?, ?, ?, ?)
-    `);
-    db.transaction(() => {
-      insertStmt.run(siteId, 'http://example.com/1', snapshotId, snapshotId);
-      insertStmt.run(siteId, 'http://example.com/2', snapshotId, snapshotId);
-      insertStmt.run(siteId, 'http://example.com/3', snapshotId, snapshotId);
-    })();
-    const iterator = repo.getPagesIteratorBySnapshot(snapshotId);
-    const pages = Array.from(iterator);
-    expect(pages).toHaveLength(3);
-    expect(pages.map(p => p.normalized_url).sort()).toEqual([
-      'http://example.com/1',
-      'http://example.com/2',
-      'http://example.com/3'
-    ]);
-  });
-  it('should upsert and get ID', () => {
-    const pageData = {
-      site_id: 1,
-      normalized_url: 'http://example.com/new',
-      last_seen_snapshot_id: 1,
-      http_status: 200,
-    };
-    const id = repo.upsertAndGetId(pageData);
-    expect(id).toBeGreaterThan(0);
-    const sameId = repo.upsertAndGetId({ ...pageData, http_status: 404 });
-    expect(sameId).toBe(id);
-    const page = repo.getPage(1, 'http://example.com/new');
-    expect(page?.http_status).toBe(404);
-  });
-  it('should get ID by URL', () => {
-    const pageData = {
-      site_id: 1,
-      normalized_url: 'http://example.com/id-test',
-      last_seen_snapshot_id: 1,
-    };
-    repo.upsertPage(pageData);
-    const id = repo.getIdByUrl(1, 'http://example.com/id-test');
-    expect(id).toBeDefined();
-    expect(id).toBeGreaterThan(0);
-    const missingId = repo.getIdByUrl(1, 'http://example.com/missing');
-    expect(missingId).toBeUndefined();
-  });
-});