@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1 @@
1
/**
 * Computes PageRank/HITS over a snapshot's link graph, persists per-page
 * metrics and cluster data, and finalizes the snapshot with aggregate stats.
 *
 * @param snapshotId - Snapshot to process; logs an error and returns if missing.
 * @param maxDepth - Crawl depth limit forwarded to aggregate metric calculation.
 * @param limitReached - Whether the crawl stopped because it hit its page limit.
 */
export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitReached?: boolean): void;
@@ -0,0 +1,108 @@
1
import { getDb } from '../db/index.js';
import { loadGraphFromSnapshot } from '../db/graphLoader.js';
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
import { PageRepository } from '../db/repositories/PageRepository.js';
import { computePageRank } from '../graph/pagerank.js';
import { calculateMetrics } from '../graph/metrics.js';
import { computeHITS } from '../scoring/hits.js';
/**
 * Runs post-crawl analysis for a snapshot: computes PageRank and HITS over
 * the stored link graph, persists per-page metrics and cluster summaries in
 * a single transaction, then finalizes the snapshot with aggregate stats
 * (health score, orphan count, thin-content count).
 *
 * @param {number} snapshotId - Snapshot to process; logs and returns if missing.
 * @param {number} maxDepth - Crawl depth limit, forwarded to calculateMetrics.
 * @param {boolean} [limitReached=false] - Whether the crawl hit its page limit.
 */
export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false) {
    const db = getDb();
    const metricsRepo = new MetricsRepository(db);
    const snapshotRepo = new SnapshotRepository(db);
    const pageRepo = new PageRepository(db);
    const snapshot = snapshotRepo.getSnapshot(snapshotId);
    if (!snapshot) {
        console.error(`Snapshot ${snapshotId} not found`);
        return;
    }
    console.log('Loading graph for metrics calculation...');
    const graph = loadGraphFromSnapshot(snapshotId);
    console.log('Computing PageRank...');
    computePageRank(graph);
    console.log('Computing HITS...');
    computeHITS(graph);
    console.log('Updating metrics in DB...');
    const nodes = graph.getNodes();
    const tx = db.transaction(() => {
        for (const node of nodes) {
            const pageId = pageRepo.getIdByUrl(snapshot.site_id, node.url);
            if (!pageId)
                continue;
            // Preserve crawl-time columns (word count, orphan score, ...) that
            // were written earlier; only graph-derived columns are refreshed.
            const existing = metricsRepo.getMetricsForPage(snapshotId, pageId);
            metricsRepo.insertMetrics({
                snapshot_id: snapshotId,
                page_id: pageId,
                authority_score: node.authorityScore ?? null,
                hub_score: node.hubScore ?? null,
                pagerank: node.pageRank ?? null,
                pagerank_score: node.pageRankScore ?? null,
                link_role: node.linkRole ?? null,
                crawl_status: existing?.crawl_status ?? null,
                word_count: existing?.word_count ?? null,
                thin_content_score: existing?.thin_content_score ?? null,
                external_link_ratio: existing?.external_link_ratio ?? null,
                orphan_score: existing?.orphan_score ?? null,
                duplicate_cluster_id: node.duplicateClusterId ?? null,
                duplicate_type: node.duplicateType ?? null,
                is_cluster_primary: node.isClusterPrimary ? 1 : 0
            });
            // Update page-level crawl trap data
            if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
                pageRepo.upsertPage({
                    site_id: snapshot.site_id,
                    normalized_url: node.url,
                    last_seen_snapshot_id: snapshotId,
                    redirect_chain: node.redirectChain ? JSON.stringify(node.redirectChain) : null,
                    bytes_received: node.bytesReceived ?? null,
                    crawl_trap_flag: node.crawlTrapFlag ? 1 : 0,
                    crawl_trap_risk: node.crawlTrapRisk ?? null,
                    trap_type: node.trapType ?? null,
                });
            }
        }
        // Save duplicate clusters
        if (graph.duplicateClusters.length > 0) {
            const clusterStmt = db.prepare(`
        INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
        VALUES (?, ?, ?, ?, ?, ?)
      `);
            for (const cluster of graph.duplicateClusters) {
                clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
            }
        }
        // Save content clusters
        if (graph.contentClusters.length > 0) {
            const contentStmt = db.prepare(`
        INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
        VALUES (?, ?, ?, ?, ?, ?)
      `);
            for (const cluster of graph.contentClusters) {
                contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
            }
        }
    });
    tx();
    console.log('Computing aggregate stats...');
    const metrics = calculateMetrics(graph, maxDepth);
    let totalScore = 0;
    let totalWeight = 0;
    for (const node of nodes) {
        // FIX: use nullish coalescing so a legitimate score of 0 is kept;
        // `||` silently fell through to the next fallback on 0.
        const score = node.authorityScore ?? node.pageRankScore ?? 0;
        // Depth-weighted average: shallow pages count more. Guard against a
        // missing depth so a single undefined value cannot turn the sum NaN.
        const weight = 1 / ((node.depth ?? 0) + 1);
        totalScore += score * weight;
        totalWeight += weight;
    }
    const healthScore = totalWeight > 0 ? (totalScore / totalWeight) * 100 : 0;
    const thinCountRow = db.prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70').get(snapshotId);
    snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
        node_count: metrics.totalPages,
        edge_count: metrics.totalEdges,
        health_score: healthScore,
        orphan_count: metrics.orphanPages.length,
        thin_content_count: thinCountRow.count,
        limit_reached: limitReached ? 1 : 0
    });
    console.log('Metrics calculation complete.');
}
@@ -0,0 +1,7 @@
1
/**
 * Normalizes a URL string based on specific rules: resolves it against a
 * base, enforces http/https, lowercases the host, removes default ports,
 * fragments and tracking parameters (utm_*, fbclid, gclid, msclkid), sorts
 * the surviving query keys, collapses duplicate slashes, trims the trailing
 * slash, and rejects known non-HTML asset extensions (.pdf, .jpg, ...).
 */
export interface NormalizeOptions {
    /** When true, the entire query string is discarded. */
    stripQuery?: boolean;
}
/**
 * @param input - Absolute or relative URL to normalize.
 * @param base - Base URL for resolving `input`; pass '' when already absolute.
 * @param options - See NormalizeOptions.
 * @returns The normalized URL, or null when the URL is unparseable,
 *   non-http(s), or points at a skipped asset extension.
 */
export declare function normalizeUrl(input: string, base: string, options?: NormalizeOptions): string | null;
@@ -0,0 +1,88 @@
1
// Query-string keys that carry tracking state rather than content identity.
const TRACKING_PARAMS = new Set([
    'utm_source',
    'utm_medium',
    'utm_campaign',
    'utm_term',
    'utm_content',
    'fbclid',
    'gclid',
    'msclkid'
]);
// Path extensions that indicate non-HTML assets the crawler should skip.
const SKIP_EXTENSIONS = new Set([
    '.pdf', '.jpg', '.png', '.svg', '.webp', '.gif',
    '.zip', '.xml', '.json', '.mp4'
]);
/**
 * Canonicalizes a URL for crawling: resolves it against `base`, enforces
 * http(s), lowercases the host, drops default ports, fragments and tracking
 * parameters, sorts the remaining query keys, collapses duplicate slashes,
 * trims the trailing slash, and rejects known asset extensions.
 *
 * @param {string} input - Absolute or relative URL.
 * @param {string} base - Base for resolution; '' means `input` is absolute.
 * @param {{stripQuery?: boolean}} [options] - stripQuery drops the whole query.
 * @returns {string|null} Normalized URL, or null when not crawlable.
 */
export function normalizeUrl(input, base, options = {}) {
    try {
        // Resolve against the base only when one was supplied.
        const url = base ? new URL(input, base) : new URL(input);
        // Only web pages are crawlable.
        if (url.protocol !== 'http:' && url.protocol !== 'https:') {
            return null;
        }
        url.hostname = url.hostname.toLowerCase();
        const onDefaultPort =
            (url.protocol === 'http:' && url.port === '80') ||
            (url.protocol === 'https:' && url.port === '443');
        if (onDefaultPort) {
            url.port = '';
        }
        url.hash = '';
        if (options.stripQuery) {
            url.search = '';
        }
        else {
            // Drop tracking parameters, then sort keys for a stable cache key.
            const kept = new URLSearchParams();
            for (const [key, value] of new URLSearchParams(url.search)) {
                if (key.startsWith('utm_') || TRACKING_PARAMS.has(key)) {
                    continue;
                }
                kept.append(key, value);
            }
            kept.sort();
            url.search = kept.toString();
        }
        // Collapse duplicate slashes, then trim a trailing slash (except root).
        let path = url.pathname.replace(/\/+/g, '/');
        if (path.length > 1 && path.endsWith('/')) {
            path = path.slice(0, -1);
        }
        url.pathname = path;
        // Reject known non-HTML asset extensions.
        const dot = url.pathname.lastIndexOf('.');
        if (dot !== -1 && SKIP_EXTENSIONS.has(url.pathname.slice(dot).toLowerCase())) {
            return null;
        }
        return url.toString();
    }
    catch (_e) {
        // Anything unparseable is simply not a crawlable URL.
        return null;
    }
}
@@ -0,0 +1,22 @@
1
/** A single outbound link with its semantic placement weight. */
export interface ParseLink {
    /** Absolute URL with the fragment removed. */
    url: string;
    /** 1.0 body, 0.7 nav/header, 0.4 footer; highest wins on duplicates. */
    weight: number;
}
/** Result of parsing one HTML document. */
export interface ParseResult {
    /** Outbound http(s) links; empty when page-level nofollow is set. */
    links: ParseLink[];
    /** The raw input HTML, passed through for downstream analysis. */
    html: string;
    /** Normalized canonical URL from <link rel="canonical">, if valid. */
    canonical: string | null;
    /** True when the robots meta contains 'noindex' or 'none'. */
    noindex: boolean;
    /** True when the robots meta contains 'nofollow' or 'none'. */
    nofollow: boolean;
    /** sha256 hex digest of the visible body text. */
    contentHash: string;
    /** SimHash of the first 50k chars of body text, stringified. */
    simhash?: string;
    /** unique tokens / total tokens over the sampled text (0 if empty). */
    uniqueTokenRatio?: number;
    /** Heuristic soft-404 likelihood in [0, 1]; 0 for non-200 responses. */
    soft404Score: number;
    /** Names of the heuristics that contributed to soft404Score. */
    soft404Signals: string[];
}
export declare class Parser {
    /**
     * Parses HTML content to extract metadata and links.
     */
    parse(html: string, baseUrl: string, status: number): ParseResult;
}
@@ -0,0 +1,158 @@
1
import * as cheerio from 'cheerio';
import crypto from 'node:crypto';
import { normalizeUrl } from './normalize.js';
import { SimHash } from '../graph/simhash.js';
export class Parser {
    /**
     * Parses HTML content to extract metadata and links.
     *
     * @param {string} html - Raw HTML document.
     * @param {string} baseUrl - URL the document was fetched from; used to
     *   resolve relative hrefs and canonicals.
     * @param {number} status - HTTP status code; soft-404 heuristics only run
     *   for 200 responses.
     * @returns {ParseResult} Links (with placement weights), robots directives,
     *   canonical URL, content hash/simhash and soft-404 signals.
     */
    parse(html, baseUrl, status) {
        const $ = cheerio.load(html);
        // 1. Robots meta: 'none' is shorthand for 'noindex, nofollow'.
        let noindex = false;
        let nofollow = false;
        const robotsMeta = $('meta[name="robots"]').attr('content');
        if (robotsMeta) {
            const directives = robotsMeta.toLowerCase().split(',').map(s => s.trim());
            if (directives.includes('noindex') || directives.includes('none'))
                noindex = true;
            if (directives.includes('nofollow') || directives.includes('none'))
                nofollow = true;
        }
        // 2. Canonical, resolved against the page URL.
        let canonical = null;
        const canonicalLink = $('link[rel="canonical"]').attr('href');
        if (canonicalLink) {
            try {
                const u = new URL(canonicalLink, baseUrl);
                // Normalize minimally (lowercase host, default ports, etc).
                // Query is kept: it can be significant for the canonical target.
                canonical = normalizeUrl(u.toString(), '', { stripQuery: false });
            }
            catch (_e) {
                // Invalid canonical URL, ignore
            }
        }
        // 3. Links: map url -> highest weight seen for that url.
        const links = new Map();
        if (!nofollow) { // Don't extract links if page-level nofollow is set
            $('a').each((_, element) => {
                const href = $(element).attr('href');
                const rel = $(element).attr('rel');
                const isNofollow = rel && rel.toLowerCase().includes('nofollow');
                if (href && !isNofollow) {
                    try {
                        const absoluteUrl = new URL(href, baseUrl);
                        if (absoluteUrl.protocol === 'http:' || absoluteUrl.protocol === 'https:') {
                            absoluteUrl.hash = '';
                            const urlStr = absoluteUrl.toString();
                            // Weight by placement: body 1.0, nav/header 0.7, footer 0.4.
                            let weight = 1.0;
                            const $el = $(element);
                            if ($el.closest('nav').length > 0 || $el.closest('header').length > 0) {
                                weight = 0.7;
                            }
                            else if ($el.closest('footer').length > 0) {
                                weight = 0.4;
                            }
                            else {
                                // Fallback: look for nav/menu/footer hints in the
                                // class/id of the parent and grandparent elements.
                                const parentText = ($el.parent().attr('class') || '') + ($el.parent().attr('id') || '');
                                const grandParentText = ($el.parent().parent().attr('class') || '') + ($el.parent().parent().attr('id') || '');
                                const combinedContext = (parentText + grandParentText).toLowerCase();
                                if (combinedContext.includes('nav') || combinedContext.includes('menu')) {
                                    weight = 0.7;
                                }
                                else if (combinedContext.includes('footer')) {
                                    weight = 0.4;
                                }
                            }
                            // Keep the highest weight when a URL appears several times.
                            const currentMax = links.get(urlStr) || 0;
                            if (weight > currentMax) {
                                links.set(urlStr, weight);
                            }
                        }
                    }
                    catch (_e) {
                        // Invalid URL
                    }
                }
            });
        }
        // 4. Content hash over visible text (non-content elements stripped;
        // links were already extracted above, so mutating the doc is safe).
        $('script').remove();
        $('style').remove();
        $('noscript').remove();
        $('iframe').remove();
        const cleanText = $('body').text().replace(/\s+/g, ' ').trim();
        const contentHash = crypto.createHash('sha256').update(cleanText).digest('hex');
        // 4b. Simhash & token stats (limit to 50k chars for performance).
        const limitedText = cleanText.substring(0, 50000).toLowerCase();
        const tokens = limitedText.split(/\W+/).filter(t => t.length > 0);
        const uniqueTokens = new Set(tokens);
        const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
        const simhash = SimHash.generate(tokens).toString();
        // 5. Soft 404 detection (only meaningful on 200 responses).
        let soft404Score = 0;
        const soft404Signals = [];
        if (status === 200) {
            const title = $('title').text().toLowerCase();
            const h1Text = $('h1').first().text().toLowerCase();
            const bodyText = cleanText.toLowerCase();
            const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];
            // Title carries the strongest signal; only the first match counts.
            for (const pattern of errorPatterns) {
                if (title.includes(pattern)) {
                    soft404Score += 0.4;
                    soft404Signals.push(`title_pattern_${pattern.replace(/\s+/g, '_')}`);
                    break;
                }
            }
            for (const pattern of errorPatterns) {
                if (h1Text.includes(pattern)) {
                    soft404Score += 0.3;
                    soft404Signals.push(`h1_pattern_${pattern.replace(/\s+/g, '_')}`);
                    break;
                }
            }
            if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
                soft404Score += 0.2;
                soft404Signals.push('body_error_phrase');
            }
            // Content length check (word count approximation)
            const words = cleanText.split(/\s+/).filter(w => w.length > 0);
            if (words.length < 50) {
                soft404Score += 0.3;
                soft404Signals.push('very_low_word_count');
            }
            else if (words.length < 150) {
                soft404Score += 0.1;
                soft404Signals.push('low_word_count');
            }
            // Link count check.
            // FIX: only flag 'no_outbound_links' when link extraction actually
            // ran; with page-level nofollow the map is empty by design and the
            // page must not be penalized for it.
            if (!nofollow && links.size === 0) {
                soft404Score += 0.2;
                soft404Signals.push('no_outbound_links');
            }
            // Cap at 1.0
            soft404Score = Math.min(1.0, soft404Score);
        }
        return {
            links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
            html: html, // pass raw HTML for analysis
            canonical,
            noindex,
            nofollow,
            contentHash,
            simhash,
            uniqueTokenRatio,
            soft404Score,
            soft404Signals
        };
    }
}
@@ -0,0 +1,8 @@
1
export declare class Sitemap {
    /**
     * Fetches and parses a sitemap (or sitemap index) to extract URLs.
     * Recursively handles sitemap indexes with loop detection and a hard cap
     * of 50 sitemap fetches per call. Returns normalized, deduplicated page
     * URLs; fetch or parse failures are logged and contribute no URLs.
     */
    fetch(url: string): Promise<string[]>;
    private processSitemap;
}
@@ -0,0 +1,70 @@
1
import { request } from 'undici';
import * as cheerio from 'cheerio';
import { normalizeUrl } from './normalize.js';
export class Sitemap {
    /**
     * Fetches and parses a sitemap (or sitemap index) to extract URLs.
     * Recursively handles sitemap indexes with loop detection and a hard
     * cap on the total number of sitemap fetches.
     *
     * @param {string} url - Sitemap URL to start from.
     * @returns {Promise<string[]>} Deduplicated, normalized page URLs.
     */
    async fetch(url) {
        const seen = new Set();
        const collected = new Set();
        await this.processSitemap(url, seen, collected);
        return [...collected];
    }
    /**
     * Fetches one sitemap document and either recurses into its child
     * sitemaps (index file) or collects its <url><loc> entries.
     */
    async processSitemap(url, visited, urls) {
        if (visited.has(url))
            return;
        visited.add(url);
        // Hard limit on number of sitemaps to fetch to prevent abuse
        if (visited.size > 50)
            return;
        try {
            const res = await request(url, {
                maxRedirections: 3,
                headers: { 'User-Agent': 'crawlith/1.0' },
                headersTimeout: 10000,
                bodyTimeout: 10000
            });
            if (res.statusCode < 200 || res.statusCode >= 300) {
                // Drain the body so the connection can be reused.
                await res.body.dump();
                return;
            }
            const xml = await res.body.text();
            // Basic validation: the payload must at least look like XML.
            if (!xml.trim().startsWith('<'))
                return;
            const $ = cheerio.load(xml, { xmlMode: true });
            // A <sitemap><loc> entry means this is a sitemap index.
            const childSitemaps = $('sitemap > loc')
                .toArray()
                .map((el) => $(el).text().trim())
                .filter((loc) => loc.length > 0);
            if (childSitemaps.length > 0) {
                // Recurse sequentially to avoid a massive concurrency spike.
                for (const childUrl of childSitemaps) {
                    await this.processSitemap(childUrl, visited, urls);
                }
                return;
            }
            // Otherwise it's a URL set: collect each normalized location.
            for (const el of $('url > loc').toArray()) {
                const loc = $(el).text().trim();
                if (!loc)
                    continue;
                const normalized = normalizeUrl(loc, '');
                if (normalized) {
                    urls.add(normalized);
                }
            }
        }
        catch (e) {
            console.warn(`Failed to fetch sitemap ${url}:`, e);
        }
    }
}
@@ -0,0 +1,24 @@
1
/** Categories of crawl traps the detector can flag. */
export type TrapType = 'faceted_navigation' | 'calendar_trap' | 'pagination_loop' | 'session_trap';
/** Outcome of a single trap check. */
export interface TrapResult {
    /** Risk estimate in [0, 1]; 0 means no trap signal for this URL. */
    risk: number;
    /** The matched trap category, or null when no heuristic fired. */
    type: TrapType | null;
}
/**
 * Stateful crawl-trap detector. It accumulates per-path query-parameter
 * combinations and pagination extremes across calls, so call reset()
 * between unrelated crawl sessions.
 */
export declare class TrapDetector {
    /** path (origin+pathname) -> set of distinct sorted query strings seen. */
    private pathCounters;
    /** path -> highest numeric page parameter observed so far. */
    private paginationCounters;
    /** Lowercased query keys treated as session identifiers. */
    private sessionParams;
    /** Distinct param combos per path before flagging faceted navigation. */
    private PARAM_EXPLOSION_THRESHOLD;
    /** Page number above which pagination is treated as a loop. */
    private PAGINATION_THRESHOLD;
    constructor(options?: {
        paramThreshold?: number;
        paginationThreshold?: number;
    });
    /**
     * Checks if a URL represents a potential crawl trap.
     */
    checkTrap(rawUrl: string, _depth: number): TrapResult;
    /**
     * Resets internal state (useful for multi-crawl sessions if needed)
     */
    reset(): void;
}
@@ -0,0 +1,78 @@
1
/**
 * Stateful crawl-trap detector. Tracks per-path query-parameter variety and
 * pagination extremes across calls to spot faceted-navigation explosions,
 * session-id URLs, calendar archives and runaway pagination.
 */
export class TrapDetector {
    // path (origin + pathname) -> set of distinct sorted query strings seen
    pathCounters = new Map();
    // path -> highest numeric page parameter observed so far
    paginationCounters = new Map();
    // lowercased query keys treated as session identifiers
    sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);
    // Configurable thresholds
    PARAM_EXPLOSION_THRESHOLD = 30;
    PAGINATION_THRESHOLD = 50;
    constructor(options = {}) {
        // FIX: nullish coalescing so any explicitly provided threshold is
        // honored; the previous truthiness check silently ignored 0.
        this.PARAM_EXPLOSION_THRESHOLD = options.paramThreshold ?? this.PARAM_EXPLOSION_THRESHOLD;
        this.PAGINATION_THRESHOLD = options.paginationThreshold ?? this.PAGINATION_THRESHOLD;
    }
    /**
     * Checks if a URL represents a potential crawl trap.
     *
     * @param {string} rawUrl - Absolute URL to inspect.
     * @param {number} _depth - Crawl depth (currently unused).
     * @returns {{risk: number, type: string|null}} Highest risk in [0, 1] and
     *   the matching trap type (null when no heuristic fired).
     */
    checkTrap(rawUrl, _depth) {
        let risk = 0;
        let type = null;
        try {
            const u = new URL(rawUrl);
            const params = new URLSearchParams(u.search);
            const pathname = u.pathname;
            const pathKey = `${u.origin}${pathname}`;
            // 1. Session IDs / Tracking Parameters
            for (const [key] of params) {
                if (this.sessionParams.has(key.toLowerCase()) || key.toLowerCase().includes('session')) {
                    risk = Math.max(risk, 0.9);
                    type = 'session_trap';
                }
            }
            // 2. Calendar Pattern Detection
            // Matches /2023/12/01, /2023-12-01, /12-01-2023 etc.
            // FIX: the date may end the path — this package's normalizeUrl
            // strips trailing slashes, so accept '/' OR end-of-string after
            // the date instead of requiring a trailing '/'.
            const calendarRegex = /\/\d{4}[-/]\d{2}[-/]\d{2}(?:\/|$)|\/\d{2}[-/]\d{2}[-/]\d{4}(?:\/|$)/;
            if (calendarRegex.test(pathname)) {
                risk = Math.max(risk, 0.7);
                type = 'calendar_trap';
            }
            // 3. Pagination Loop
            const pageParam = params.get('page') || params.get('p') || params.get('pg');
            if (pageParam && /^\d+$/.test(pageParam)) {
                const pageNum = parseInt(pageParam, 10);
                const currentMaxPage = this.paginationCounters.get(pathKey) || 0;
                if (pageNum > currentMaxPage) {
                    this.paginationCounters.set(pathKey, pageNum);
                }
                if (pageNum > this.PAGINATION_THRESHOLD) {
                    risk = Math.max(risk, 0.85);
                    type = 'pagination_loop';
                }
            }
            // 4. Infinite Parameter Explosion (Faceted Navigation)
            if (params.size > 0) {
                const paramSet = this.pathCounters.get(pathKey) || new Set();
                params.sort();
                paramSet.add(params.toString());
                this.pathCounters.set(pathKey, paramSet);
                if (paramSet.size > this.PARAM_EXPLOSION_THRESHOLD) {
                    risk = Math.max(risk, 0.95);
                    if (!type)
                        type = 'faceted_navigation';
                }
            }
        }
        catch (_e) {
            // Invalid URL: report no risk rather than throwing mid-crawl.
        }
        return { risk, type };
    }
    /**
     * Resets internal state (useful for multi-crawl sessions if needed)
     */
    reset() {
        this.pathCounters.clear();
        this.paginationCounters.clear();
    }
}
@@ -0,0 +1,2 @@
1
import { Graph } from '../graph/graph.js';
/**
 * Rebuilds the in-memory link Graph for a stored snapshot: nodes with their
 * page/metric data, edges, and duplicate/content cluster summaries.
 */
export declare function loadGraphFromSnapshot(snapshotId: number): Graph;
@@ -0,0 +1,96 @@
1
import { getDb } from './index.js';
import { PageRepository } from './repositories/PageRepository.js';
import { EdgeRepository } from './repositories/EdgeRepository.js';
import { MetricsRepository } from './repositories/MetricsRepository.js';
import { SnapshotRepository } from './repositories/SnapshotRepository.js';
import { Graph } from '../graph/graph.js';
/**
 * Rebuilds the in-memory link Graph for a stored snapshot: one node per
 * page (augmented with page columns and metric rows), one edge per stored
 * edge, plus the snapshot's duplicate- and content-cluster summaries.
 *
 * @param {number} snapshotId - Snapshot whose pages/edges/metrics to load.
 * @returns {Graph} The reconstructed graph.
 */
export function loadGraphFromSnapshot(snapshotId) {
    const db = getDb();
    const pageRepo = new PageRepository(db);
    const edgeRepo = new EdgeRepository(db);
    const metricsRepo = new MetricsRepository(db);
    const snapshotRepo = new SnapshotRepository(db);
    const pages = pageRepo.getPagesBySnapshot(snapshotId);
    const snapshot = snapshotRepo.getSnapshot(snapshotId);
    // Index metric rows by page id for O(1) lookup while building nodes.
    const metricsByPage = new Map(metricsRepo.getMetrics(snapshotId).map((row) => [row.page_id, row]));
    const graph = new Graph();
    if (snapshot) {
        graph.limitReached = Boolean(snapshot.limit_reached);
    }
    const urlById = new Map();
    for (const page of pages) {
        urlById.set(page.id, page.normalized_url);
        graph.addNode(page.normalized_url, page.depth, page.http_status || 0);
        const metric = metricsByPage.get(page.id);
        // 'new' = first seen in this snapshot; otherwise derive from the
        // recorded crawl status (cached -> unchanged, fetched -> changed).
        let incrementalStatus;
        if (page.first_seen_snapshot_id === snapshotId) {
            incrementalStatus = 'new';
        }
        else if (metric?.crawl_status === 'cached') {
            incrementalStatus = 'unchanged';
        }
        else if (metric?.crawl_status === 'fetched') {
            incrementalStatus = 'changed';
        }
        graph.updateNodeData(page.normalized_url, {
            canonical: page.canonical_url || undefined,
            contentHash: page.content_hash || undefined,
            simhash: page.simhash || undefined,
            etag: page.etag || undefined,
            lastModified: page.last_modified || undefined,
            html: page.html || undefined,
            soft404Score: page.soft404_score || undefined,
            noindex: Boolean(page.noindex),
            nofollow: Boolean(page.nofollow),
            incrementalStatus,
            securityError: page.security_error || undefined,
            retries: page.retries || undefined,
            bytesReceived: page.bytes_received || undefined,
            redirectChain: page.redirect_chain ? JSON.parse(page.redirect_chain) : undefined,
            crawlTrapFlag: Boolean(page.crawl_trap_flag),
            crawlTrapRisk: page.crawl_trap_risk || undefined,
            trapType: page.trap_type || undefined,
            // Metrics
            pageRank: metric?.pagerank ?? undefined,
            pageRankScore: metric?.pagerank_score ?? metric?.pagerank ?? undefined,
            authorityScore: metric?.authority_score ?? undefined,
            hubScore: metric?.hub_score ?? undefined,
            linkRole: metric?.link_role ?? undefined,
            // Duplicate info
            duplicateClusterId: metric?.duplicate_cluster_id ?? undefined,
            duplicateType: metric?.duplicate_type ?? undefined,
            isClusterPrimary: metric?.is_cluster_primary ? true : undefined,
        });
    }
    // Edges reference pages by id; skip any edge whose endpoint is missing.
    for (const edge of edgeRepo.getEdgesBySnapshot(snapshotId)) {
        const sourceUrl = urlById.get(edge.source_page_id);
        const targetUrl = urlById.get(edge.target_page_id);
        if (sourceUrl && targetUrl) {
            graph.addEdge(sourceUrl, targetUrl, edge.weight || 1.0);
        }
    }
    // Load duplicate clusters
    graph.duplicateClusters = db
        .prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?')
        .all(snapshotId)
        .map((row) => ({
            id: row.id,
            type: row.type,
            size: row.size,
            representative: row.representative,
            severity: row.severity
        }));
    // Load content clusters
    graph.contentClusters = db
        .prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?')
        .all(snapshotId)
        .map((row) => ({
            id: row.id,
            count: row.count,
            primaryUrl: row.primary_url,
            risk: row.risk,
            sharedPathPrefix: row.shared_path_prefix || undefined
        }));
    return graph;
}
@@ -0,0 +1,4 @@
1
import Database from 'better-sqlite3';
/** Returns the filesystem path of the SQLite database file. */
export declare function getDbPath(): string;
/** Returns the shared better-sqlite3 connection (presumably opened lazily on first use — confirm in implementation). */
export declare function getDb(): Database.Database;
/** Closes the shared database connection. */
export declare function closeDb(): void;