@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -1,124 +0,0 @@
1
- import { getDb } from '../db/index.js';
2
- import { loadGraphFromSnapshot } from '../db/graphLoader.js';
3
- import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
4
- import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
5
- import { PageRepository } from '../db/repositories/PageRepository.js';
6
- import { computePageRank } from '../graph/pagerank.js';
7
- import { calculateMetrics } from '../graph/metrics.js';
8
- import { computeHITS } from '../scoring/hits.js';
9
-
10
/**
 * Recomputes link-graph metrics for a finished crawl and stores them.
 *
 * Steps, in order: load the snapshot's graph, run PageRank and HITS over it,
 * write per-page metrics plus duplicate/content clusters inside a single
 * transaction, then mark the snapshot `completed` with aggregate statistics.
 * Logs an error and returns without doing anything when the snapshot does not exist.
 *
 * @param snapshotId   Snapshot whose pages and edges are analysed.
 * @param maxDepth     Passed through to `calculateMetrics`.
 * @param limitReached Stored on the snapshot as `limit_reached` (1 or 0).
 */
export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitReached: boolean = false) {
  const database = getDb();
  const metricsStore = new MetricsRepository(database);
  const snapshotStore = new SnapshotRepository(database);
  const pageStore = new PageRepository(database);

  const snapshotRow = snapshotStore.getSnapshot(snapshotId);
  if (!snapshotRow) {
    console.error(`Snapshot ${snapshotId} not found`);
    return;
  }
  const siteId = snapshotRow.site_id;

  console.log('Loading graph for metrics calculation...');
  const graph = loadGraphFromSnapshot(snapshotId);

  console.log('Computing PageRank...');
  computePageRank(graph);

  console.log('Computing HITS...');
  computeHITS(graph);

  console.log('Updating metrics in DB...');
  const graphNodes = graph.getNodes();

  const persistAll = database.transaction(() => {
    for (const node of graphNodes) {
      const pageId = pageStore.getIdByUrl(siteId, node.url);
      if (!pageId) {
        continue;
      }

      // Crawl-time columns come from the row already stored for this page;
      // graph-derived columns come from the freshly scored node.
      const previous = metricsStore.getMetricsForPage(snapshotId, pageId);

      metricsStore.insertMetrics({
        snapshot_id: snapshotId,
        page_id: pageId,
        authority_score: node.authorityScore ?? null,
        hub_score: node.hubScore ?? null,
        pagerank: node.pageRank ?? null,
        pagerank_score: node.pageRankScore ?? null,
        link_role: node.linkRole ?? null,
        crawl_status: previous?.crawl_status ?? null,
        word_count: previous?.word_count ?? null,
        thin_content_score: previous?.thin_content_score ?? null,
        external_link_ratio: previous?.external_link_ratio ?? null,
        orphan_score: previous?.orphan_score ?? null,
        duplicate_cluster_id: node.duplicateClusterId ?? null,
        duplicate_type: node.duplicateType ?? null,
        is_cluster_primary: node.isClusterPrimary ? 1 : 0
      });

      // Page-level trap/redirect/byte data is only rewritten when the node carries any of it.
      const hasPageLevelData = node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived;
      if (hasPageLevelData) {
        pageStore.upsertPage({
          site_id: siteId,
          normalized_url: node.url,
          last_seen_snapshot_id: snapshotId,
          redirect_chain: node.redirectChain ? JSON.stringify(node.redirectChain) : null,
          bytes_received: node.bytesReceived ?? null,
          crawl_trap_flag: node.crawlTrapFlag ? 1 : 0,
          crawl_trap_risk: node.crawlTrapRisk ?? null,
          trap_type: node.trapType ?? null,
        });
      }
    }

    // Duplicate clusters
    const duplicateClusters = graph.duplicateClusters;
    if (duplicateClusters.length > 0) {
      const insertDuplicate = database.prepare(`
        INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
        VALUES (?, ?, ?, ?, ?, ?)
      `);
      duplicateClusters.forEach((c) => {
        insertDuplicate.run(c.id, snapshotId, c.type, c.size, c.representative, c.severity);
      });
    }

    // Content clusters
    const contentClusters = graph.contentClusters;
    if (contentClusters.length > 0) {
      const insertContent = database.prepare(`
        INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
        VALUES (?, ?, ?, ?, ?, ?)
      `);
      contentClusters.forEach((c) => {
        insertContent.run(c.id, snapshotId, c.count, c.primaryUrl, c.risk, c.sharedPathPrefix ?? null);
      });
    }
  });
  persistAll();

  console.log('Computing aggregate stats...');
  const aggregate = calculateMetrics(graph, maxDepth);

  // Health score: depth-weighted mean of each node's authority score
  // (falling back to its PageRank score, then 0), scaled to 0-100.
  let weightedSum = 0;
  let weightSum = 0;
  for (const node of graphNodes) {
    const depthWeight = 1 / (node.depth + 1);
    weightedSum += (node.authorityScore || node.pageRankScore || 0) * depthWeight;
    weightSum += depthWeight;
  }
  const healthScore = weightSum > 0 ? (weightedSum / weightSum) * 100 : 0;

  const thinRow = database
    .prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70')
    .get(snapshotId) as { count: number };

  snapshotStore.updateSnapshotStatus(snapshotId, 'completed', {
    node_count: aggregate.totalPages,
    edge_count: aggregate.totalEdges,
    health_score: healthScore,
    orphan_count: aggregate.orphanPages.length,
    thin_content_count: thinRow.count,
    limit_reached: limitReached ? 1 : 0
  });

  console.log('Metrics calculation complete.');
}
@@ -1,108 +0,0 @@
1
/** Options controlling how {@link normalizeUrl} treats the query string. */
export interface NormalizeOptions {
  /** When true, the entire query string is dropped instead of being filtered and sorted. */
  stripQuery?: boolean;
}

/** Click/campaign tracking parameters (lowercase) that are removed from the query string. */
const TRACKING_PARAMS = new Set([
  'utm_source',
  'utm_medium',
  'utm_campaign',
  'utm_term',
  'utm_content',
  'fbclid',
  'gclid',
  'msclkid'
]);

/** Path extensions (lowercase, including the dot) that mark a URL as a non-HTML asset. */
const SKIP_EXTENSIONS = new Set([
  '.pdf', '.jpg', '.png', '.svg', '.webp', '.gif',
  '.zip', '.xml', '.json', '.mp4'
]);

/**
 * Normalizes a URL so that equivalent addresses compare equal as strings.
 *
 * Applied rules: resolve against `base` (when given), accept only http/https,
 * lowercase the hostname, drop default ports and the fragment, remove tracking
 * parameters and sort the rest (or drop the whole query with `stripQuery`),
 * collapse repeated slashes, and strip a trailing slash from non-root paths.
 *
 * Tracking parameters are matched case-insensitively, so `UTM_Source` and
 * `GCLID` are removed just like `utm_source` and `gclid`; previously such
 * variants survived and the same page produced several distinct results.
 *
 * @param input   Absolute or relative URL.
 * @param base    Base URL used to resolve `input`; pass `''` when `input` is absolute.
 * @param options See {@link NormalizeOptions}.
 * @returns The normalized URL, or `null` when the input cannot be parsed, is not
 *          http(s), or its path ends in one of the skipped asset extensions.
 */
export function normalizeUrl(input: string, base: string, options: NormalizeOptions = {}): string | null {
  try {
    // 1. Resolve to an absolute URL
    const u = base ? new URL(input, base) : new URL(input);

    // 2. Allow only http/https
    if (u.protocol !== 'http:' && u.protocol !== 'https:') {
      return null;
    }

    // 3. Lowercase hostname
    u.hostname = u.hostname.toLowerCase();

    // 4. Remove default ports
    if ((u.protocol === 'http:' && u.port === '80') || (u.protocol === 'https:' && u.port === '443')) {
      u.port = '';
    }

    // 5. Remove hash fragment
    u.hash = '';

    // 6. Query handling
    if (options.stripQuery) {
      u.search = '';
    } else {
      const kept = new URLSearchParams();
      new URLSearchParams(u.search).forEach((value, key) => {
        // Compare case-insensitively; the retained key keeps its original spelling.
        const lowered = key.toLowerCase();
        if (lowered.startsWith('utm_') || TRACKING_PARAMS.has(lowered)) {
          return;
        }
        kept.append(key, value);
      });

      // Sort so parameter order does not create distinct URLs.
      kept.sort();
      // An empty result clears the query (no dangling "?").
      u.search = kept.toString();
    }

    // 7. Collapse duplicate slashes, then drop a trailing slash unless the path is the root
    let pathname = u.pathname.replace(/\/+/g, '/');
    if (pathname.length > 1 && pathname.endsWith('/')) {
      pathname = pathname.slice(0, -1);
    }
    u.pathname = pathname;

    // 8. Skip non-HTML assets by extension
    const lastDotIndex = u.pathname.lastIndexOf('.');
    if (lastDotIndex !== -1) {
      const ext = u.pathname.slice(lastDotIndex).toLowerCase();
      if (SKIP_EXTENSIONS.has(ext)) {
        return null;
      }
    }

    return u.toString();
  } catch (_e) {
    // Unparseable input is reported as "no usable URL".
    return null;
  }
}
@@ -1,190 +0,0 @@
1
- import * as cheerio from 'cheerio';
2
- import crypto from 'node:crypto';
3
- import { normalizeUrl } from './normalize.js';
4
- import { SimHash } from '../graph/simhash.js';
5
-
6
/** A hyperlink found on a page together with its placement-based weight. */
export interface ParseLink {
  /** Absolute http(s) URL with the fragment removed. */
  url: string;
  /** 1.0 by default, 0.7 in nav/header/menu context, 0.4 in footer context. */
  weight: number;
}

/** Everything {@link Parser.parse} extracts from one HTML document. */
export interface ParseResult {
  /** Followable links, de-duplicated by URL (highest weight wins). */
  links: ParseLink[];
  /** The raw HTML exactly as passed in. */
  html: string;
  /** Normalized canonical URL, or null when absent or invalid. */
  canonical: string | null;
  /** True when the robots meta tag contains `noindex` or `none`. */
  noindex: boolean;
  /** True when the robots meta tag contains `nofollow` or `none`. */
  nofollow: boolean;
  /** SHA-256 (hex) of the whitespace-collapsed body text without script/style/noscript/iframe. */
  contentHash: string;
  /** SimHash of the first 50k characters of body text, as a string. */
  simhash?: string;
  /** Distinct tokens divided by total tokens over the same 50k-character window; 0 when there are none. */
  uniqueTokenRatio?: number;
  /** Heuristic soft-404 likelihood, capped at 1.0; only computed for status 200. */
  soft404Score: number;
  /** Names of the heuristics that contributed to `soft404Score`. */
  soft404Signals: string[];
}

export class Parser {
  /** Substrings whose presence in the title or first h1 suggests an error page. */
  private static readonly ERROR_PATTERNS = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];

  /**
   * Token separator: any run of characters that are not letters, combining
   * marks, digits or underscore. For ASCII text this splits exactly like
   * `\W+`; unlike `\W+` it keeps non-Latin and accented words intact, so
   * pages in those scripts no longer collapse to an empty token list (and
   * therefore to the same simhash).
   */
  private static readonly TOKEN_SEPARATOR = /[^\p{L}\p{M}\p{N}_]+/u;

  /** Returns the first error pattern contained in `text`, if any. */
  private static firstErrorPattern(text: string): string | undefined {
    return Parser.ERROR_PATTERNS.find((pattern) => text.includes(pattern));
  }

  /**
   * Parses HTML content to extract metadata and links.
   *
   * @param html    Raw document markup.
   * @param baseUrl URL the document was fetched from; used to resolve relative links.
   * @param status  HTTP status of the response; soft-404 scoring runs only for 200.
   */
  parse(html: string, baseUrl: string, status: number): ParseResult {
    const $ = cheerio.load(html);

    // 1. Robots meta
    let noindex = false;
    let nofollow = false;
    const robotsMeta = $('meta[name="robots"]').attr('content');
    if (robotsMeta) {
      const directives = robotsMeta.toLowerCase().split(',').map(s => s.trim());
      if (directives.includes('noindex') || directives.includes('none')) noindex = true;
      if (directives.includes('nofollow') || directives.includes('none')) nofollow = true;
    }

    // 2. Canonical
    let canonical: string | null = null;
    const canonicalLink = $('link[rel="canonical"]').attr('href');
    if (canonicalLink) {
      try {
        // Resolve relative canonicals, then normalize without stripping the query
        // because the query may be part of the canonical address.
        const u = new URL(canonicalLink, baseUrl);
        canonical = normalizeUrl(u.toString(), '', { stripQuery: false });
      } catch (_e) {
        // Invalid canonical URL, ignore
      }
    }

    // 3. Links (skipped entirely when the page is nofollow)
    const links = new Map<string, number>();
    if (!nofollow) {
      $('a').each((_, element) => {
        const href = $(element).attr('href');
        const rel = $(element).attr('rel');
        const isNofollow = rel && rel.toLowerCase().includes('nofollow');

        if (href && !isNofollow) {
          try {
            const absoluteUrl = new URL(href, baseUrl);
            if (absoluteUrl.protocol === 'http:' || absoluteUrl.protocol === 'https:') {
              absoluteUrl.hash = '';
              const urlStr = absoluteUrl.toString();

              let weight = 1.0; // Default: body link

              const $el = $(element);
              if ($el.closest('nav').length > 0 || $el.closest('header').length > 0) {
                weight = 0.7;
              } else if ($el.closest('footer').length > 0) {
                weight = 0.4;
              } else {
                // Fallback: look for nav/menu/footer hints in the class/id of the
                // parent and grandparent elements.
                const parentText = ($el.parent().attr('class') || '') + ($el.parent().attr('id') || '');
                const grandParentText = ($el.parent().parent().attr('class') || '') + ($el.parent().parent().attr('id') || '');
                const combinedContext = (parentText + grandParentText).toLowerCase();

                if (combinedContext.includes('nav') || combinedContext.includes('menu')) {
                  weight = 0.7;
                } else if (combinedContext.includes('footer')) {
                  weight = 0.4;
                }
              }

              // Keep the highest weight when several anchors point at the same URL
              const currentMax = links.get(urlStr) || 0;
              if (weight > currentMax) {
                links.set(urlStr, weight);
              }
            }
          } catch (_e) {
            // Invalid URL
          }
        }
      });
    }

    // 4. Content hash (ignoring script/style/noscript/iframe).
    // This mutates the loaded document; title and h1 are untouched, so the
    // soft-404 lookups below still see them.
    $('script').remove();
    $('style').remove();
    $('noscript').remove();
    $('iframe').remove();

    const cleanText = $('body').text().replace(/\s+/g, ' ').trim();
    const contentHash = crypto.createHash('sha256').update(cleanText).digest('hex');

    // 4b. Simhash & token statistics (limited to 50k chars for performance)
    const limitedText = cleanText.substring(0, 50000).toLowerCase();
    const tokens = limitedText.split(Parser.TOKEN_SEPARATOR).filter(t => t.length > 0);
    const uniqueTokens = new Set(tokens);
    const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
    const simhash = SimHash.generate(tokens).toString();

    // 5. Soft 404 detection
    let soft404Score = 0;
    const soft404Signals: string[] = [];

    if (status === 200) {
      const title = $('title').text().toLowerCase();
      const h1Text = $('h1').first().text().toLowerCase();
      const bodyText = cleanText.toLowerCase();

      // Only the first matching pattern counts for the title and for the h1.
      const titleHit = Parser.firstErrorPattern(title);
      if (titleHit !== undefined) {
        soft404Score += 0.4;
        soft404Signals.push(`title_pattern_${titleHit.replace(/\s+/g, '_')}`);
      }

      const h1Hit = Parser.firstErrorPattern(h1Text);
      if (h1Hit !== undefined) {
        soft404Score += 0.3;
        soft404Signals.push(`h1_pattern_${h1Hit.replace(/\s+/g, '_')}`);
      }

      if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
        soft404Score += 0.2;
        soft404Signals.push('body_error_phrase');
      }

      // Content length check (word count approximation)
      const words = cleanText.split(/\s+/).filter(w => w.length > 0);
      if (words.length < 50) {
        soft404Score += 0.3;
        soft404Signals.push('very_low_word_count');
      } else if (words.length < 150) {
        soft404Score += 0.1;
        soft404Signals.push('low_word_count');
      }

      // Link count check
      if (links.size === 0) {
        soft404Score += 0.2;
        soft404Signals.push('no_outbound_links');
      }

      // Cap at 1.0
      soft404Score = Math.min(1.0, soft404Score);
    }

    return {
      links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
      html: html, // raw HTML is passed through for later analysis
      canonical,
      noindex,
      nofollow,
      contentHash,
      simhash,
      uniqueTokenRatio,
      soft404Score,
      soft404Signals
    };
  }
}
@@ -1,73 +0,0 @@
1
- import { request } from 'undici';
2
- import * as cheerio from 'cheerio';
3
- import { normalizeUrl } from './normalize.js';
4
-
5
export class Sitemap {
  /**
   * Downloads a sitemap or sitemap index and returns the normalized page URLs
   * it lists. Nested indexes are followed recursively; each sitemap URL is
   * processed at most once and the number of sitemaps visited is capped.
   */
  async fetch(url: string): Promise<string[]> {
    const seenSitemaps = new Set<string>();
    const pageUrls = new Set<string>();

    await this.processSitemap(url, seenSitemaps, pageUrls);

    return [...pageUrls];
  }

  /**
   * Fetches one sitemap document and either recurses into its child sitemaps
   * or adds its `<url><loc>` entries to `urls`. Failures are logged as
   * warnings and do not propagate.
   */
  private async processSitemap(url: string, visited: Set<string>, urls: Set<string>) {
    if (visited.has(url)) return;
    visited.add(url);

    // Hard limit on the number of sitemaps fetched, to prevent abuse
    if (visited.size > 50) return;

    try {
      const response = await request(url, {
        maxRedirections: 3,
        headers: { 'User-Agent': 'crawlith/1.0' },
        headersTimeout: 10000,
        bodyTimeout: 10000
      });

      const isSuccess = response.statusCode >= 200 && response.statusCode < 300;
      if (!isSuccess) {
        // Discard the unread body before returning
        await response.body.dump();
        return;
      }

      const xml = await response.body.text();
      // Basic validation: the payload must at least look like markup
      if (!xml.trim().startsWith('<')) return;

      const $ = cheerio.load(xml, { xmlMode: true });

      const indexEntries = $('sitemap > loc');
      if (indexEntries.length === 0) {
        // Plain URL set
        $('url > loc').each((_, el) => {
          const loc = $(el).text().trim();
          if (!loc) return;
          const normalized = normalizeUrl(loc, '');
          if (normalized) {
            urls.add(normalized);
          }
        });
        return;
      }

      // Sitemap index: collect child locations first, then walk them one at a
      // time to avoid a burst of concurrent requests.
      const childSitemaps: string[] = [];
      for (const el of indexEntries.toArray()) {
        const loc = $(el).text().trim();
        if (loc) childSitemaps.push(loc);
      }
      for (const childUrl of childSitemaps) {
        await this.processSitemap(childUrl, visited, urls);
      }
    } catch (e) {
      console.warn(`Failed to fetch sitemap ${url}:`, e);
    }
  }
}
@@ -1,96 +0,0 @@
1
-
2
/** Kinds of crawl trap that {@link TrapDetector} can report. */
export type TrapType = 'faceted_navigation' | 'calendar_trap' | 'pagination_loop' | 'session_trap';

/** Outcome of a trap check. */
export interface TrapResult {
  /** Highest risk among the signals that fired, 0 when none did. */
  risk: number;
  /** The signal that produced `risk`, or null when no signal fired. */
  type: TrapType | null;
}

/**
 * Stateful heuristic detector for URLs that can make a crawler loop or explode:
 * session identifiers, date-based archive paths, deep pagination and
 * ever-growing query-parameter combinations on one path.
 */
export class TrapDetector {
  /** Distinct sorted query strings seen per origin+path. */
  private pathCounters = new Map<string, Set<string>>();
  /** Highest numeric page parameter seen per origin+path. */
  private paginationCounters = new Map<string, number>();
  private sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);

  // Configurable thresholds
  private PARAM_EXPLOSION_THRESHOLD = 30;
  private PAGINATION_THRESHOLD = 50;

  /**
   * Date-like path segment: YYYY/MM/DD or DD/MM/YYYY with `/` or `-` as
   * separators, followed by another segment or the end of the path (so
   * `/2023/12/01` matches with or without a trailing slash).
   */
  private static readonly CALENDAR_PATTERN =
    /\/\d{4}[-/]\d{2}[-/]\d{2}(?:\/|$)|\/\d{2}[-/]\d{2}[-/]\d{4}(?:\/|$)/;

  /**
   * @param options.paramThreshold      Distinct query strings per path above which a path counts as faceted navigation (default 30).
   * @param options.paginationThreshold Page number above which pagination counts as a loop (default 50).
   */
  constructor(options: { paramThreshold?: number, paginationThreshold?: number } = {}) {
    if (options.paramThreshold) this.PARAM_EXPLOSION_THRESHOLD = options.paramThreshold;
    if (options.paginationThreshold) this.PAGINATION_THRESHOLD = options.paginationThreshold;
  }

  /**
   * Checks if a URL represents a potential crawl trap.
   *
   * Each call also records the URL's query string and page number, so results
   * for later URLs on the same path depend on what was seen before.
   * An unparseable URL yields `{ risk: 0, type: null }`.
   *
   * @param rawUrl Absolute URL to evaluate.
   * @param _depth Unused.
   */
  checkTrap(rawUrl: string, _depth: number): TrapResult {
    let risk = 0;
    let type: TrapType | null = null;

    // Keep risk and type in step: a signal only becomes the reported type
    // when it raises the risk.
    const flag = (candidateRisk: number, candidateType: TrapType): void => {
      if (candidateRisk > risk) {
        risk = candidateRisk;
        type = candidateType;
      }
    };

    try {
      const u = new URL(rawUrl);
      const params = new URLSearchParams(u.search);
      const pathname = u.pathname;
      const pathKey = `${u.origin}${pathname}`;

      // 1. Session IDs / tracking parameters.
      // Entries are counted here rather than read from `params.size`, which is
      // not available on every runtime.
      let paramCount = 0;
      params.forEach((_value, key) => {
        paramCount++;
        const lowered = key.toLowerCase();
        if (this.sessionParams.has(lowered) || lowered.includes('session')) {
          flag(0.9, 'session_trap');
        }
      });

      // 2. Calendar pattern
      if (TrapDetector.CALENDAR_PATTERN.test(pathname)) {
        flag(0.7, 'calendar_trap');
      }

      // 3. Pagination loop
      const pageParam = params.get('page') || params.get('p') || params.get('pg');
      if (pageParam && /^\d+$/.test(pageParam)) {
        const pageNum = parseInt(pageParam, 10);
        const currentMaxPage = this.paginationCounters.get(pathKey) || 0;

        if (pageNum > currentMaxPage) {
          this.paginationCounters.set(pathKey, pageNum);
        }

        if (pageNum > this.PAGINATION_THRESHOLD) {
          flag(0.85, 'pagination_loop');
        }
      }

      // 4. Parameter explosion (faceted navigation)
      if (paramCount > 0) {
        const paramSet = this.pathCounters.get(pathKey) || new Set<string>();
        // Sort so the same parameters in a different order count once.
        params.sort();
        paramSet.add(params.toString());
        this.pathCounters.set(pathKey, paramSet);

        if (paramSet.size > this.PARAM_EXPLOSION_THRESHOLD) {
          flag(0.95, 'faceted_navigation');
        }
      }
    } catch (_e) {
      // Invalid URL
    }

    return { risk, type };
  }

  /**
   * Resets internal state (useful for multi-crawl sessions if needed)
   */
  reset() {
    this.pathCounters.clear();
    this.paginationCounters.clear();
  }
}
@@ -1,105 +0,0 @@
1
- import { getDb } from './index.js';
2
- import { PageRepository } from './repositories/PageRepository.js';
3
- import { EdgeRepository } from './repositories/EdgeRepository.js';
4
- import { MetricsRepository, DbMetrics } from './repositories/MetricsRepository.js';
5
- import { SnapshotRepository } from './repositories/SnapshotRepository.js';
6
- import { Graph } from '../graph/graph.js';
7
-
8
/**
 * Rebuilds the in-memory link graph for one snapshot from the database.
 *
 * Every page of the snapshot becomes a node carrying its stored page fields
 * and, when present, its metrics row; every edge whose two endpoints are
 * pages of the snapshot is added with its stored weight (1.0 when the stored
 * weight is falsy). Duplicate and content clusters are attached to the graph.
 *
 * @param snapshotId Snapshot to load.
 * @returns The populated graph.
 */
export function loadGraphFromSnapshot(snapshotId: number): Graph {
  const database = getDb();
  const pageStore = new PageRepository(database);
  const edgeStore = new EdgeRepository(database);
  const metricsStore = new MetricsRepository(database);
  const snapshotStore = new SnapshotRepository(database);

  const pageRows = pageStore.getPagesBySnapshot(snapshotId);
  const metricRows = metricsStore.getMetrics(snapshotId);
  const snapshotRow = snapshotStore.getSnapshot(snapshotId);

  // Metrics indexed by page id for constant-time lookup per page
  const metricsByPage = new Map<number, DbMetrics>();
  metricRows.forEach((row) => {
    metricsByPage.set(row.page_id, row);
  });

  const graph = new Graph();
  if (snapshotRow) {
    graph.limitReached = !!snapshotRow.limit_reached;
  }

  // Page id -> URL, needed to translate edge rows into node keys
  const urlByPageId = new Map<number, string>();

  for (const page of pageRows) {
    const url = page.normalized_url;
    urlByPageId.set(page.id, url);
    graph.addNode(url, page.depth, page.http_status || 0);

    const metric = metricsByPage.get(page.id);

    // A page first seen in this snapshot is "new"; otherwise the stored crawl
    // status decides between "unchanged" (cached) and "changed" (fetched).
    const incrementalStatus: 'new' | 'changed' | 'unchanged' | undefined =
      page.first_seen_snapshot_id === snapshotId
        ? 'new'
        : metric?.crawl_status === 'cached'
          ? 'unchanged'
          : metric?.crawl_status === 'fetched'
            ? 'changed'
            : undefined;

    graph.updateNodeData(url, {
      canonical: page.canonical_url || undefined,
      contentHash: page.content_hash || undefined,
      simhash: page.simhash || undefined,
      etag: page.etag || undefined,
      lastModified: page.last_modified || undefined,
      html: page.html || undefined,
      soft404Score: page.soft404_score || undefined,
      noindex: !!page.noindex,
      nofollow: !!page.nofollow,
      incrementalStatus,
      securityError: page.security_error || undefined,
      retries: page.retries || undefined,
      bytesReceived: page.bytes_received || undefined,
      redirectChain: page.redirect_chain ? JSON.parse(page.redirect_chain) : undefined,
      crawlTrapFlag: !!page.crawl_trap_flag,
      crawlTrapRisk: page.crawl_trap_risk || undefined,
      trapType: page.trap_type || undefined,
      // Metrics
      pageRank: metric?.pagerank ?? undefined,
      pageRankScore: metric?.pagerank_score ?? metric?.pagerank ?? undefined,
      authorityScore: metric?.authority_score ?? undefined,
      hubScore: metric?.hub_score ?? undefined,
      linkRole: metric?.link_role ?? undefined,
      // Duplicate info
      duplicateClusterId: metric?.duplicate_cluster_id ?? undefined,
      duplicateType: metric?.duplicate_type ?? undefined,
      isClusterPrimary: metric?.is_cluster_primary ? true : undefined,
    });
  }

  // Edges whose source or target is not a page of this snapshot are skipped.
  for (const edge of edgeStore.getEdgesBySnapshot(snapshotId)) {
    const from = urlByPageId.get(edge.source_page_id);
    const to = urlByPageId.get(edge.target_page_id);
    if (!from || !to) {
      continue;
    }
    graph.addEdge(from, to, edge.weight || 1.0);
  }

  // Duplicate clusters
  const duplicateRows = database
    .prepare('SELECT * FROM duplicate_clusters WHERE snapshot_id = ?')
    .all(snapshotId) as any[];
  graph.duplicateClusters = duplicateRows.map((row) => {
    const { id, type, size, representative, severity } = row;
    return { id, type, size, representative, severity };
  });

  // Content clusters
  const contentRows = database
    .prepare('SELECT * FROM content_clusters WHERE snapshot_id = ?')
    .all(snapshotId) as any[];
  graph.contentClusters = contentRows.map((row) => ({
    id: row.id,
    count: row.count,
    primaryUrl: row.primary_url,
    risk: row.risk,
    sharedPathPrefix: row.shared_path_prefix || undefined
  }));

  return graph;
}