@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,171 +0,0 @@
1
- import * as dns from 'dns';
2
- import * as net from 'net';
3
- import { promisify } from 'util';
4
- import { Agent } from 'undici';
5
-
6
- const resolve4 = promisify(dns.resolve4);
7
- const resolve6 = promisify(dns.resolve6);
8
-
9
- export class IPGuard {
10
- /**
11
- * Checks if an IP address is internal/private
12
- */
13
- static isInternal(ip: string): boolean {
14
- if (net.isIPv4(ip)) {
15
- const parts = ip.split('.').map(Number);
16
-
17
- // 127.0.0.0/8
18
- if (parts[0] === 127) return true;
19
-
20
- // 10.0.0.0/8
21
- if (parts[0] === 10) return true;
22
-
23
- // 192.168.0.0/16
24
- if (parts[0] === 192 && parts[1] === 168) return true;
25
-
26
- // 172.16.0.0 – 172.31.255.255
27
- if (parts[0] === 172 && parts[1] >= 16 && parts[1] <= 31) return true;
28
-
29
- // 169.254.0.0/16
30
- if (parts[0] === 169 && parts[1] === 254) return true;
31
-
32
- // 0.0.0.0/8
33
- if (parts[0] === 0) return true;
34
-
35
- return false;
36
- }
37
-
38
- if (net.isIPv6(ip)) {
39
- // Normalize IPv6
40
- const expanded = IPGuard.expandIPv6(ip);
41
-
42
- // ::1
43
- if (expanded === '0000:0000:0000:0000:0000:0000:0000:0001') return true;
44
-
45
- // fc00::/7 (Unique Local Address) -> fc or fd
46
- const firstWord = parseInt(expanded.split(':')[0], 16);
47
- if ((firstWord & 0xfe00) === 0xfc00) return true;
48
-
49
- // fe80::/10 (Link Local)
50
- if ((firstWord & 0xffc0) === 0xfe80) return true;
51
-
52
- // IPv4-mapped IPv6: ::ffff:0:0/96
53
- if (expanded.startsWith('0000:0000:0000:0000:0000:ffff:')) {
54
- const parts = expanded.split(':');
55
- const p7 = parseInt(parts[6], 16);
56
- const p8 = parseInt(parts[7], 16);
57
- const ip4 = `${(p7 >> 8) & 255}.${p7 & 255}.${(p8 >> 8) & 255}.${p8 & 255}`;
58
- return IPGuard.isInternal(ip4);
59
- }
60
-
61
- return false;
62
- }
63
-
64
- return true; // Unknown format, block it for safety
65
- }
66
-
67
- /**
68
- * Resolves a hostname and validates all result IPs
69
- */
70
- static async validateHost(host: string): Promise<boolean> {
71
- if (net.isIP(host)) {
72
- return !IPGuard.isInternal(host);
73
- }
74
-
75
- try {
76
- const res4 = await resolve4(host).catch(() => [] as string[]);
77
- const res6 = await resolve6(host).catch(() => [] as string[]);
78
- const ips = [...res4, ...res6];
79
-
80
- if (ips.length === 0) return true; // Let the fetcher handle DNS failures
81
-
82
- return ips.every(ip => !IPGuard.isInternal(ip));
83
- } catch (_e) {
84
- // If resolution fails drastically, we block for safety or let fetcher try
85
- return false;
86
- }
87
- }
88
-
89
- /**
90
- * Custom lookup function for undici that validates the resolved IP.
91
- * Prevents DNS Rebinding attacks by checking the IP immediately before connection.
92
- */
93
- static secureLookup(
94
- hostname: string,
95
- options: dns.LookupOneOptions | dns.LookupAllOptions,
96
- callback: (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => void
97
- ): void {
98
- dns.lookup(hostname, options as any, (err: NodeJS.ErrnoException | null, address: string | dns.LookupAddress[], family: number) => {
99
- if (err) {
100
- return callback(err, address as any, family);
101
- }
102
-
103
- const checkIP = (ip: string) => {
104
- if (IPGuard.isInternal(ip)) {
105
- return new Error(`Blocked internal IP: ${ip}`);
106
- }
107
- return null;
108
- };
109
-
110
- if (typeof address === 'string') {
111
- const error = checkIP(address);
112
- if (error) {
113
- // Return a custom error that undici will propagate
114
- const blockedError = new Error(`Blocked internal IP: ${address}`);
115
- (blockedError as any).code = 'EBLOCKED';
116
- return callback(blockedError, address, family);
117
- }
118
- } else if (Array.isArray(address)) {
119
- // Handle array of addresses (if options.all is true)
120
- for (const addr of address) {
121
- const error = checkIP(addr.address);
122
- if (error) {
123
- const blockedError = new Error(`Blocked internal IP: ${addr.address}`);
124
- (blockedError as any).code = 'EBLOCKED';
125
- return callback(blockedError, address, family);
126
- }
127
- }
128
- }
129
-
130
- callback(null, address, family);
131
- });
132
- }
133
-
134
- /**
135
- * Returns an undici Agent configured with secure DNS lookup.
136
- */
137
- static getSecureDispatcher(): Agent {
138
- return new Agent({
139
- connect: {
140
- lookup: IPGuard.secureLookup as any
141
- }
142
- });
143
- }
144
-
145
- private static expandIPv6(ip: string): string {
146
- if (ip === '::') return '0000:0000:0000:0000:0000:0000:0000:0000';
147
-
148
- let normalizedIp = ip;
149
- if (ip.includes('.')) {
150
- const lastColonIndex = ip.lastIndexOf(':');
151
- const lastPart = ip.substring(lastColonIndex + 1);
152
- if (net.isIPv4(lastPart)) {
153
- const parts = lastPart.split('.').map(Number);
154
- const hex1 = ((parts[0] << 8) | parts[1]).toString(16);
155
- const hex2 = ((parts[2] << 8) | parts[3]).toString(16);
156
- normalizedIp = ip.substring(0, lastColonIndex + 1) + hex1 + ':' + hex2;
157
- }
158
- }
159
-
160
- let full = normalizedIp;
161
- if (normalizedIp.includes('::')) {
162
- const parts = normalizedIp.split('::');
163
- const left = parts[0].split(':').filter(x => x !== '');
164
- const right = parts[1].split(':').filter(x => x !== '');
165
- const missing = 8 - (left.length + right.length);
166
- const middle = Array(missing).fill('0000');
167
- full = [...left, ...middle, ...right].join(':');
168
- }
169
- return full.split(':').map(part => part.padStart(4, '0')).join(':');
170
- }
171
- }
@@ -1,9 +0,0 @@
1
- import { Crawler, CrawlOptions } from './crawler.js';
2
- import { EngineContext } from '../events.js';
3
-
4
- export { CrawlOptions };
5
-
6
- export async function crawl(startUrl: string, options: CrawlOptions, context?: EngineContext): Promise<number> {
7
- const crawler = new Crawler(startUrl, options, context);
8
- return crawler.run();
9
- }