@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,251 +0,0 @@
1
- import { request, Dispatcher } from 'undici';
2
- import * as net from 'net';
3
- import { IPGuard } from '../core/security/ipGuard.js';
4
- import { RateLimiter } from '../core/network/rateLimiter.js';
5
- import { RetryPolicy } from '../core/network/retryPolicy.js';
6
- import { ResponseLimiter } from '../core/network/responseLimiter.js';
7
- import { RedirectController } from '../core/network/redirectController.js';
8
- import { ProxyAdapter } from '../core/network/proxyAdapter.js';
9
- import { ScopeManager } from '../core/scope/scopeManager.js';
10
- import { version } from '../utils/version.js';
11
-
12
/** One hop of an HTTP redirect chain recorded while fetching a URL. */
export interface RedirectStep {
  /** URL that answered with the redirect response. */
  url: string;
  /** HTTP status code of that redirect response. */
  status: number;
  /** URL the redirect pointed to. */
  target: string;
}
17
-
18
/**
 * Outcome of a fetch: either a real HTTP response or a tagged failure.
 */
export interface FetchResult {
  /**
   * Numeric HTTP status code of the final response, or a string tag naming
   * the reason no usable response was produced.
   */
  status:
    | number
    | 'blocked_internal_ip'
    | 'blocked_by_domain_filter'
    | 'blocked_subdomain'
    | 'oversized'
    | 'failed_after_retries'
    | 'network_error'
    | 'redirect_limit_exceeded'
    | 'redirect_loop'
    | 'proxy_connection_failed';
  /** Response headers; empty for tagged failures built without a response. */
  headers: Record<string, string | string[] | undefined>;
  /** Response body as text; empty string when no body was read. */
  body: string;
  /** Redirect hops followed before reaching `finalUrl`, in order. */
  redirectChain: RedirectStep[];
  /** `ETag` response header value, or null when absent or not applicable. */
  etag: string | null;
  /** `Last-Modified` response header value, or null when absent or not applicable. */
  lastModified: string | null;
  /** URL of the last request attempted in the chain. */
  finalUrl: string;
  /** Number of retry attempts performed while fetching. */
  retries?: number;
  /** Number of body bytes received, when the body was streamed. */
  bytesReceived?: number;
}
38
-
39
/** Per-request options accepted by `Fetcher.fetch`. */
export interface FetchOptions {
  /** Previously seen ETag; sent as `If-None-Match` on the first request of a chain. */
  etag?: string;
  /** Previously seen Last-Modified value; sent as `If-Modified-Since` on the first request of a chain. */
  lastModified?: string;
  /** NOTE(review): not read by `Fetcher.fetch` in this file; the limiter rate comes from the constructor. */
  rate?: number;
  /** Maximum number of body bytes to accept before the response is reported as oversized. */
  maxBytes?: number;
  /** Crawl delay forwarded to the rate limiter for the target host. */
  crawlDelay?: number;
}
46
-
47
/**
 * HTTP GET client used by the crawler.
 *
 * Each request goes through, in order: an SSRF check on IP-literal hosts,
 * optional scope validation, per-host rate limiting, a retry policy, manual
 * redirect following, and a streaming body-size limit. Failures on those
 * paths are reported through `FetchResult.status` tags rather than thrown.
 */
export class Fetcher {
  private userAgent = 'crawlith/1.0';
  private rateLimiter: RateLimiter;
  private proxyAdapter: ProxyAdapter;
  private secureDispatcher: Dispatcher;
  private scopeManager?: ScopeManager;
  private maxRedirects: number;

  /**
   * @param options.rate Rate passed to the rate limiter (defaults to 2).
   * @param options.proxyUrl Optional proxy URL; when the proxy adapter yields a
   *   dispatcher it replaces the IP-guarded dispatcher.
   * @param options.scopeManager Optional scope checker applied to every hop.
   * @param options.maxRedirects Redirect limit (defaults to 2, capped at 11).
   * @param options.userAgent User-Agent header value (defaults to `crawlith/<version>`).
   */
  constructor(options: {
    rate?: number;
    proxyUrl?: string;
    scopeManager?: ScopeManager;
    maxRedirects?: number;
    userAgent?: string;
  } = {}) {
    this.rateLimiter = new RateLimiter(options.rate || 2);
    this.proxyAdapter = new ProxyAdapter(options.proxyUrl);

    if (this.proxyAdapter.dispatcher) {
      this.secureDispatcher = this.proxyAdapter.dispatcher;
    } else {
      this.secureDispatcher = IPGuard.getSecureDispatcher();
    }

    this.scopeManager = options.scopeManager;
    this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
    this.userAgent = options.userAgent || `crawlith/${version}`;
  }

  /**
   * Fetches `url`, following redirects manually so every hop is re-validated.
   *
   * @param url Absolute URL to fetch. An unparsable URL makes `new URL` throw.
   * @param options Conditional-GET validators, body-size limit and crawl delay.
   * @returns The final response, or a tagged failure result.
   */
  async fetch(url: string, options: FetchOptions = {}): Promise<FetchResult> {
    const maxBytes = options.maxBytes || 2000000;
    const redirectChain: RedirectStep[] = [];
    const redirectController = new RedirectController(this.maxRedirects, url);

    let currentUrl = url;
    let totalRetries = 0;

    // Each iteration handles one hop; redirects `continue`, everything else returns.
    while (true) {
      const urlObj = new URL(currentUrl);

      // Retries spent on THIS hop only. The exhaustion check below must not be
      // influenced by retries that succeeded on earlier hops of the chain.
      let hopRetries = 0;

      // 1. SSRF guard (IP literals only) to fail fast. Hostnames are validated
      //    by the secure dispatcher at connection time, which avoids TOCTOU issues.
      if (net.isIP(urlObj.hostname) && IPGuard.isInternal(urlObj.hostname)) {
        return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
      }

      // 2. Scope validation (domain & subdomain)
      if (this.scopeManager) {
        const eligibility = this.scopeManager.isUrlEligible(currentUrl);
        if (eligibility !== 'allowed') {
          return this.errorResult(eligibility, currentUrl, redirectChain, totalRetries);
        }
      }

      // 3. Rate limiting
      await this.rateLimiter.waitForToken(urlObj.hostname, options.crawlDelay);

      try {
        // 4. Retry strategy
        const result = await RetryPolicy.execute(
          async (attempt) => {
            if (attempt > 0) {
              totalRetries++;
              hopRetries++;
            }

            const headers: Record<string, string> = {
              'User-Agent': this.userAgent
            };

            // Conditional GET only for the FIRST request in a chain: the
            // validators belong to the original URL, not to redirect targets.
            if (redirectChain.length === 0) {
              if (options.etag) headers['If-None-Match'] = options.etag;
              if (options.lastModified) headers['If-Modified-Since'] = options.lastModified;
            }

            const res = await request(currentUrl, {
              method: 'GET',
              headers,
              maxRedirections: 0,
              dispatcher: this.secureDispatcher,
              headersTimeout: 10000,
              bodyTimeout: 10000
            });

            if (RetryPolicy.isRetryableStatus(res.statusCode)) {
              // Drain the body so the connection can be reused before retrying.
              await res.body.dump();
              throw new Error(`Status ${res.statusCode}`);
            }

            return res;
          },
          (error) => RetryPolicy.isNetworkError(error) || error.message.startsWith('Status ')
        );

        const status = result.statusCode;
        const resHeaders = result.headers;

        const getHeader = (name: string): string | null => {
          const val = resHeaders[name.toLowerCase()];
          if (Array.isArray(val)) return val[0];
          return (val as string) || null;
        };

        const etag = getHeader('etag');
        const lastModified = getHeader('last-modified');

        // Handle redirects (304 is a conditional-GET answer, not a redirect)
        if (status >= 300 && status < 400 && status !== 304) {
          const location = getHeader('location');
          if (location) {
            let targetUrl: string;
            try {
              targetUrl = new URL(location, currentUrl).toString();
            } catch (_e) {
              // Unparsable Location header: treat this response as final.
              const body = await ResponseLimiter.streamToString(result.body, maxBytes);
              return {
                status,
                headers: resHeaders,
                body,
                redirectChain,
                etag: null,
                lastModified: null,
                finalUrl: currentUrl,
                retries: totalRetries
              };
            }

            const redirectError = redirectController.nextHop(targetUrl);
            if (redirectError) {
              await result.body.dump();
              return this.errorResult(redirectError, currentUrl, redirectChain, totalRetries);
            }

            redirectChain.push({ url: currentUrl, status, target: targetUrl });
            await result.body.dump();
            currentUrl = targetUrl;
            continue; // next iteration fetches the redirect target
          }
        }

        // 5. Max response size (streaming)
        let bytesReceived = 0;
        try {
          const body = status === 304 ? '' : await ResponseLimiter.streamToString(
            result.body,
            maxBytes,
            (bytes) => { bytesReceived = bytes; }
          );

          return {
            status,
            headers: resHeaders,
            body,
            redirectChain,
            etag,
            lastModified,
            finalUrl: currentUrl,
            retries: totalRetries,
            bytesReceived
          };
        } catch (e: unknown) {
          const oversized = (e as { message?: unknown } | null | undefined)?.message === 'Oversized response';
          if (oversized) {
            return {
              status: 'oversized',
              headers: resHeaders,
              body: '',
              redirectChain,
              etag: null,
              lastModified: null,
              finalUrl: currentUrl,
              retries: totalRetries,
              bytesReceived
            };
          }
          throw e;
        }

      } catch (error: unknown) {
        const err = (error ?? {}) as { code?: unknown; message?: unknown };
        const message = typeof err.message === 'string' ? err.message : '';

        if (err.code === 'EBLOCKED' || message.includes('Blocked internal IP')) {
          return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
        }

        // ECONNREFUSED only points at the proxy when a proxy is actually in
        // use; without one it is the target host refusing the connection.
        const proxyConfigured = Boolean(this.proxyAdapter.dispatcher);
        const isProxyError =
          message.toLowerCase().includes('proxy') ||
          (proxyConfigured && err.code === 'ECONNREFUSED');

        const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';

        return this.errorResult(
          hopRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus,
          currentUrl,
          redirectChain,
          totalRetries
        );
      }
    }
  }

  /**
   * Builds a body-less failure result carrying the given status tag.
   *
   * `status` stays loosely typed because callers forward tags produced by
   * ScopeManager and RedirectController, whose declared types live elsewhere.
   */
  private errorResult(status: any, finalUrl: string, redirectChain: RedirectStep[], retries: number): FetchResult {
    return {
      status,
      headers: {},
      body: '',
      redirectChain,
      etag: null,
      lastModified: null,
      finalUrl,
      retries
    };
  }
}
@@ -1,137 +0,0 @@
1
- import { getDb } from '../db/index.js';
2
- import { loadGraphFromSnapshot } from '../db/graphLoader.js';
3
- import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
4
- import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
5
- import { PageRepository } from '../db/repositories/PageRepository.js';
6
- import { computePageRank } from '../graph/pagerank.js';
7
- import { calculateMetrics } from '../graph/metrics.js';
8
- import { computeHITS } from '../scoring/hits.js';
9
- import { EngineContext } from '../events.js';
10
- import { calculateHealthScore, collectCrawlIssues } from '../scoring/health.js';
11
-
12
- import { Graph } from '../graph/graph.js';
13
-
14
/**
 * Computes graph metrics for a finished crawl snapshot and persists them.
 *
 * Steps: PageRank and HITS on the graph, per-page metric rows, crawl-trap /
 * redirect / byte-count page updates, duplicate and content clusters (all in
 * one transaction), then aggregate stats and a health score written onto the
 * snapshot, which is marked `completed`.
 *
 * @param snapshotId Snapshot to process. If it does not exist an `error`
 *   event is emitted and nothing else happens.
 * @param maxDepth Passed through to `calculateMetrics`.
 * @param context Optional event sink; without it events fall back to the console.
 * @param limitReached Stored on the snapshot as `limit_reached` (1/0).
 * @param graphInstance Already-built graph; when omitted the graph is loaded
 *   from the snapshot.
 */
export function runPostCrawlMetrics(snapshotId: number, maxDepth: number, context?: EngineContext, limitReached: boolean = false, graphInstance?: Graph) {
  const startedAt = Date.now();

  const db = getDb();
  const metricsRepo = new MetricsRepository(db);
  const snapshotRepo = new SnapshotRepository(db);
  const pageRepo = new PageRepository(db);

  // Fallback emitter: console output when no engine context is supplied.
  const emit = (event: any) => {
    if (context) {
      context.emit(event);
      return;
    }
    if (event.type === 'error') {
      console.error(event.message);
    } else if (event.type !== 'debug') {
      // Fall back to the event type so message-less events do not print "undefined".
      console.log(event.message || event.phase || event.type);
    }
  };

  // Validate the snapshot BEFORE doing any graph work for it.
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
  if (!snapshot) {
    emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
    return;
  }

  // Announce the load before it happens, and only when a load is needed.
  const loadGraph = () => {
    emit({ type: 'metrics:start', phase: 'Loading graph' });
    return loadGraphFromSnapshot(snapshotId);
  };
  const graph = graphInstance || loadGraph();

  emit({ type: 'metrics:start', phase: 'Computing PageRank' });
  computePageRank(graph);

  emit({ type: 'metrics:start', phase: 'Computing HITS' });
  computeHITS(graph);

  emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
  const nodes = graph.getNodes();

  // Pre-fetch all page IDs to avoid N+1 queries. The identity-only query
  // avoids loading full page content (HTML) into memory again.
  const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
  const urlToId = new Map<string, number>();
  for (const p of pages) {
    urlToId.set(p.normalized_url, p.id);
  }

  const clusterStmt = db.prepare(`
    INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
    VALUES (?, ?, ?, ?, ?, ?)
  `);

  const contentStmt = db.prepare(`
    INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
    VALUES (?, ?, ?, ?, ?, ?)
  `);

  const tx = db.transaction(() => {
    for (const node of nodes) {
      const pageId = urlToId.get(node.url);
      // Nodes without a stored page row cannot receive metrics.
      if (!pageId) continue;

      metricsRepo.insertMetrics({
        snapshot_id: snapshotId,
        page_id: pageId,
        authority_score: node.authorityScore ?? null,
        hub_score: node.hubScore ?? null,
        pagerank: node.pageRank ?? null,
        pagerank_score: node.pageRankScore ?? null,
        link_role: node.linkRole ?? null,
        crawl_status: node.crawlStatus ?? null,
        word_count: node.wordCount ?? null,
        thin_content_score: node.thinContentScore ?? null,
        external_link_ratio: node.externalLinkRatio ?? null,
        orphan_score: node.orphanScore ?? null,
        duplicate_cluster_id: node.duplicateClusterId ?? null,
        duplicate_type: node.duplicateType ?? null,
        is_cluster_primary: node.isClusterPrimary ? 1 : 0
      });

      // Update page-level crawl trap / redirect / size data when any is present.
      if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
        pageRepo.upsertPage({
          site_id: snapshot.site_id,
          normalized_url: node.url,
          last_seen_snapshot_id: snapshotId,
          redirect_chain: node.redirectChain ? JSON.stringify(node.redirectChain) : null,
          bytes_received: node.bytesReceived ?? null,
          crawl_trap_flag: node.crawlTrapFlag ? 1 : 0,
          crawl_trap_risk: node.crawlTrapRisk ?? null,
          trap_type: node.trapType ?? null,
        });
      }
    }

    // Save duplicate clusters
    for (const cluster of graph.duplicateClusters) {
      clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
    }

    // Save content clusters
    for (const cluster of graph.contentClusters) {
      contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
    }
  });
  tx();

  emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
  const metrics = calculateMetrics(graph, maxDepth);

  // Penalty-based health score derived from the collected crawl issues.
  const issues = collectCrawlIssues(graph, metrics);
  const health = calculateHealthScore(metrics.totalPages, issues);

  snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
    node_count: metrics.totalPages,
    edge_count: metrics.totalEdges,
    health_score: health.score,
    orphan_count: issues.orphanPages,
    thin_content_count: issues.thinContent,
    limit_reached: limitReached ? 1 : 0
  });

  // Report the real elapsed time instead of a hard-coded 0.
  emit({ type: 'metrics:complete', durationMs: Date.now() - startedAt });
}
@@ -1,108 +0,0 @@
1
/** Options controlling how `normalizeUrl` treats the query string. */
export interface NormalizeOptions {
  /** When true, the whole query string is dropped instead of being filtered. */
  stripQuery?: boolean;
}

/** Tracking parameters removed from query strings (compared in lower case). */
const TRACKING_PARAMS = new Set([
  'utm_source',
  'utm_medium',
  'utm_campaign',
  'utm_term',
  'utm_content',
  'fbclid',
  'gclid',
  'msclkid'
]);

/** Path extensions of non-HTML assets that are not worth crawling. */
const SKIP_EXTENSIONS = new Set([
  '.pdf', '.jpg', '.jpeg', '.png', '.svg', '.webp', '.gif',
  '.zip', '.xml', '.json', '.mp4'
]);

/**
 * Normalizes a URL so that equivalent URLs compare equal as strings.
 *
 * Rules applied, in order: resolve against `base` (when non-empty), accept
 * only http/https, lowercase the host, drop default ports and the fragment,
 * drop tracking parameters (or the whole query with `stripQuery`) and sort
 * the remaining ones, collapse repeated slashes, remove a trailing slash
 * except on the root path, and reject URLs whose path ends in a known
 * non-HTML asset extension.
 *
 * @param input URL to normalize; may be relative when `base` is given.
 * @param base Base URL used to resolve `input`; pass `''` for absolute input.
 * @param options See {@link NormalizeOptions}.
 * @returns The normalized URL, or `null` when the input cannot be parsed,
 *   is not http(s), or points at a skipped asset type.
 */
export function normalizeUrl(input: string, base: string, options: NormalizeOptions = {}): string | null {
  try {
    // 1. Resolve to an absolute URL
    const u = base ? new URL(input, base) : new URL(input);

    // 2. Allow only http/https
    if (u.protocol !== 'http:' && u.protocol !== 'https:') {
      return null;
    }

    // 3. Lowercase hostname
    u.hostname = u.hostname.toLowerCase();

    // 4. Remove default ports
    if ((u.protocol === 'http:' && u.port === '80') || (u.protocol === 'https:' && u.port === '443')) {
      u.port = '';
    }

    // 5. Remove hash fragment
    u.hash = '';

    // 6. Query handling
    if (options.stripQuery) {
      u.search = '';
    } else {
      const kept = new URLSearchParams();
      new URLSearchParams(u.search).forEach((value, key) => {
        // Compare case-insensitively so variants such as `UTM_Source` or
        // `GCLID` do not survive and create duplicate URLs.
        const lowered = key.toLowerCase();
        if (lowered.startsWith('utm_') || TRACKING_PARAMS.has(lowered)) {
          return;
        }
        kept.append(key, value);
      });

      // Sort for a stable, order-independent representation.
      kept.sort();
      u.search = kept.toString();
    }

    // 7. Collapse duplicate slashes, then drop a trailing slash unless root
    let pathname = u.pathname.replace(/\/+/g, '/');
    if (pathname.length > 1 && pathname.endsWith('/')) {
      pathname = pathname.slice(0, -1);
    }
    u.pathname = pathname;

    // 8. Skip non-HTML assets by extension
    const lastDotIndex = u.pathname.lastIndexOf('.');
    if (lastDotIndex !== -1) {
      const ext = u.pathname.slice(lastDotIndex).toLowerCase();
      if (SKIP_EXTENSIONS.has(ext)) {
        return null;
      }
    }

    // 9. Final string
    return u.toString();
  } catch (_e) {
    // Unparsable input (or base) is reported as "no URL".
    return null;
  }
}
@@ -1,190 +0,0 @@
1
- import * as cheerio from 'cheerio';
2
- import crypto from 'node:crypto';
3
- import { normalizeUrl } from './normalize.js';
4
- import { SimHash } from '../graph/simhash.js';
5
-
6
/** An outbound link extracted from a page, with its placement weight. */
export interface ParseLink {
  /** Absolute http(s) URL of the link target, without fragment. */
  url: string;
  /** Placement weight assigned by the parser; higher means more prominent. */
  weight: number;
}
10
-
11
/** Everything the parser extracts from one HTML document. */
export interface ParseResult {
  /** Followable outbound links, one entry per distinct URL. */
  links: ParseLink[];
  /** The raw HTML that was parsed. */
  html: string;
  /** Normalized canonical URL, or null when absent or invalid. */
  canonical: string | null;
  /** True when the robots meta tag contains `noindex` or `none`. */
  noindex: boolean;
  /** True when the robots meta tag contains `nofollow` or `none`. */
  nofollow: boolean;
  /** SHA-256 hex digest of the whitespace-collapsed body text. */
  contentHash: string;
  /** SimHash of the body tokens, as a string. */
  simhash?: string;
  /** Distinct tokens divided by total tokens; 0 when there are no tokens. */
  uniqueTokenRatio?: number;
  /** Soft-404 likelihood, capped at 1.0. */
  soft404Score: number;
  /** Names of the heuristics that contributed to `soft404Score`. */
  soft404Signals: string[];
}
23
-
24
/**
 * Extracts crawl-relevant data from an HTML document: robots directives,
 * canonical URL, weighted outbound links, content fingerprints and a
 * soft-404 score.
 */
export class Parser {
  /**
   * Parses HTML content to extract metadata and links.
   *
   * @param html Raw HTML of the page.
   * @param baseUrl URL of the page, used to resolve relative links.
   * @param status HTTP status of the response; soft-404 scoring runs only for 200.
   * @returns The extracted metadata; `html` is passed through unchanged.
   */
  parse(html: string, baseUrl: string, status: number): ParseResult {
    const $ = cheerio.load(html);

    // 1. Robots meta
    let noindex = false;
    let nofollow = false;
    const robotsMeta = $('meta[name="robots"]').attr('content');
    if (robotsMeta) {
      const directives = robotsMeta.toLowerCase().split(',').map(s => s.trim());
      const none = directives.includes('none');
      if (none || directives.includes('noindex')) noindex = true;
      if (none || directives.includes('nofollow')) nofollow = true;
    }

    // 2. Canonical
    let canonical: string | null = null;
    const canonicalLink = $('link[rel="canonical"]').attr('href');
    if (canonicalLink) {
      try {
        // Resolve relative canonicals, then normalize minimally. The query is
        // kept because it may be a meaningful part of the canonical URL.
        const u = new URL(canonicalLink, baseUrl);
        canonical = normalizeUrl(u.toString(), '', { stripQuery: false });
      } catch (_e) {
        // Invalid canonical URL, ignore
      }
    }

    // 3. Links (skipped entirely when the page-level nofollow is set)
    const links = new Map<string, number>();
    if (!nofollow) {
      $('a').each((_, element) => {
        const $el = $(element);
        const href = $el.attr('href');
        const rel = $el.attr('rel');
        const isNofollow = rel && rel.toLowerCase().includes('nofollow');
        if (!href || isNofollow) return;

        try {
          const absoluteUrl = new URL(href, baseUrl);
          if (absoluteUrl.protocol !== 'http:' && absoluteUrl.protocol !== 'https:') return;

          absoluteUrl.hash = '';
          const urlStr = absoluteUrl.toString();

          // Weight by placement: body 1.0, nav/header 0.7, footer 0.4.
          let weight = 1.0;
          if ($el.closest('nav').length > 0 || $el.closest('header').length > 0) {
            weight = 0.7;
          } else if ($el.closest('footer').length > 0) {
            weight = 0.4;
          } else {
            // Secondary check: class/id hints on the parent and grandparent.
            const parent = $el.parent();
            const grandParent = parent.parent();
            const parentText = (parent.attr('class') || '') + (parent.attr('id') || '');
            const grandParentText = (grandParent.attr('class') || '') + (grandParent.attr('id') || '');
            const combinedContext = (parentText + grandParentText).toLowerCase();

            if (combinedContext.includes('nav') || combinedContext.includes('menu')) {
              weight = 0.7;
            } else if (combinedContext.includes('footer')) {
              weight = 0.4;
            }
          }

          // Keep the highest weight when several anchors share a URL.
          const currentMax = links.get(urlStr) || 0;
          if (weight > currentMax) {
            links.set(urlStr, weight);
          }
        } catch (_e) {
          // Invalid URL
        }
      });
    }

    // 4. Content hash over visible text (script/style/noscript/iframe removed)
    $('script').remove();
    $('style').remove();
    $('noscript').remove();
    $('iframe').remove();

    const cleanText = $('body').text().replace(/\s+/g, ' ').trim();
    const contentHash = crypto.createHash('sha256').update(cleanText).digest('hex');

    // 4b. SimHash & token ratio (limited to 50k chars for performance).
    // Split on anything that is not a Unicode letter, mark, digit or
    // underscore. An ASCII-only `\W` split discards every non-Latin word,
    // which leaves such pages with no tokens at all.
    const limitedText = cleanText.substring(0, 50000).toLowerCase();
    const tokens = limitedText.split(/[^\p{L}\p{M}\p{N}_]+/u).filter(t => t.length > 0);
    const uniqueTokens = new Set(tokens);
    const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
    const simhash = SimHash.generate(tokens).toString();

    // 5. Soft-404 detection (only meaningful for responses claiming success)
    let soft404Score = 0;
    let soft404Signals: string[] = [];
    if (status === 200) {
      const verdict = this.scoreSoft404(
        $('title').text().toLowerCase(),
        $('h1').first().text().toLowerCase(),
        cleanText,
        links.size,
        nofollow
      );
      soft404Score = verdict.score;
      soft404Signals = verdict.signals;
    }

    return {
      links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
      html: html, // pass raw HTML for analysis
      canonical,
      noindex,
      nofollow,
      contentHash,
      simhash,
      uniqueTokenRatio,
      soft404Score,
      soft404Signals
    };
  }

  /**
   * Scores how much a 200 response looks like an error page.
   *
   * @param title Lower-cased document title.
   * @param h1Text Lower-cased text of the first h1.
   * @param cleanText Whitespace-collapsed body text.
   * @param linkCount Number of extracted outbound links.
   * @param linksSuppressed True when link extraction was skipped by a page-level
   *   nofollow, in which case a zero `linkCount` says nothing about the page.
   * @returns The score (capped at 1.0) and the names of the signals that fired.
   */
  private scoreSoft404(
    title: string,
    h1Text: string,
    cleanText: string,
    linkCount: number,
    linksSuppressed: boolean
  ): { score: number; signals: string[] } {
    const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];
    const signals: string[] = [];
    let score = 0;

    // Only the first matching pattern counts, for title and h1 alike.
    const titleHit = errorPatterns.find(pattern => title.includes(pattern));
    if (titleHit !== undefined) {
      score += 0.4;
      signals.push(`title_pattern_${titleHit.replace(/\s+/g, '_')}`);
    }

    const h1Hit = errorPatterns.find(pattern => h1Text.includes(pattern));
    if (h1Hit !== undefined) {
      score += 0.3;
      signals.push(`h1_pattern_${h1Hit.replace(/\s+/g, '_')}`);
    }

    const bodyText = cleanText.toLowerCase();
    if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
      score += 0.2;
      signals.push('body_error_phrase');
    }

    // Content length check (word count approximation)
    const wordCount = cleanText.split(/\s+/).filter(w => w.length > 0).length;
    if (wordCount < 50) {
      score += 0.3;
      signals.push('very_low_word_count');
    } else if (wordCount < 150) {
      score += 0.1;
      signals.push('low_word_count');
    }

    // Link count check. Skipped when links were never extracted, otherwise
    // every nofollow page would be penalised for having "no links".
    if (!linksSuppressed && linkCount === 0) {
      score += 0.2;
      signals.push('no_outbound_links');
    }

    return { score: Math.min(1.0, score), signals };
  }
}