@crawlith/core 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analysis_list.html +35 -0
  4. package/dist/analysis/analysis_page.html +123 -0
  5. package/dist/analysis/analyze.d.ts +40 -5
  6. package/dist/analysis/analyze.js +395 -347
  7. package/dist/analysis/clustering.d.ts +23 -0
  8. package/dist/analysis/clustering.js +206 -0
  9. package/dist/analysis/content.d.ts +1 -1
  10. package/dist/analysis/content.js +11 -5
  11. package/dist/analysis/duplicate.d.ts +34 -0
  12. package/dist/analysis/duplicate.js +305 -0
  13. package/dist/analysis/heading.d.ts +116 -0
  14. package/dist/analysis/heading.js +356 -0
  15. package/dist/analysis/images.d.ts +1 -1
  16. package/dist/analysis/images.js +6 -5
  17. package/dist/analysis/links.d.ts +1 -1
  18. package/dist/analysis/links.js +8 -8
  19. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  20. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  21. package/dist/analysis/scoring.js +11 -2
  22. package/dist/analysis/seo.d.ts +8 -4
  23. package/dist/analysis/seo.js +41 -30
  24. package/dist/analysis/soft404.d.ts +17 -0
  25. package/dist/analysis/soft404.js +62 -0
  26. package/dist/analysis/structuredData.d.ts +1 -1
  27. package/dist/analysis/structuredData.js +5 -4
  28. package/dist/analysis/templates.d.ts +2 -0
  29. package/dist/analysis/templates.js +7 -0
  30. package/dist/application/index.d.ts +2 -0
  31. package/dist/application/index.js +2 -0
  32. package/dist/application/usecase.d.ts +3 -0
  33. package/dist/application/usecase.js +1 -0
  34. package/dist/application/usecases.d.ts +114 -0
  35. package/dist/application/usecases.js +201 -0
  36. package/dist/audit/index.js +1 -1
  37. package/dist/audit/transport.d.ts +1 -1
  38. package/dist/audit/transport.js +5 -4
  39. package/dist/audit/types.d.ts +1 -0
  40. package/dist/constants.d.ts +17 -0
  41. package/dist/constants.js +23 -0
  42. package/dist/core/scope/scopeManager.js +3 -0
  43. package/dist/core/security/ipGuard.d.ts +11 -0
  44. package/dist/core/security/ipGuard.js +71 -3
  45. package/dist/crawler/crawl.d.ts +4 -22
  46. package/dist/crawler/crawl.js +4 -335
  47. package/dist/crawler/crawler.d.ts +87 -0
  48. package/dist/crawler/crawler.js +683 -0
  49. package/dist/crawler/extract.d.ts +4 -1
  50. package/dist/crawler/extract.js +7 -2
  51. package/dist/crawler/fetcher.d.ts +2 -1
  52. package/dist/crawler/fetcher.js +26 -11
  53. package/dist/crawler/metricsRunner.d.ts +23 -1
  54. package/dist/crawler/metricsRunner.js +202 -72
  55. package/dist/crawler/normalize.d.ts +41 -0
  56. package/dist/crawler/normalize.js +119 -3
  57. package/dist/crawler/parser.d.ts +1 -3
  58. package/dist/crawler/parser.js +2 -49
  59. package/dist/crawler/resolver.d.ts +11 -0
  60. package/dist/crawler/resolver.js +67 -0
  61. package/dist/crawler/sitemap.d.ts +6 -0
  62. package/dist/crawler/sitemap.js +27 -17
  63. package/dist/crawler/trap.d.ts +5 -1
  64. package/dist/crawler/trap.js +23 -2
  65. package/dist/db/CrawlithDB.d.ts +110 -0
  66. package/dist/db/CrawlithDB.js +500 -0
  67. package/dist/db/graphLoader.js +42 -30
  68. package/dist/db/index.d.ts +11 -0
  69. package/dist/db/index.js +41 -29
  70. package/dist/db/migrations.d.ts +2 -0
  71. package/dist/db/{schema.js → migrations.js} +90 -43
  72. package/dist/db/pluginRegistry.d.ts +9 -0
  73. package/dist/db/pluginRegistry.js +19 -0
  74. package/dist/db/repositories/EdgeRepository.d.ts +13 -0
  75. package/dist/db/repositories/EdgeRepository.js +20 -0
  76. package/dist/db/repositories/MetricsRepository.d.ts +16 -8
  77. package/dist/db/repositories/MetricsRepository.js +28 -7
  78. package/dist/db/repositories/PageRepository.d.ts +15 -2
  79. package/dist/db/repositories/PageRepository.js +169 -25
  80. package/dist/db/repositories/SiteRepository.d.ts +9 -0
  81. package/dist/db/repositories/SiteRepository.js +13 -0
  82. package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
  83. package/dist/db/repositories/SnapshotRepository.js +64 -5
  84. package/dist/db/reset.d.ts +9 -0
  85. package/dist/db/reset.js +32 -0
  86. package/dist/db/statements.d.ts +12 -0
  87. package/dist/db/statements.js +40 -0
  88. package/dist/diff/compare.d.ts +0 -5
  89. package/dist/diff/compare.js +0 -12
  90. package/dist/diff/service.d.ts +16 -0
  91. package/dist/diff/service.js +41 -0
  92. package/dist/domain/index.d.ts +4 -0
  93. package/dist/domain/index.js +4 -0
  94. package/dist/events.d.ts +56 -0
  95. package/dist/events.js +1 -0
  96. package/dist/graph/graph.d.ts +36 -42
  97. package/dist/graph/graph.js +26 -17
  98. package/dist/graph/hits.d.ts +23 -0
  99. package/dist/graph/hits.js +111 -0
  100. package/dist/graph/metrics.d.ts +0 -4
  101. package/dist/graph/metrics.js +25 -9
  102. package/dist/graph/pagerank.d.ts +17 -4
  103. package/dist/graph/pagerank.js +126 -91
  104. package/dist/graph/simhash.d.ts +6 -0
  105. package/dist/graph/simhash.js +14 -0
  106. package/dist/index.d.ts +29 -8
  107. package/dist/index.js +29 -8
  108. package/dist/lock/hashKey.js +1 -1
  109. package/dist/lock/lockManager.d.ts +5 -1
  110. package/dist/lock/lockManager.js +38 -13
  111. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  112. package/dist/plugin-system/plugin-cli.js +31 -0
  113. package/dist/plugin-system/plugin-config.d.ts +16 -0
  114. package/dist/plugin-system/plugin-config.js +36 -0
  115. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  116. package/dist/plugin-system/plugin-loader.js +122 -0
  117. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  118. package/dist/plugin-system/plugin-registry.js +167 -0
  119. package/dist/plugin-system/plugin-types.d.ts +205 -0
  120. package/dist/plugin-system/plugin-types.js +1 -0
  121. package/dist/ports/index.d.ts +9 -0
  122. package/dist/ports/index.js +1 -0
  123. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  124. package/dist/report/crawlExport.d.ts +3 -0
  125. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  126. package/dist/report/crawl_template.d.ts +1 -0
  127. package/dist/report/crawl_template.js +7 -0
  128. package/dist/report/export.d.ts +3 -0
  129. package/dist/report/export.js +81 -0
  130. package/dist/report/html.js +15 -216
  131. package/dist/report/insight.d.ts +27 -0
  132. package/dist/report/insight.js +103 -0
  133. package/dist/scoring/health.d.ts +56 -0
  134. package/dist/scoring/health.js +213 -0
  135. package/dist/utils/chalk.d.ts +6 -0
  136. package/dist/utils/chalk.js +41 -0
  137. package/dist/utils/secureConfig.d.ts +23 -0
  138. package/dist/utils/secureConfig.js +128 -0
  139. package/package.json +12 -6
  140. package/CHANGELOG.md +0 -7
  141. package/dist/db/schema.d.ts +0 -2
  142. package/dist/graph/cluster.d.ts +0 -6
  143. package/dist/graph/cluster.js +0 -173
  144. package/dist/graph/duplicate.d.ts +0 -10
  145. package/dist/graph/duplicate.js +0 -251
  146. package/dist/report/sitegraphExport.d.ts +0 -3
  147. package/dist/report/sitegraph_template.d.ts +0 -1
  148. package/dist/report/sitegraph_template.js +0 -630
  149. package/dist/scoring/hits.d.ts +0 -9
  150. package/dist/scoring/hits.js +0 -111
  151. package/src/analysis/analyze.ts +0 -548
  152. package/src/analysis/content.ts +0 -62
  153. package/src/analysis/images.ts +0 -28
  154. package/src/analysis/links.ts +0 -41
  155. package/src/analysis/scoring.ts +0 -59
  156. package/src/analysis/seo.ts +0 -82
  157. package/src/analysis/structuredData.ts +0 -62
  158. package/src/audit/dns.ts +0 -49
  159. package/src/audit/headers.ts +0 -98
  160. package/src/audit/index.ts +0 -66
  161. package/src/audit/scoring.ts +0 -232
  162. package/src/audit/transport.ts +0 -258
  163. package/src/audit/types.ts +0 -102
  164. package/src/core/network/proxyAdapter.ts +0 -21
  165. package/src/core/network/rateLimiter.ts +0 -39
  166. package/src/core/network/redirectController.ts +0 -47
  167. package/src/core/network/responseLimiter.ts +0 -34
  168. package/src/core/network/retryPolicy.ts +0 -57
  169. package/src/core/scope/domainFilter.ts +0 -45
  170. package/src/core/scope/scopeManager.ts +0 -52
  171. package/src/core/scope/subdomainPolicy.ts +0 -39
  172. package/src/core/security/ipGuard.ts +0 -92
  173. package/src/crawler/crawl.ts +0 -382
  174. package/src/crawler/extract.ts +0 -34
  175. package/src/crawler/fetcher.ts +0 -233
  176. package/src/crawler/metricsRunner.ts +0 -124
  177. package/src/crawler/normalize.ts +0 -108
  178. package/src/crawler/parser.ts +0 -190
  179. package/src/crawler/sitemap.ts +0 -73
  180. package/src/crawler/trap.ts +0 -96
  181. package/src/db/graphLoader.ts +0 -105
  182. package/src/db/index.ts +0 -70
  183. package/src/db/repositories/EdgeRepository.ts +0 -29
  184. package/src/db/repositories/MetricsRepository.ts +0 -49
  185. package/src/db/repositories/PageRepository.ts +0 -128
  186. package/src/db/repositories/SiteRepository.ts +0 -32
  187. package/src/db/repositories/SnapshotRepository.ts +0 -74
  188. package/src/db/schema.ts +0 -177
  189. package/src/diff/compare.ts +0 -84
  190. package/src/graph/cluster.ts +0 -192
  191. package/src/graph/duplicate.ts +0 -286
  192. package/src/graph/graph.ts +0 -172
  193. package/src/graph/metrics.ts +0 -110
  194. package/src/graph/pagerank.ts +0 -125
  195. package/src/graph/simhash.ts +0 -61
  196. package/src/index.ts +0 -30
  197. package/src/lock/hashKey.ts +0 -51
  198. package/src/lock/lockManager.ts +0 -124
  199. package/src/lock/pidCheck.ts +0 -13
  200. package/src/report/html.ts +0 -227
  201. package/src/report/sitegraphExport.ts +0 -58
  202. package/src/scoring/hits.ts +0 -131
  203. package/src/scoring/orphanSeverity.ts +0 -176
  204. package/src/utils/version.ts +0 -18
  205. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  206. package/tests/analysis.unit.test.ts +0 -98
  207. package/tests/analyze.integration.test.ts +0 -98
  208. package/tests/audit/dns.test.ts +0 -31
  209. package/tests/audit/headers.test.ts +0 -45
  210. package/tests/audit/scoring.test.ts +0 -133
  211. package/tests/audit/security.test.ts +0 -12
  212. package/tests/audit/transport.test.ts +0 -112
  213. package/tests/clustering.test.ts +0 -118
  214. package/tests/crawler.test.ts +0 -358
  215. package/tests/db.test.ts +0 -159
  216. package/tests/diff.test.ts +0 -67
  217. package/tests/duplicate.test.ts +0 -110
  218. package/tests/fetcher.test.ts +0 -106
  219. package/tests/fetcher_safety.test.ts +0 -85
  220. package/tests/fixtures/analyze-crawl.json +0 -26
  221. package/tests/hits.test.ts +0 -134
  222. package/tests/html_report.test.ts +0 -58
  223. package/tests/lock/lockManager.test.ts +0 -138
  224. package/tests/metrics.test.ts +0 -196
  225. package/tests/normalize.test.ts +0 -101
  226. package/tests/orphanSeverity.test.ts +0 -160
  227. package/tests/pagerank.test.ts +0 -98
  228. package/tests/parser.test.ts +0 -117
  229. package/tests/proxy_safety.test.ts +0 -57
  230. package/tests/redirect_safety.test.ts +0 -73
  231. package/tests/safety.test.ts +0 -114
  232. package/tests/scope.test.ts +0 -66
  233. package/tests/scoring.test.ts +0 -59
  234. package/tests/sitemap.test.ts +0 -88
  235. package/tests/soft404.test.ts +0 -41
  236. package/tests/trap.test.ts +0 -39
  237. package/tests/visualization_data.test.ts +0 -46
  238. package/tsconfig.json +0 -11
@@ -2,8 +2,11 @@ import * as cheerio from 'cheerio';
2
2
  /**
3
3
  * Extracts all links from an HTML document.
4
4
  * Returns absolute URLs.
5
+ * @param html The HTML content string
6
+ * @param baseUrl The base URL to resolve relative links against
7
+ * @param onError Optional callback for handling extraction errors
5
8
  */
6
- export function extractLinks(html, baseUrl) {
9
+ export function extractLinks(html, baseUrl, onError) {
7
10
  try {
8
11
  const $ = cheerio.load(html);
9
12
  const links = new Set();
@@ -27,7 +30,9 @@ export function extractLinks(html, baseUrl) {
27
30
  return Array.from(links);
28
31
  }
29
32
  catch (e) {
30
- console.error(`Error extracting links from ${baseUrl}:`, e);
33
+ if (onError) {
34
+ onError(e);
35
+ }
31
36
  return [];
32
37
  }
33
38
  }
@@ -23,9 +23,10 @@ export interface FetchOptions {
23
23
  crawlDelay?: number;
24
24
  }
25
25
  export declare class Fetcher {
26
- private userAgent;
26
+ userAgent: string;
27
27
  private rateLimiter;
28
28
  private proxyAdapter;
29
+ private secureDispatcher;
29
30
  private scopeManager?;
30
31
  private maxRedirects;
31
32
  constructor(options?: {
@@ -1,26 +1,34 @@
1
1
  import { request } from 'undici';
2
+ import * as net from 'net';
2
3
  import { IPGuard } from '../core/security/ipGuard.js';
3
4
  import { RateLimiter } from '../core/network/rateLimiter.js';
4
5
  import { RetryPolicy } from '../core/network/retryPolicy.js';
5
6
  import { ResponseLimiter } from '../core/network/responseLimiter.js';
6
7
  import { RedirectController } from '../core/network/redirectController.js';
7
8
  import { ProxyAdapter } from '../core/network/proxyAdapter.js';
8
- import { version } from '../utils/version.js';
9
+ import { DEFAULTS } from '../constants.js';
9
10
  export class Fetcher {
10
- userAgent = 'crawlith/1.0';
11
+ userAgent = DEFAULTS.USER_AGENT;
11
12
  rateLimiter;
12
13
  proxyAdapter;
14
+ secureDispatcher;
13
15
  scopeManager;
14
16
  maxRedirects;
15
17
  constructor(options = {}) {
16
- this.rateLimiter = new RateLimiter(options.rate || 2);
18
+ this.rateLimiter = new RateLimiter(options.rate || DEFAULTS.RATE_LIMIT);
17
19
  this.proxyAdapter = new ProxyAdapter(options.proxyUrl);
20
+ if (this.proxyAdapter.dispatcher) {
21
+ this.secureDispatcher = this.proxyAdapter.dispatcher;
22
+ }
23
+ else {
24
+ this.secureDispatcher = IPGuard.getSecureDispatcher();
25
+ }
18
26
  this.scopeManager = options.scopeManager;
19
- this.maxRedirects = Math.min(options.maxRedirects ?? 2, 11);
20
- this.userAgent = options.userAgent || `crawlith/${version}`;
27
+ this.maxRedirects = Math.min(options.maxRedirects ?? DEFAULTS.MAX_REDIRECTS, DEFAULTS.MAX_REDIRECTS_LIMIT);
28
+ this.userAgent = options.userAgent || DEFAULTS.USER_AGENT;
21
29
  }
22
30
  async fetch(url, options = {}) {
23
- const maxBytes = options.maxBytes || 2000000;
31
+ const maxBytes = options.maxBytes || DEFAULTS.MAX_BYTES;
24
32
  const redirectChain = [];
25
33
  const redirectController = new RedirectController(this.maxRedirects, url);
26
34
  let currentUrl = url;
@@ -28,10 +36,14 @@ export class Fetcher {
28
36
  // Use a while(true) and explicit return/continue to handle redirects
29
37
  while (true) {
30
38
  const urlObj = new URL(currentUrl);
31
- // 1. SSRF Guard
32
- const isSafe = await IPGuard.validateHost(urlObj.hostname);
33
- if (!isSafe) {
34
- return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
39
+ // 1. SSRF Guard (IP Literals only)
40
+ // We only check explicit IP literals here to fail fast.
41
+ // For domains, we rely on the secureDispatcher (which uses IPGuard.secureLookup)
42
+ // to resolve and validate the IP at connection time, preventing TOCTOU attacks.
43
+ if (net.isIP(urlObj.hostname)) {
44
+ if (IPGuard.isInternal(urlObj.hostname)) {
45
+ return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
46
+ }
35
47
  }
36
48
  // 2. Scope Validation (Domain & Subdomain)
37
49
  if (this.scopeManager) {
@@ -61,7 +73,7 @@ export class Fetcher {
61
73
  method: 'GET',
62
74
  headers,
63
75
  maxRedirections: 0,
64
- dispatcher: this.proxyAdapter.dispatcher,
76
+ dispatcher: this.secureDispatcher,
65
77
  headersTimeout: 10000,
66
78
  bodyTimeout: 10000
67
79
  });
@@ -141,6 +153,9 @@ export class Fetcher {
141
153
  catch (error) {
142
154
  // Map common network errors to specific statuses if needed
143
155
  const isProxyError = error.message?.toLowerCase().includes('proxy') || error.code === 'ECONNREFUSED';
156
+ if (error.code === 'EBLOCKED' || error.message?.includes('Blocked internal IP')) {
157
+ return this.errorResult('blocked_internal_ip', currentUrl, redirectChain, totalRetries);
158
+ }
144
159
  const finalStatus = isProxyError ? 'proxy_connection_failed' : 'network_error';
145
160
  return this.errorResult(totalRetries >= RetryPolicy.DEFAULT_CONFIG.maxRetries ? 'failed_after_retries' : finalStatus, currentUrl, redirectChain, totalRetries);
146
161
  }
@@ -1 +1,23 @@
1
- export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, limitReached?: boolean): void;
1
+ import { EngineContext } from '../events.js';
2
+ import { Graph } from '../graph/graph.js';
3
+ export interface PostCrawlOptions {
4
+ context?: EngineContext;
5
+ limitReached?: boolean;
6
+ graphInstance?: Graph;
7
+ clustering?: boolean;
8
+ clusterThreshold?: number;
9
+ minClusterSize?: number;
10
+ health?: boolean;
11
+ computePagerank?: boolean;
12
+ computeHits?: boolean;
13
+ heading?: boolean;
14
+ orphans?: boolean;
15
+ orphanSeverity?: 'low' | 'medium' | 'high' | boolean;
16
+ includeSoftOrphans?: boolean;
17
+ minInbound?: number;
18
+ rootOrigin?: string;
19
+ }
20
+ export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, options?: PostCrawlOptions): {
21
+ metrics: any;
22
+ healthData?: any;
23
+ } | undefined;
@@ -3,51 +3,189 @@ import { loadGraphFromSnapshot } from '../db/graphLoader.js';
3
3
  import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
4
4
  import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
5
5
  import { PageRepository } from '../db/repositories/PageRepository.js';
6
- import { computePageRank } from '../graph/pagerank.js';
7
6
  import { calculateMetrics } from '../graph/metrics.js';
8
- import { computeHITS } from '../scoring/hits.js';
9
- export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false) {
7
+ import { PageRankService } from '../graph/pagerank.js';
8
+ import { HITSService } from '../graph/hits.js';
9
+ import { TrapDetector } from './trap.js';
10
+ import { ClusteringService } from '../analysis/clustering.js';
11
+ import { DuplicateService } from '../analysis/duplicate.js';
12
+ import { annotateOrphans } from '../analysis/orphan.js';
13
+ import { Soft404Service } from '../analysis/soft404.js';
14
+ import { HeadingHealthService } from '../analysis/heading.js';
15
+ import { HealthService } from '../scoring/health.js';
16
+ import { analyzeContent } from '../analysis/content.js';
17
+ import { load } from 'cheerio';
18
+ export function runPostCrawlMetrics(snapshotId, maxDepth, options = {}) {
19
+ const context = options.context;
20
+ const limitReached = options.limitReached || false;
21
+ const graphInstance = options.graphInstance;
10
22
  const db = getDb();
11
23
  const metricsRepo = new MetricsRepository(db);
12
24
  const snapshotRepo = new SnapshotRepository(db);
13
25
  const pageRepo = new PageRepository(db);
26
+ const graph = graphInstance || loadGraphFromSnapshot(snapshotId);
27
+ // Fallback emitter
28
+ const emit = (event) => {
29
+ if (context) {
30
+ context.emit(event);
31
+ }
32
+ else {
33
+ if (event.type === 'error')
34
+ console.error(event.message);
35
+ else if (event.type !== 'debug') {
36
+ const out = event.message || event.phase;
37
+ if (out)
38
+ console.log(out);
39
+ }
40
+ }
41
+ };
14
42
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
15
43
  if (!snapshot) {
16
- console.error(`Snapshot ${snapshotId} not found`);
17
- return;
44
+ emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
45
+ return undefined;
18
46
  }
19
- console.log('Loading graph for metrics calculation...');
20
- const graph = loadGraphFromSnapshot(snapshotId);
21
- console.log('Computing PageRank...');
22
- computePageRank(graph);
23
- console.log('Computing HITS...');
24
- computeHITS(graph);
25
- console.log('Updating metrics in DB...');
26
- const nodes = graph.getNodes();
27
- const tx = db.transaction(() => {
28
- for (const node of nodes) {
29
- const pageId = pageRepo.getIdByUrl(snapshot.site_id, node.url);
30
- if (!pageId)
31
- continue;
32
- const existing = metricsRepo.getMetricsForPage(snapshotId, pageId);
33
- metricsRepo.insertMetrics({
34
- snapshot_id: snapshotId,
35
- page_id: pageId,
36
- authority_score: node.authorityScore ?? null,
37
- hub_score: node.hubScore ?? null,
38
- pagerank: node.pageRank ?? null,
39
- pagerank_score: node.pageRankScore ?? null,
40
- link_role: node.linkRole ?? null,
41
- crawl_status: existing?.crawl_status ?? null,
42
- word_count: existing?.word_count ?? null,
43
- thin_content_score: existing?.thin_content_score ?? null,
44
- external_link_ratio: existing?.external_link_ratio ?? null,
45
- orphan_score: existing?.orphan_score ?? null,
46
- duplicate_cluster_id: node.duplicateClusterId ?? null,
47
- duplicate_type: node.duplicateType ?? null,
48
- is_cluster_primary: node.isClusterPrimary ? 1 : 0
47
+ if (!graphInstance) {
48
+ emit({ type: 'metrics:start', phase: 'Loading graph' });
49
+ }
50
+ emit({ type: 'metrics:start', phase: 'Running core algorithms' });
51
+ // 1. Graph Algorithms
52
+ const prResults = options.computePagerank ? new PageRankService().evaluate(graph) : new Map();
53
+ const hitsResults = options.computeHits ? new HITSService().evaluate(graph, { iterations: 20 }) : new Map();
54
+ // 2. Crawler Safety
55
+ new TrapDetector().analyze(graph);
56
+ // 3. Analysis / Intelligence
57
+ if (options.clustering) {
58
+ const contentClusters = new ClusteringService().detectContentClusters(graph, options.clusterThreshold, options.minClusterSize);
59
+ if (contentClusters.length > 0) {
60
+ const insertCluster = db.prepare(`
61
+ INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
62
+ VALUES (@id, @snapshot_id, @count, @primary_url, @risk, @shared_path_prefix)
63
+ `);
64
+ const insertContentTx = db.transaction((clusters) => {
65
+ for (const c of clusters) {
66
+ insertCluster.run({
67
+ id: c.id,
68
+ snapshot_id: snapshotId,
69
+ count: c.count,
70
+ primary_url: c.primaryUrl,
71
+ risk: c.risk,
72
+ shared_path_prefix: c.sharedPathPrefix ?? null
73
+ });
74
+ }
49
75
  });
50
- // Update page-level crawl trap data
76
+ insertContentTx(contentClusters);
77
+ }
78
+ }
79
+ const duplicateClusters = new DuplicateService().detectDuplicates(graph, { collapse: false });
80
+ if (duplicateClusters.length > 0) {
81
+ const insertCluster = db.prepare(`
82
+ INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
83
+ VALUES (@id, @snapshot_id, @type, @size, @representative, @severity)
84
+ `);
85
+ const insertDuplicateTx = db.transaction((clusters) => {
86
+ for (const c of clusters) {
87
+ insertCluster.run({
88
+ id: c.id,
89
+ snapshot_id: snapshotId,
90
+ type: c.type, // valid: 'exact' | 'near' | 'template_heavy'
91
+ size: c.size,
92
+ representative: c.representative,
93
+ severity: c.severity || 'low'
94
+ });
95
+ }
96
+ });
97
+ insertDuplicateTx(duplicateClusters);
98
+ }
99
+ let annotatedNodes = [];
100
+ if (options.orphans) {
101
+ const orphanOptions = {
102
+ enabled: true,
103
+ severityEnabled: !!options.orphanSeverity || options.orphanSeverity === undefined,
104
+ includeSoftOrphans: options.includeSoftOrphans ?? true,
105
+ minInbound: options.minInbound ?? 2
106
+ };
107
+ annotatedNodes = annotateOrphans(graph.getNodes(), graph.getEdges(), orphanOptions);
108
+ }
109
+ const soft404Service = new Soft404Service();
110
+ const headingService = new HeadingHealthService();
111
+ // Pre-calculate heading health for all nodes with HTML
112
+ let headingPayloads = new Map();
113
+ if (options.heading) {
114
+ const result = headingService.evaluateNodes(graph.getNodes());
115
+ headingPayloads = result.payloadsByUrl;
116
+ }
117
+ // Apply signals to nodes
118
+ for (const node of graph.getNodes()) {
119
+ const pr = prResults.get(node.url);
120
+ if (pr)
121
+ node.pagerankScore = pr.score;
122
+ const hits = hitsResults.get(node.url);
123
+ if (hits) {
124
+ node.authScore = hits.authority_score;
125
+ node.hubScore = hits.hub_score;
126
+ node.linkRole = hits.link_role;
127
+ }
128
+ if (options.orphans) {
129
+ const annotated = annotatedNodes.find((n) => n.url === node.url);
130
+ if (annotated) {
131
+ node.orphanScore = annotated.orphanSeverity;
132
+ node.orphanType = annotated.orphanType;
133
+ node.impactLevel = annotated.impactLevel;
134
+ }
135
+ }
136
+ if (options.heading) {
137
+ const heading = headingPayloads.get(node.url);
138
+ if (heading) {
139
+ node.headingScore = heading.score;
140
+ node.headingData = JSON.stringify(heading);
141
+ }
142
+ }
143
+ if (node.html) {
144
+ const soft404 = soft404Service.analyze(node.html, node.outLinks);
145
+ node.soft404Score = soft404.score;
146
+ const $ = load(node.html);
147
+ const content = analyzeContent($);
148
+ node.wordCount = content.wordCount;
149
+ }
150
+ }
151
+ emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
152
+ // Pre-fetch all page IDs to avoid N+1 queries
153
+ const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
154
+ const urlToId = new Map();
155
+ for (const p of pagesIdentity) {
156
+ urlToId.set(p.normalized_url, p.id);
157
+ }
158
+ const metricsToSave = graph.getNodes().map(node => {
159
+ const pageId = urlToId.get(node.url);
160
+ if (!pageId)
161
+ return null;
162
+ return {
163
+ snapshot_id: snapshotId,
164
+ page_id: pageId,
165
+ crawl_status: node.crawlStatus ?? null,
166
+ word_count: node.wordCount ?? null,
167
+ thin_content_score: node.thinContentScore ?? null,
168
+ external_link_ratio: node.externalLinkRatio ?? null,
169
+ pagerank_score: node.pagerankScore ?? null,
170
+ hub_score: node.hubScore ?? null,
171
+ auth_score: node.authScore ?? null,
172
+ link_role: node.linkRole ?? null,
173
+ duplicate_cluster_id: node.duplicateClusterId ?? null,
174
+ duplicate_type: node.duplicateType ?? null,
175
+ cluster_id: node.clusterId ?? null,
176
+ soft404_score: node.soft404Score ?? null,
177
+ heading_score: node.headingScore ?? null,
178
+ orphan_score: node.orphanScore ?? null,
179
+ orphan_type: node.orphanType ?? null,
180
+ impact_level: node.impactLevel ?? null,
181
+ heading_data: node.headingData ?? null,
182
+ is_cluster_primary: node.isClusterPrimary ? 1 : 0
183
+ };
184
+ }).filter(m => m !== null);
185
+ metricsRepo.insertMany(metricsToSave);
186
+ // Update page-level metadata in transaction
187
+ const tx = db.transaction(() => {
188
+ for (const node of graph.getNodes()) {
51
189
  if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
52
190
  pageRepo.upsertPage({
53
191
  site_id: snapshot.site_id,
@@ -61,48 +199,40 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, limitReached = false)
61
199
  });
62
200
  }
63
201
  }
64
- // Save duplicate clusters
65
- if (graph.duplicateClusters.length > 0) {
66
- const clusterStmt = db.prepare(`
67
- INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
68
- VALUES (?, ?, ?, ?, ?, ?)
69
- `);
70
- for (const cluster of graph.duplicateClusters) {
71
- clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
72
- }
73
- }
74
- // Save content clusters
75
- if (graph.contentClusters.length > 0) {
76
- const contentStmt = db.prepare(`
77
- INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
78
- VALUES (?, ?, ?, ?, ?, ?)
79
- `);
80
- for (const cluster of graph.contentClusters) {
81
- contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
82
- }
83
- }
84
202
  });
85
203
  tx();
86
- console.log('Computing aggregate stats...');
204
+ emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
87
205
  const metrics = calculateMetrics(graph, maxDepth);
88
- let totalScore = 0;
89
- let totalWeight = 0;
90
- for (const node of nodes) {
91
- const score = node.authorityScore || node.pageRankScore || 0;
92
- const depth = node.depth;
93
- const weight = 1 / (depth + 1);
94
- totalScore += score * weight;
95
- totalWeight += weight;
206
+ // Compute health score if enabled
207
+ let healthScore = null;
208
+ if (options.health) {
209
+ try {
210
+ const rootOrigin = options.rootOrigin ?? '';
211
+ const healthService = new HealthService();
212
+ const issues = healthService.collectCrawlIssues(graph, metrics, rootOrigin);
213
+ const breakdown = healthService.calculateHealthScore(metrics.totalPages, issues);
214
+ healthScore = breakdown.score;
215
+ }
216
+ catch (e) {
217
+ emit({ type: 'error', message: 'Error computing health score', error: e });
218
+ }
96
219
  }
97
- const healthScore = totalWeight > 0 ? (totalScore / totalWeight) * 100 : 0;
98
- const thinCountRow = db.prepare('SELECT count(*) as count FROM metrics WHERE snapshot_id = ? AND thin_content_score >= 70').get(snapshotId);
220
+ const thinContentCount = graph.getNodes().filter(n => n.wordCount !== undefined && n.wordCount < 200 && n.status === 200).length;
221
+ const orphanCount = metrics.orphanPages.length;
99
222
  snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
100
223
  node_count: metrics.totalPages,
101
224
  edge_count: metrics.totalEdges,
102
- health_score: healthScore,
103
- orphan_count: metrics.orphanPages.length,
104
- thin_content_count: thinCountRow.count,
105
- limit_reached: limitReached ? 1 : 0
225
+ limit_reached: limitReached ? 1 : 0,
226
+ thin_content_count: thinContentCount,
227
+ orphan_count: orphanCount,
228
+ ...(healthScore !== null ? { health_score: healthScore } : {})
106
229
  });
107
- console.log('Metrics calculation complete.');
230
+ emit({ type: 'metrics:complete', durationMs: 0 });
231
+ return {
232
+ metrics,
233
+ healthData: healthScore !== null ? {
234
+ health: new HealthService().calculateHealthScore(metrics.totalPages, new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')),
235
+ issues: new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')
236
+ } : undefined
237
+ };
108
238
  }
@@ -3,5 +3,46 @@
3
3
  */
4
4
  export interface NormalizeOptions {
5
5
  stripQuery?: boolean;
6
+ toPath?: boolean;
6
7
  }
7
8
  export declare function normalizeUrl(input: string, base: string, options?: NormalizeOptions): string | null;
9
+ /**
10
+ * Utility for converting between absolute URLs and relative paths
11
+ * primarily used for database storage.
12
+ */
13
+ export declare class UrlUtil {
14
+ /**
15
+ * Extract a stable domain key from a URL/domain input.
16
+ * Examples:
17
+ * - "https://www.example.com/a" -> "example.com"
18
+ * - "example.com" -> "example.com"
19
+ */
20
+ static extractDomain(input: string): string;
21
+ /**
22
+ * Resolve a site's absolute origin from persisted site fields.
23
+ */
24
+ static resolveSiteOrigin(site: {
25
+ domain: string;
26
+ preferred_url?: string | null;
27
+ ssl?: number | null;
28
+ }): string;
29
+ /**
30
+ * Converts a full URL to a root-relative path if it matches the origin.
31
+ * If it doesn't match the origin, it's considered external and kept absolute.
32
+ */
33
+ static toPath(urlStr: string, origin: string): string;
34
+ /**
35
+ * Converts a root-relative path back to an absolute URL relative to the origin.
36
+ * If the input is already an absolute URL, it is returned as-is.
37
+ */
38
+ static toAbsolute(pathOrUrl: string, origin: string): string;
39
+ /**
40
+ * Determines if a URL (or path) is internal relative to the origin.
41
+ */
42
+ static isInternal(pathOrUrl: string, origin: string): boolean;
43
+ /**
44
+ * Build normalized lookup candidates for querying pages table.
45
+ * Returns path/absolute/original variants in priority order, deduplicated.
46
+ */
47
+ static toLookupCandidates(input: string, origin: string): string[];
48
+ }
@@ -10,7 +10,7 @@ const TRACKING_PARAMS = new Set([
10
10
  ]);
11
11
  const SKIP_EXTENSIONS = new Set([
12
12
  '.pdf', '.jpg', '.png', '.svg', '.webp', '.gif',
13
- '.zip', '.xml', '.json', '.mp4'
13
+ '.zip', '.xml', '.json', '.mp4', '.avif', '.ics'
14
14
  ]);
15
15
  export function normalizeUrl(input, base, options = {}) {
16
16
  try {
@@ -71,6 +71,7 @@ export function normalizeUrl(input, base, options = {}) {
71
71
  pathname = pathname.slice(0, -1);
72
72
  }
73
73
  u.pathname = pathname;
74
+ const finalUrl = u.toString();
74
75
  // 9. Skip non-HTML assets by extension
75
76
  const lastDotIndex = u.pathname.lastIndexOf('.');
76
77
  if (lastDotIndex !== -1) {
@@ -79,10 +80,125 @@ export function normalizeUrl(input, base, options = {}) {
79
80
  return null;
80
81
  }
81
82
  }
82
- // 10. Return final string
83
- return u.toString();
83
+ // 10. Return path if requested
84
+ if (options.toPath) {
85
+ return u.pathname + u.search;
86
+ }
87
+ // 11. Return final string
88
+ return finalUrl;
84
89
  }
85
90
  catch (_e) {
86
91
  return null;
87
92
  }
88
93
  }
94
+ /**
95
+ * Utility for converting between absolute URLs and relative paths
96
+ * primarily used for database storage.
97
+ */
98
+ export class UrlUtil {
99
+ /**
100
+ * Extract a stable domain key from a URL/domain input.
101
+ * Examples:
102
+ * - "https://www.example.com/a" -> "example.com"
103
+ * - "example.com" -> "example.com"
104
+ */
105
+ static extractDomain(input) {
106
+ const trimmed = input.trim();
107
+ if (!trimmed)
108
+ return '';
109
+ try {
110
+ const direct = new URL(trimmed);
111
+ return direct.hostname.toLowerCase().replace(/^www\./, '');
112
+ }
113
+ catch {
114
+ // fall through
115
+ }
116
+ try {
117
+ const withProtocol = new URL(`https://${trimmed}`);
118
+ return withProtocol.hostname.toLowerCase().replace(/^www\./, '');
119
+ }
120
+ catch {
121
+ return trimmed.toLowerCase().replace(/^www\./, '');
122
+ }
123
+ }
124
+ /**
125
+ * Resolve a site's absolute origin from persisted site fields.
126
+ */
127
+ static resolveSiteOrigin(site) {
128
+ if (site.preferred_url) {
129
+ try {
130
+ return new URL(site.preferred_url).origin;
131
+ }
132
+ catch {
133
+ // fall through to domain+ssl fallback
134
+ }
135
+ }
136
+ const protocol = site.ssl === 0 ? 'http' : 'https';
137
+ return `${protocol}://${site.domain}`;
138
+ }
139
+ /**
140
+ * Converts a full URL to a root-relative path if it matches the origin.
141
+ * If it doesn't match the origin, it's considered external and kept absolute.
142
+ */
143
+ static toPath(urlStr, origin) {
144
+ try {
145
+ const url = new URL(urlStr);
146
+ const originUrl = new URL(origin);
147
+ if (url.origin === originUrl.origin) {
148
+ return url.pathname + url.search;
149
+ }
150
+ return urlStr;
151
+ }
152
+ catch {
153
+ return urlStr;
154
+ }
155
+ }
156
+ /**
157
+ * Converts a root-relative path back to an absolute URL relative to the origin.
158
+ * If the input is already an absolute URL, it is returned as-is.
159
+ */
160
+ static toAbsolute(pathOrUrl, origin) {
161
+ if (pathOrUrl.startsWith('http://') || pathOrUrl.startsWith('https://')) {
162
+ return pathOrUrl;
163
+ }
164
+ try {
165
+ return new URL(pathOrUrl, origin).toString();
166
+ }
167
+ catch {
168
+ return pathOrUrl;
169
+ }
170
+ }
171
+ /**
172
+ * Determines if a URL (or path) is internal relative to the origin.
173
+ */
174
+ static isInternal(pathOrUrl, origin) {
175
+ if (!pathOrUrl.startsWith('http'))
176
+ return true;
177
+ try {
178
+ const url = new URL(pathOrUrl);
179
+ const originUrl = new URL(origin);
180
+ return url.origin === originUrl.origin;
181
+ }
182
+ catch {
183
+ return false;
184
+ }
185
+ }
186
+ /**
187
+ * Build normalized lookup candidates for querying pages table.
188
+ * Returns path/absolute/original variants in priority order, deduplicated.
189
+ */
190
+ static toLookupCandidates(input, origin) {
191
+ const candidates = new Set();
192
+ const raw = input.trim();
193
+ if (!raw)
194
+ return [];
195
+ const absolute = normalizeUrl(raw, origin, { stripQuery: false }) || UrlUtil.toAbsolute(raw, origin);
196
+ const path = normalizeUrl(raw, origin, { stripQuery: false, toPath: true }) || UrlUtil.toPath(raw, origin);
197
+ const absolutePath = normalizeUrl(absolute, '', { stripQuery: false, toPath: true }) || UrlUtil.toPath(absolute, origin);
198
+ candidates.add(path);
199
+ candidates.add(absolute);
200
+ candidates.add(absolutePath);
201
+ candidates.add(raw);
202
+ return Array.from(candidates).filter(Boolean);
203
+ }
204
+ }
@@ -11,12 +11,10 @@ export interface ParseResult {
11
11
  contentHash: string;
12
12
  simhash?: string;
13
13
  uniqueTokenRatio?: number;
14
- soft404Score: number;
15
- soft404Signals: string[];
16
14
  }
17
15
  export declare class Parser {
18
16
  /**
19
17
  * Parses HTML content to extract metadata and links.
20
18
  */
21
- parse(html: string, baseUrl: string, status: number): ParseResult;
19
+ parse(html: string, baseUrl: string, _status: number): ParseResult;
22
20
  }