@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,3 +1,23 @@
1
1
  import { EngineContext } from '../events.js';
2
2
  import { Graph } from '../graph/graph.js';
3
- export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, context?: EngineContext, limitReached?: boolean, graphInstance?: Graph): void;
3
+ export interface PostCrawlOptions {
4
+ context?: EngineContext;
5
+ limitReached?: boolean;
6
+ graphInstance?: Graph;
7
+ clustering?: boolean;
8
+ clusterThreshold?: number;
9
+ minClusterSize?: number;
10
+ health?: boolean;
11
+ computePagerank?: boolean;
12
+ computeHits?: boolean;
13
+ heading?: boolean;
14
+ orphans?: boolean;
15
+ orphanSeverity?: 'low' | 'medium' | 'high' | boolean;
16
+ includeSoftOrphans?: boolean;
17
+ minInbound?: number;
18
+ rootOrigin?: string;
19
+ }
20
+ export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, options?: PostCrawlOptions): {
21
+ metrics: any;
22
+ healthData?: any;
23
+ } | undefined;
@@ -3,11 +3,22 @@ import { loadGraphFromSnapshot } from '../db/graphLoader.js';
3
3
  import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
4
4
  import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
5
5
  import { PageRepository } from '../db/repositories/PageRepository.js';
6
- import { computePageRank } from '../graph/pagerank.js';
7
6
  import { calculateMetrics } from '../graph/metrics.js';
8
- import { computeHITS } from '../scoring/hits.js';
9
- import { calculateHealthScore, collectCrawlIssues } from '../scoring/health.js';
10
- export function runPostCrawlMetrics(snapshotId, maxDepth, context, limitReached = false, graphInstance) {
7
+ import { PageRankService } from '../graph/pagerank.js';
8
+ import { HITSService } from '../graph/hits.js';
9
+ import { TrapDetector } from './trap.js';
10
+ import { ClusteringService } from '../analysis/clustering.js';
11
+ import { DuplicateService } from '../analysis/duplicate.js';
12
+ import { annotateOrphans } from '../analysis/orphan.js';
13
+ import { Soft404Service } from '../analysis/soft404.js';
14
+ import { HeadingHealthService } from '../analysis/heading.js';
15
+ import { HealthService } from '../scoring/health.js';
16
+ import { analyzeContent } from '../analysis/content.js';
17
+ import { load } from 'cheerio';
18
+ export function runPostCrawlMetrics(snapshotId, maxDepth, options = {}) {
19
+ const context = options.context;
20
+ const limitReached = options.limitReached || false;
21
+ const graphInstance = options.graphInstance;
11
22
  const db = getDb();
12
23
  const metricsRepo = new MetricsRepository(db);
13
24
  const snapshotRepo = new SnapshotRepository(db);
@@ -21,62 +32,160 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, context, limitReached
21
32
  else {
22
33
  if (event.type === 'error')
23
34
  console.error(event.message);
24
- else if (event.type !== 'debug')
25
- console.log(event.message || event.phase);
35
+ else if (event.type !== 'debug') {
36
+ const out = event.message || event.phase;
37
+ if (out)
38
+ console.log(out);
39
+ }
26
40
  }
27
41
  };
28
42
  const snapshot = snapshotRepo.getSnapshot(snapshotId);
29
43
  if (!snapshot) {
30
44
  emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
31
- return;
45
+ return undefined;
32
46
  }
33
47
  if (!graphInstance) {
34
48
  emit({ type: 'metrics:start', phase: 'Loading graph' });
35
49
  }
36
- emit({ type: 'metrics:start', phase: 'Computing PageRank' });
37
- computePageRank(graph);
38
- emit({ type: 'metrics:start', phase: 'Computing HITS' });
39
- computeHITS(graph);
50
+ emit({ type: 'metrics:start', phase: 'Running core algorithms' });
51
+ // 1. Graph Algorithms
52
+ const prResults = options.computePagerank ? new PageRankService().evaluate(graph) : new Map();
53
+ const hitsResults = options.computeHits ? new HITSService().evaluate(graph, { iterations: 20 }) : new Map();
54
+ // 2. Crawler Safety
55
+ new TrapDetector().analyze(graph);
56
+ // 3. Analysis / Intelligence
57
+ if (options.clustering) {
58
+ const contentClusters = new ClusteringService().detectContentClusters(graph, options.clusterThreshold, options.minClusterSize);
59
+ if (contentClusters.length > 0) {
60
+ const insertCluster = db.prepare(`
61
+ INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
62
+ VALUES (@id, @snapshot_id, @count, @primary_url, @risk, @shared_path_prefix)
63
+ `);
64
+ const insertContentTx = db.transaction((clusters) => {
65
+ for (const c of clusters) {
66
+ insertCluster.run({
67
+ id: c.id,
68
+ snapshot_id: snapshotId,
69
+ count: c.count,
70
+ primary_url: c.primaryUrl,
71
+ risk: c.risk,
72
+ shared_path_prefix: c.sharedPathPrefix ?? null
73
+ });
74
+ }
75
+ });
76
+ insertContentTx(contentClusters);
77
+ }
78
+ }
79
+ const duplicateClusters = new DuplicateService().detectDuplicates(graph, { collapse: false });
80
+ if (duplicateClusters.length > 0) {
81
+ const insertCluster = db.prepare(`
82
+ INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
83
+ VALUES (@id, @snapshot_id, @type, @size, @representative, @severity)
84
+ `);
85
+ const insertDuplicateTx = db.transaction((clusters) => {
86
+ for (const c of clusters) {
87
+ insertCluster.run({
88
+ id: c.id,
89
+ snapshot_id: snapshotId,
90
+ type: c.type, // valid: 'exact' | 'near' | 'template_heavy'
91
+ size: c.size,
92
+ representative: c.representative,
93
+ severity: c.severity || 'low'
94
+ });
95
+ }
96
+ });
97
+ insertDuplicateTx(duplicateClusters);
98
+ }
99
+ let annotatedNodes = [];
100
+ if (options.orphans) {
101
+ const orphanOptions = {
102
+ enabled: true,
103
+ severityEnabled: !!options.orphanSeverity || options.orphanSeverity === undefined,
104
+ includeSoftOrphans: options.includeSoftOrphans ?? true,
105
+ minInbound: options.minInbound ?? 2
106
+ };
107
+ annotatedNodes = annotateOrphans(graph.getNodes(), graph.getEdges(), orphanOptions);
108
+ }
109
+ const soft404Service = new Soft404Service();
110
+ const headingService = new HeadingHealthService();
111
+ // Pre-calculate heading health for all nodes with HTML
112
+ let headingPayloads = new Map();
113
+ if (options.heading) {
114
+ const result = headingService.evaluateNodes(graph.getNodes());
115
+ headingPayloads = result.payloadsByUrl;
116
+ }
117
+ // Apply signals to nodes
118
+ for (const node of graph.getNodes()) {
119
+ const pr = prResults.get(node.url);
120
+ if (pr)
121
+ node.pagerankScore = pr.score;
122
+ const hits = hitsResults.get(node.url);
123
+ if (hits) {
124
+ node.authScore = hits.authority_score;
125
+ node.hubScore = hits.hub_score;
126
+ node.linkRole = hits.link_role;
127
+ }
128
+ if (options.orphans) {
129
+ const annotated = annotatedNodes.find((n) => n.url === node.url);
130
+ if (annotated) {
131
+ node.orphanScore = annotated.orphanSeverity;
132
+ node.orphanType = annotated.orphanType;
133
+ node.impactLevel = annotated.impactLevel;
134
+ }
135
+ }
136
+ if (options.heading) {
137
+ const heading = headingPayloads.get(node.url);
138
+ if (heading) {
139
+ node.headingScore = heading.score;
140
+ node.headingData = JSON.stringify(heading);
141
+ }
142
+ }
143
+ if (node.html) {
144
+ const soft404 = soft404Service.analyze(node.html, node.outLinks);
145
+ node.soft404Score = soft404.score;
146
+ const $ = load(node.html);
147
+ const content = analyzeContent($);
148
+ node.wordCount = content.wordCount;
149
+ }
150
+ }
40
151
  emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
41
- const nodes = graph.getNodes();
42
152
  // Pre-fetch all page IDs to avoid N+1 queries
43
- // Use getPagesIdentityBySnapshot to avoid loading full page content (HTML) into memory again
44
- const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
153
+ const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
45
154
  const urlToId = new Map();
46
- for (const p of pages) {
155
+ for (const p of pagesIdentity) {
47
156
  urlToId.set(p.normalized_url, p.id);
48
157
  }
49
- const clusterStmt = db.prepare(`
50
- INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
51
- VALUES (?, ?, ?, ?, ?, ?)
52
- `);
53
- const contentStmt = db.prepare(`
54
- INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
55
- VALUES (?, ?, ?, ?, ?, ?)
56
- `);
158
+ const metricsToSave = graph.getNodes().map(node => {
159
+ const pageId = urlToId.get(node.url);
160
+ if (!pageId)
161
+ return null;
162
+ return {
163
+ snapshot_id: snapshotId,
164
+ page_id: pageId,
165
+ crawl_status: node.crawlStatus ?? null,
166
+ word_count: node.wordCount ?? null,
167
+ thin_content_score: node.thinContentScore ?? null,
168
+ external_link_ratio: node.externalLinkRatio ?? null,
169
+ pagerank_score: node.pagerankScore ?? null,
170
+ hub_score: node.hubScore ?? null,
171
+ auth_score: node.authScore ?? null,
172
+ link_role: node.linkRole ?? null,
173
+ duplicate_cluster_id: node.duplicateClusterId ?? null,
174
+ duplicate_type: node.duplicateType ?? null,
175
+ cluster_id: node.clusterId ?? null,
176
+ soft404_score: node.soft404Score ?? null,
177
+ heading_score: node.headingScore ?? null,
178
+ orphan_score: node.orphanScore ?? null,
179
+ orphan_type: node.orphanType ?? null,
180
+ impact_level: node.impactLevel ?? null,
181
+ heading_data: node.headingData ?? null,
182
+ is_cluster_primary: node.isClusterPrimary ? 1 : 0
183
+ };
184
+ }).filter(m => m !== null);
185
+ metricsRepo.insertMany(metricsToSave);
186
+ // Update page-level metadata in transaction
57
187
  const tx = db.transaction(() => {
58
- for (const node of nodes) {
59
- const pageId = urlToId.get(node.url);
60
- if (!pageId)
61
- continue;
62
- metricsRepo.insertMetrics({
63
- snapshot_id: snapshotId,
64
- page_id: pageId,
65
- authority_score: node.authorityScore ?? null,
66
- hub_score: node.hubScore ?? null,
67
- pagerank: node.pageRank ?? null,
68
- pagerank_score: node.pageRankScore ?? null,
69
- link_role: node.linkRole ?? null,
70
- crawl_status: node.crawlStatus ?? null,
71
- word_count: node.wordCount ?? null,
72
- thin_content_score: node.thinContentScore ?? null,
73
- external_link_ratio: node.externalLinkRatio ?? null,
74
- orphan_score: node.orphanScore ?? null,
75
- duplicate_cluster_id: node.duplicateClusterId ?? null,
76
- duplicate_type: node.duplicateType ?? null,
77
- is_cluster_primary: node.isClusterPrimary ? 1 : 0
78
- });
79
- // Update page-level crawl trap data
188
+ for (const node of graph.getNodes()) {
80
189
  if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
81
190
  pageRepo.upsertPage({
82
191
  site_id: snapshot.site_id,
@@ -90,28 +199,40 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, context, limitReached
90
199
  });
91
200
  }
92
201
  }
93
- // Save duplicate clusters
94
- for (const cluster of graph.duplicateClusters) {
95
- clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
96
- }
97
- // Save content clusters
98
- for (const cluster of graph.contentClusters) {
99
- contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
100
- }
101
202
  });
102
203
  tx();
103
204
  emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
104
205
  const metrics = calculateMetrics(graph, maxDepth);
105
- // Calculate penalty-based health score (matches CLI)
106
- const issues = collectCrawlIssues(graph, metrics);
107
- const health = calculateHealthScore(metrics.totalPages, issues);
206
+ // Compute health score if enabled
207
+ let healthScore = null;
208
+ if (options.health) {
209
+ try {
210
+ const rootOrigin = options.rootOrigin ?? '';
211
+ const healthService = new HealthService();
212
+ const issues = healthService.collectCrawlIssues(graph, metrics, rootOrigin);
213
+ const breakdown = healthService.calculateHealthScore(metrics.totalPages, issues);
214
+ healthScore = breakdown.score;
215
+ }
216
+ catch (e) {
217
+ emit({ type: 'error', message: 'Error computing health score', error: e });
218
+ }
219
+ }
220
+ const thinContentCount = graph.getNodes().filter(n => n.wordCount !== undefined && n.wordCount < 200 && n.status === 200).length;
221
+ const orphanCount = metrics.orphanPages.length;
108
222
  snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
109
223
  node_count: metrics.totalPages,
110
224
  edge_count: metrics.totalEdges,
111
- health_score: health.score,
112
- orphan_count: issues.orphanPages,
113
- thin_content_count: issues.thinContent,
114
- limit_reached: limitReached ? 1 : 0
225
+ limit_reached: limitReached ? 1 : 0,
226
+ thin_content_count: thinContentCount,
227
+ orphan_count: orphanCount,
228
+ ...(healthScore !== null ? { health_score: healthScore } : {})
115
229
  });
116
230
  emit({ type: 'metrics:complete', durationMs: 0 });
231
+ return {
232
+ metrics,
233
+ healthData: healthScore !== null ? {
234
+ health: new HealthService().calculateHealthScore(metrics.totalPages, new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')),
235
+ issues: new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')
236
+ } : undefined
237
+ };
117
238
  }
@@ -3,5 +3,46 @@
3
3
  */
4
4
  export interface NormalizeOptions {
5
5
  stripQuery?: boolean;
6
+ toPath?: boolean;
6
7
  }
7
8
  export declare function normalizeUrl(input: string, base: string, options?: NormalizeOptions): string | null;
9
+ /**
10
+ * Utility for converting between absolute URLs and relative paths
11
+ * primarily used for database storage.
12
+ */
13
+ export declare class UrlUtil {
14
+ /**
15
+ * Extract a stable domain key from a URL/domain input.
16
+ * Examples:
17
+ * - "https://www.example.com/a" -> "example.com"
18
+ * - "example.com" -> "example.com"
19
+ */
20
+ static extractDomain(input: string): string;
21
+ /**
22
+ * Resolve a site's absolute origin from persisted site fields.
23
+ */
24
+ static resolveSiteOrigin(site: {
25
+ domain: string;
26
+ preferred_url?: string | null;
27
+ ssl?: number | null;
28
+ }): string;
29
+ /**
30
+ * Converts a full URL to a root-relative path if it matches the origin.
31
+ * If it doesn't match the origin, it's considered external and kept absolute.
32
+ */
33
+ static toPath(urlStr: string, origin: string): string;
34
+ /**
35
+ * Converts a root-relative path back to an absolute URL relative to the origin.
36
+ * If the input is already an absolute URL, it is returned as-is.
37
+ */
38
+ static toAbsolute(pathOrUrl: string, origin: string): string;
39
+ /**
40
+ * Determines if a URL (or path) is internal relative to the origin.
41
+ */
42
+ static isInternal(pathOrUrl: string, origin: string): boolean;
43
+ /**
44
+ * Build normalized lookup candidates for querying pages table.
45
+ * Returns path/absolute/original variants in priority order, deduplicated.
46
+ */
47
+ static toLookupCandidates(input: string, origin: string): string[];
48
+ }
@@ -10,7 +10,7 @@ const TRACKING_PARAMS = new Set([
10
10
  ]);
11
11
  const SKIP_EXTENSIONS = new Set([
12
12
  '.pdf', '.jpg', '.png', '.svg', '.webp', '.gif',
13
- '.zip', '.xml', '.json', '.mp4'
13
+ '.zip', '.xml', '.json', '.mp4', '.avif', '.ics'
14
14
  ]);
15
15
  export function normalizeUrl(input, base, options = {}) {
16
16
  try {
@@ -71,6 +71,7 @@ export function normalizeUrl(input, base, options = {}) {
71
71
  pathname = pathname.slice(0, -1);
72
72
  }
73
73
  u.pathname = pathname;
74
+ const finalUrl = u.toString();
74
75
  // 9. Skip non-HTML assets by extension
75
76
  const lastDotIndex = u.pathname.lastIndexOf('.');
76
77
  if (lastDotIndex !== -1) {
@@ -79,10 +80,125 @@ export function normalizeUrl(input, base, options = {}) {
79
80
  return null;
80
81
  }
81
82
  }
82
- // 10. Return final string
83
- return u.toString();
83
+ // 10. Return path if requested
84
+ if (options.toPath) {
85
+ return u.pathname + u.search;
86
+ }
87
+ // 11. Return final string
88
+ return finalUrl;
84
89
  }
85
90
  catch (_e) {
86
91
  return null;
87
92
  }
88
93
  }
94
+ /**
95
+ * Utility for converting between absolute URLs and relative paths
96
+ * primarily used for database storage.
97
+ */
98
+ export class UrlUtil {
99
+ /**
100
+ * Extract a stable domain key from a URL/domain input.
101
+ * Examples:
102
+ * - "https://www.example.com/a" -> "example.com"
103
+ * - "example.com" -> "example.com"
104
+ */
105
+ static extractDomain(input) {
106
+ const trimmed = input.trim();
107
+ if (!trimmed)
108
+ return '';
109
+ try {
110
+ const direct = new URL(trimmed);
111
+ return direct.hostname.toLowerCase().replace(/^www\./, '');
112
+ }
113
+ catch {
114
+ // fall through
115
+ }
116
+ try {
117
+ const withProtocol = new URL(`https://${trimmed}`);
118
+ return withProtocol.hostname.toLowerCase().replace(/^www\./, '');
119
+ }
120
+ catch {
121
+ return trimmed.toLowerCase().replace(/^www\./, '');
122
+ }
123
+ }
124
+ /**
125
+ * Resolve a site's absolute origin from persisted site fields.
126
+ */
127
+ static resolveSiteOrigin(site) {
128
+ if (site.preferred_url) {
129
+ try {
130
+ return new URL(site.preferred_url).origin;
131
+ }
132
+ catch {
133
+ // fall through to domain+ssl fallback
134
+ }
135
+ }
136
+ const protocol = site.ssl === 0 ? 'http' : 'https';
137
+ return `${protocol}://${site.domain}`;
138
+ }
139
+ /**
140
+ * Converts a full URL to a root-relative path if it matches the origin.
141
+ * If it doesn't match the origin, it's considered external and kept absolute.
142
+ */
143
+ static toPath(urlStr, origin) {
144
+ try {
145
+ const url = new URL(urlStr);
146
+ const originUrl = new URL(origin);
147
+ if (url.origin === originUrl.origin) {
148
+ return url.pathname + url.search;
149
+ }
150
+ return urlStr;
151
+ }
152
+ catch {
153
+ return urlStr;
154
+ }
155
+ }
156
+ /**
157
+ * Converts a root-relative path back to an absolute URL relative to the origin.
158
+ * If the input is already an absolute URL, it is returned as-is.
159
+ */
160
+ static toAbsolute(pathOrUrl, origin) {
161
+ if (pathOrUrl.startsWith('http://') || pathOrUrl.startsWith('https://')) {
162
+ return pathOrUrl;
163
+ }
164
+ try {
165
+ return new URL(pathOrUrl, origin).toString();
166
+ }
167
+ catch {
168
+ return pathOrUrl;
169
+ }
170
+ }
171
+ /**
172
+ * Determines if a URL (or path) is internal relative to the origin.
173
+ */
174
+ static isInternal(pathOrUrl, origin) {
175
+ if (!pathOrUrl.startsWith('http'))
176
+ return true;
177
+ try {
178
+ const url = new URL(pathOrUrl);
179
+ const originUrl = new URL(origin);
180
+ return url.origin === originUrl.origin;
181
+ }
182
+ catch {
183
+ return false;
184
+ }
185
+ }
186
+ /**
187
+ * Build normalized lookup candidates for querying pages table.
188
+ * Returns path/absolute/original variants in priority order, deduplicated.
189
+ */
190
+ static toLookupCandidates(input, origin) {
191
+ const candidates = new Set();
192
+ const raw = input.trim();
193
+ if (!raw)
194
+ return [];
195
+ const absolute = normalizeUrl(raw, origin, { stripQuery: false }) || UrlUtil.toAbsolute(raw, origin);
196
+ const path = normalizeUrl(raw, origin, { stripQuery: false, toPath: true }) || UrlUtil.toPath(raw, origin);
197
+ const absolutePath = normalizeUrl(absolute, '', { stripQuery: false, toPath: true }) || UrlUtil.toPath(absolute, origin);
198
+ candidates.add(path);
199
+ candidates.add(absolute);
200
+ candidates.add(absolutePath);
201
+ candidates.add(raw);
202
+ return Array.from(candidates).filter(Boolean);
203
+ }
204
+ }
@@ -11,12 +11,10 @@ export interface ParseResult {
11
11
  contentHash: string;
12
12
  simhash?: string;
13
13
  uniqueTokenRatio?: number;
14
- soft404Score: number;
15
- soft404Signals: string[];
16
14
  }
17
15
  export declare class Parser {
18
16
  /**
19
17
  * Parses HTML content to extract metadata and links.
20
18
  */
21
- parse(html: string, baseUrl: string, status: number): ParseResult;
19
+ parse(html: string, baseUrl: string, _status: number): ParseResult;
22
20
  }
@@ -6,7 +6,7 @@ export class Parser {
6
6
  /**
7
7
  * Parses HTML content to extract metadata and links.
8
8
  */
9
- parse(html, baseUrl, status) {
9
+ parse(html, baseUrl, _status) {
10
10
  const $ = cheerio.load(html);
11
11
  // 1. Robots Meta
12
12
  let noindex = false;
@@ -97,51 +97,6 @@ export class Parser {
97
97
  const uniqueTokens = new Set(tokens);
98
98
  const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
99
99
  const simhash = SimHash.generate(tokens).toString();
100
- // 5. Soft 404 Detection
101
- let soft404Score = 0;
102
- const soft404Signals = [];
103
- if (status === 200) {
104
- const title = $('title').text().toLowerCase();
105
- const h1Text = $('h1').first().text().toLowerCase();
106
- const bodyText = cleanText.toLowerCase();
107
- const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];
108
- // Pattern checks
109
- for (const pattern of errorPatterns) {
110
- if (title.includes(pattern)) {
111
- soft404Score += 0.4;
112
- soft404Signals.push(`title_pattern_${pattern.replace(/\s+/g, '_')}`);
113
- break;
114
- }
115
- }
116
- for (const pattern of errorPatterns) {
117
- if (h1Text.includes(pattern)) {
118
- soft404Score += 0.3;
119
- soft404Signals.push(`h1_pattern_${pattern.replace(/\s+/g, '_')}`);
120
- break;
121
- }
122
- }
123
- if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
124
- soft404Score += 0.2;
125
- soft404Signals.push('body_error_phrase');
126
- }
127
- // Content length check (Word count approximation)
128
- const words = cleanText.split(/\s+/).filter(w => w.length > 0);
129
- if (words.length < 50) {
130
- soft404Score += 0.3;
131
- soft404Signals.push('very_low_word_count');
132
- }
133
- else if (words.length < 150) {
134
- soft404Score += 0.1;
135
- soft404Signals.push('low_word_count');
136
- }
137
- // Link count check
138
- if (links.size === 0) {
139
- soft404Score += 0.2;
140
- soft404Signals.push('no_outbound_links');
141
- }
142
- // Cap at 1.0
143
- soft404Score = Math.min(1.0, soft404Score);
144
- }
145
100
  return {
146
101
  links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
147
102
  html: html, // pass raw HTML for analysis
@@ -150,9 +105,7 @@ export class Parser {
150
105
  nofollow,
151
106
  contentHash,
152
107
  simhash,
153
- uniqueTokenRatio,
154
- soft404Score,
155
- soft404Signals
108
+ uniqueTokenRatio
156
109
  };
157
110
  }
158
111
  }
@@ -0,0 +1,11 @@
1
+ import { Fetcher } from './fetcher.js';
2
+ import { Site } from '../db/repositories/SiteRepository.js';
3
/** Outcome of URL resolution: the URL to use together with its site record. */
+ export interface ResolvedUrl {
4
+ url: string;
5
+ site: Site;
6
+ }
7
/** Turns user input (a full URL or a bare domain) into a ResolvedUrl. */
+ export declare class UrlResolver {
8
+ private siteRepo;
9
+ constructor();
10
/**
 * Resolves `inputUrl` to a URL plus its site record; `fetcher` is used to
 * probe the host when the input carries no http/https scheme.
 * The returned promise rejects when the input cannot be parsed as a URL.
 */
+ resolve(inputUrl: string, fetcher: Fetcher): Promise<ResolvedUrl>;
11
+ }
@@ -0,0 +1,67 @@
1
+ import { SiteRepository } from '../db/repositories/SiteRepository.js';
2
+ import { getDb } from '../db/index.js';
3
/**
 * Resolves user-supplied input (a full URL or a bare domain) to the URL that
 * should be used, together with the matching site record.
 */
export class UrlResolver {
    siteRepo;
    constructor() {
        this.siteRepo = new SiteRepository(getDb());
    }
    /**
     * @param {string} inputUrl - Full http(s) URL, or a domain without a scheme.
     * @param {{ fetch(url: string): Promise<{ status: unknown, finalUrl: string }> }} fetcher
     *   Used to probe the host when no scheme was supplied.
     * @returns {Promise<{ url: string, site: object }>} The resolved URL and site record.
     * @throws {Error} When the input cannot be parsed as a URL or domain.
     */
    async resolve(inputUrl, fetcher) {
        const input = inputUrl.trim();
        // Scheme matching is case-insensitive ("HTTPS://host" is a valid URL).
        const hasProtocol = /^https?:\/\//i.test(input);
        const workingUrl = hasProtocol ? input : `https://${input}`;
        let parsed;
        try {
            parsed = new URL(workingUrl);
        }
        catch (cause) {
            throw new Error(`Invalid URL or domain: ${inputUrl}`, { cause });
        }
        const { hostname } = parsed;
        const domain = hostname.replace(/^www\./, '');
        let site = this.siteRepo.firstOrCreateSite(domain);
        if (hasProtocol) {
            // Explicit scheme: honour it as given, and record it as the site's
            // preference the first time this site is seen.
            if (site.ssl === null) {
                this.siteRepo.updateSitePreference(site.id, {
                    preferred_url: input,
                    ssl: parsed.protocol === 'https:' ? 1 : 0
                });
                site = this.siteRepo.getSiteById(site.id);
            }
            return { url: input, site };
        }
        // No scheme supplied: reuse the stored preference when there is one.
        if (site.ssl !== null && site.preferred_url) {
            return { url: site.preferred_url, site };
        }
        // Otherwise probe HTTPS first, then HTTP, and remember the first that answers.
        for (const probeUrl of [`https://${hostname}/`, `http://${hostname}/`]) {
            try {
                const res = await fetcher.fetch(probeUrl);
                if (typeof res.status === 'number' && res.status >= 200 && res.status < 400) {
                    const isSsl = res.finalUrl.startsWith('https:') ? 1 : 0;
                    this.siteRepo.updateSitePreference(site.id, { preferred_url: res.finalUrl, ssl: isSsl });
                    // Refresh site object so callers see the stored preference.
                    site = this.siteRepo.getSiteById(site.id);
                    return { url: res.finalUrl, site };
                }
            }
            catch {
                // Probing is best-effort: a failed attempt just moves on to the next candidate.
            }
        }
        // Neither probe succeeded (error or non-2xx/3xx status): default to the
        // input as https without persisting any preference.
        return { url: workingUrl, site };
    }
}