@crawlith/core 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +70 -0
  3. package/dist/analysis/analyze.d.ts +29 -8
  4. package/dist/analysis/analyze.js +325 -221
  5. package/dist/analysis/clustering.d.ts +23 -0
  6. package/dist/analysis/clustering.js +206 -0
  7. package/dist/analysis/content.d.ts +1 -1
  8. package/dist/analysis/content.js +11 -5
  9. package/dist/analysis/duplicate.d.ts +34 -0
  10. package/dist/analysis/duplicate.js +305 -0
  11. package/dist/analysis/heading.d.ts +116 -0
  12. package/dist/analysis/heading.js +356 -0
  13. package/dist/analysis/images.d.ts +1 -1
  14. package/dist/analysis/images.js +6 -5
  15. package/dist/analysis/links.d.ts +1 -1
  16. package/dist/analysis/links.js +8 -8
  17. package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
  18. package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
  19. package/dist/analysis/scoring.js +4 -1
  20. package/dist/analysis/seo.d.ts +8 -4
  21. package/dist/analysis/seo.js +41 -30
  22. package/dist/analysis/soft404.d.ts +17 -0
  23. package/dist/analysis/soft404.js +62 -0
  24. package/dist/analysis/structuredData.d.ts +1 -1
  25. package/dist/analysis/structuredData.js +5 -4
  26. package/dist/application/index.d.ts +2 -0
  27. package/dist/application/index.js +2 -0
  28. package/dist/application/usecase.d.ts +3 -0
  29. package/dist/application/usecase.js +1 -0
  30. package/dist/application/usecases.d.ts +114 -0
  31. package/dist/application/usecases.js +201 -0
  32. package/dist/audit/index.js +1 -1
  33. package/dist/audit/transport.d.ts +1 -1
  34. package/dist/audit/transport.js +5 -4
  35. package/dist/audit/types.d.ts +1 -0
  36. package/dist/constants.d.ts +17 -0
  37. package/dist/constants.js +23 -0
  38. package/dist/core/scope/scopeManager.js +3 -0
  39. package/dist/crawler/crawl.d.ts +2 -2
  40. package/dist/crawler/crawler.d.ts +17 -5
  41. package/dist/crawler/crawler.js +259 -94
  42. package/dist/crawler/fetcher.d.ts +1 -1
  43. package/dist/crawler/fetcher.js +6 -6
  44. package/dist/crawler/metricsRunner.d.ts +21 -1
  45. package/dist/crawler/metricsRunner.js +181 -60
  46. package/dist/crawler/normalize.d.ts +41 -0
  47. package/dist/crawler/normalize.js +119 -3
  48. package/dist/crawler/parser.d.ts +1 -3
  49. package/dist/crawler/parser.js +2 -49
  50. package/dist/crawler/resolver.d.ts +11 -0
  51. package/dist/crawler/resolver.js +67 -0
  52. package/dist/crawler/sitemap.d.ts +4 -1
  53. package/dist/crawler/sitemap.js +24 -18
  54. package/dist/crawler/trap.d.ts +5 -1
  55. package/dist/crawler/trap.js +23 -2
  56. package/dist/db/CrawlithDB.d.ts +110 -0
  57. package/dist/db/CrawlithDB.js +500 -0
  58. package/dist/db/graphLoader.js +15 -32
  59. package/dist/db/index.d.ts +9 -1
  60. package/dist/db/index.js +39 -31
  61. package/dist/db/migrations.d.ts +2 -0
  62. package/dist/db/{schema.js → migrations.js} +90 -43
  63. package/dist/db/pluginRegistry.d.ts +9 -0
  64. package/dist/db/pluginRegistry.js +19 -0
  65. package/dist/db/repositories/EdgeRepository.d.ts +5 -0
  66. package/dist/db/repositories/EdgeRepository.js +7 -0
  67. package/dist/db/repositories/MetricsRepository.d.ts +13 -8
  68. package/dist/db/repositories/MetricsRepository.js +14 -6
  69. package/dist/db/repositories/PageRepository.d.ts +5 -3
  70. package/dist/db/repositories/PageRepository.js +68 -17
  71. package/dist/db/repositories/SiteRepository.d.ts +6 -0
  72. package/dist/db/repositories/SiteRepository.js +4 -0
  73. package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
  74. package/dist/db/repositories/SnapshotRepository.js +48 -10
  75. package/dist/db/reset.d.ts +9 -0
  76. package/dist/db/reset.js +32 -0
  77. package/dist/db/statements.d.ts +12 -0
  78. package/dist/db/statements.js +40 -0
  79. package/dist/diff/compare.d.ts +0 -5
  80. package/dist/diff/compare.js +0 -12
  81. package/dist/diff/service.d.ts +16 -0
  82. package/dist/diff/service.js +41 -0
  83. package/dist/domain/index.d.ts +4 -0
  84. package/dist/domain/index.js +4 -0
  85. package/dist/events.d.ts +8 -0
  86. package/dist/graph/graph.d.ts +20 -42
  87. package/dist/graph/graph.js +12 -16
  88. package/dist/graph/hits.d.ts +23 -0
  89. package/dist/graph/hits.js +111 -0
  90. package/dist/graph/metrics.d.ts +0 -4
  91. package/dist/graph/metrics.js +19 -15
  92. package/dist/graph/pagerank.d.ts +17 -4
  93. package/dist/graph/pagerank.js +126 -93
  94. package/dist/index.d.ts +27 -9
  95. package/dist/index.js +27 -9
  96. package/dist/lock/lockManager.d.ts +1 -0
  97. package/dist/lock/lockManager.js +15 -0
  98. package/dist/plugin-system/plugin-cli.d.ts +10 -0
  99. package/dist/plugin-system/plugin-cli.js +31 -0
  100. package/dist/plugin-system/plugin-config.d.ts +16 -0
  101. package/dist/plugin-system/plugin-config.js +36 -0
  102. package/dist/plugin-system/plugin-loader.d.ts +17 -0
  103. package/dist/plugin-system/plugin-loader.js +122 -0
  104. package/dist/plugin-system/plugin-registry.d.ts +25 -0
  105. package/dist/plugin-system/plugin-registry.js +167 -0
  106. package/dist/plugin-system/plugin-types.d.ts +205 -0
  107. package/dist/plugin-system/plugin-types.js +1 -0
  108. package/dist/ports/index.d.ts +9 -0
  109. package/dist/ports/index.js +1 -0
  110. package/dist/report/export.d.ts +3 -0
  111. package/dist/report/export.js +81 -0
  112. package/dist/report/insight.d.ts +27 -0
  113. package/dist/report/insight.js +103 -0
  114. package/dist/scoring/health.d.ts +17 -11
  115. package/dist/scoring/health.js +183 -140
  116. package/dist/utils/chalk.d.ts +6 -0
  117. package/dist/utils/chalk.js +41 -0
  118. package/dist/utils/secureConfig.d.ts +23 -0
  119. package/dist/utils/secureConfig.js +128 -0
  120. package/package.json +10 -4
  121. package/CHANGELOG.md +0 -13
  122. package/dist/db/schema.d.ts +0 -2
  123. package/dist/graph/cluster.d.ts +0 -6
  124. package/dist/graph/cluster.js +0 -221
  125. package/dist/graph/duplicate.d.ts +0 -10
  126. package/dist/graph/duplicate.js +0 -302
  127. package/dist/scoring/hits.d.ts +0 -10
  128. package/dist/scoring/hits.js +0 -131
  129. package/scripts/copy-assets.js +0 -37
  130. package/src/analysis/analysis_list.html +0 -35
  131. package/src/analysis/analysis_page.html +0 -123
  132. package/src/analysis/analyze.ts +0 -505
  133. package/src/analysis/content.ts +0 -62
  134. package/src/analysis/images.ts +0 -28
  135. package/src/analysis/links.ts +0 -41
  136. package/src/analysis/scoring.ts +0 -66
  137. package/src/analysis/seo.ts +0 -82
  138. package/src/analysis/structuredData.ts +0 -62
  139. package/src/analysis/templates.ts +0 -9
  140. package/src/audit/dns.ts +0 -49
  141. package/src/audit/headers.ts +0 -98
  142. package/src/audit/index.ts +0 -66
  143. package/src/audit/scoring.ts +0 -232
  144. package/src/audit/transport.ts +0 -258
  145. package/src/audit/types.ts +0 -102
  146. package/src/core/network/proxyAdapter.ts +0 -21
  147. package/src/core/network/rateLimiter.ts +0 -39
  148. package/src/core/network/redirectController.ts +0 -47
  149. package/src/core/network/responseLimiter.ts +0 -34
  150. package/src/core/network/retryPolicy.ts +0 -57
  151. package/src/core/scope/domainFilter.ts +0 -45
  152. package/src/core/scope/scopeManager.ts +0 -52
  153. package/src/core/scope/subdomainPolicy.ts +0 -39
  154. package/src/core/security/ipGuard.ts +0 -171
  155. package/src/crawler/crawl.ts +0 -9
  156. package/src/crawler/crawler.ts +0 -601
  157. package/src/crawler/extract.ts +0 -39
  158. package/src/crawler/fetcher.ts +0 -251
  159. package/src/crawler/metricsRunner.ts +0 -137
  160. package/src/crawler/normalize.ts +0 -108
  161. package/src/crawler/parser.ts +0 -190
  162. package/src/crawler/sitemap.ts +0 -76
  163. package/src/crawler/trap.ts +0 -96
  164. package/src/db/graphLoader.ts +0 -135
  165. package/src/db/index.ts +0 -75
  166. package/src/db/repositories/EdgeRepository.ts +0 -43
  167. package/src/db/repositories/MetricsRepository.ts +0 -63
  168. package/src/db/repositories/PageRepository.ts +0 -228
  169. package/src/db/repositories/SiteRepository.ts +0 -43
  170. package/src/db/repositories/SnapshotRepository.ts +0 -99
  171. package/src/db/schema.ts +0 -177
  172. package/src/diff/compare.ts +0 -84
  173. package/src/events.ts +0 -16
  174. package/src/graph/cluster.ts +0 -246
  175. package/src/graph/duplicate.ts +0 -350
  176. package/src/graph/graph.ts +0 -192
  177. package/src/graph/metrics.ts +0 -125
  178. package/src/graph/pagerank.ts +0 -126
  179. package/src/graph/simhash.ts +0 -76
  180. package/src/index.ts +0 -33
  181. package/src/lock/hashKey.ts +0 -51
  182. package/src/lock/lockManager.ts +0 -132
  183. package/src/lock/pidCheck.ts +0 -13
  184. package/src/report/crawl.html +0 -879
  185. package/src/report/crawlExport.ts +0 -58
  186. package/src/report/crawl_template.ts +0 -9
  187. package/src/report/html.ts +0 -27
  188. package/src/scoring/health.ts +0 -241
  189. package/src/scoring/hits.ts +0 -153
  190. package/src/scoring/orphanSeverity.ts +0 -176
  191. package/src/utils/version.ts +0 -18
  192. package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
  193. package/tests/analysis.unit.test.ts +0 -142
  194. package/tests/analyze.integration.test.ts +0 -133
  195. package/tests/analyze_markdown.test.ts +0 -98
  196. package/tests/audit/audit.test.ts +0 -101
  197. package/tests/audit/dns.test.ts +0 -31
  198. package/tests/audit/headers.test.ts +0 -45
  199. package/tests/audit/scoring.test.ts +0 -133
  200. package/tests/audit/security.test.ts +0 -12
  201. package/tests/audit/transport.test.ts +0 -111
  202. package/tests/clustering.test.ts +0 -118
  203. package/tests/clustering_risk.test.ts +0 -118
  204. package/tests/crawler.test.ts +0 -364
  205. package/tests/db/index.test.ts +0 -134
  206. package/tests/db/repositories.test.ts +0 -115
  207. package/tests/db.test.ts +0 -159
  208. package/tests/db_repos.test.ts +0 -72
  209. package/tests/diff.test.ts +0 -67
  210. package/tests/duplicate.test.ts +0 -110
  211. package/tests/extract.test.ts +0 -86
  212. package/tests/fetcher.test.ts +0 -110
  213. package/tests/fetcher_safety.test.ts +0 -91
  214. package/tests/fixtures/analyze-crawl.json +0 -26
  215. package/tests/graph/graph.test.ts +0 -100
  216. package/tests/graphLoader.test.ts +0 -124
  217. package/tests/hits.test.ts +0 -134
  218. package/tests/html_report.test.ts +0 -59
  219. package/tests/ipGuard.test.ts +0 -73
  220. package/tests/lock/lockManager.test.ts +0 -198
  221. package/tests/metrics.test.ts +0 -196
  222. package/tests/normalize.test.ts +0 -88
  223. package/tests/orphanSeverity.test.ts +0 -160
  224. package/tests/pagerank.test.ts +0 -98
  225. package/tests/parser.test.ts +0 -117
  226. package/tests/proxy_safety.test.ts +0 -57
  227. package/tests/redirect_safety.test.ts +0 -77
  228. package/tests/renderAnalysisCsv.test.ts +0 -183
  229. package/tests/safety.test.ts +0 -126
  230. package/tests/scope.test.ts +0 -84
  231. package/tests/scoring.test.ts +0 -60
  232. package/tests/sitemap.test.ts +0 -100
  233. package/tests/soft404.test.ts +0 -41
  234. package/tests/ssrf_fix.test.ts +0 -69
  235. package/tests/trap.test.ts +0 -39
  236. package/tests/visualization_data.test.ts +0 -46
  237. package/tsconfig.json +0 -11
@@ -1,192 +0,0 @@
1
/**
 * A single page (URL) tracked by the crawl graph, together with every
 * per-page signal that later analysis passes attach to it.
 *
 * Only the first five members are required; everything else is filled in
 * lazily by the crawler or by post-crawl analysis.
 */
export interface GraphNode {
  // --- Identity and link structure -------------------------------------
  /** URL of the page; also the key under which the node is stored. */
  url: string;
  /** Depth assigned when the node was first created. */
  depth: number;
  /** Number of distinct edges pointing at this node. */
  inLinks: number;
  /** Number of distinct edges leaving this node. */
  outLinks: number;
  /** Response status; `0` means the page has not been crawled (yet). */
  status: number;

  // --- Indexability directives -----------------------------------------
  canonical?: string;
  noindex?: boolean;
  nofollow?: boolean;

  // --- Fetch / crawl bookkeeping ---------------------------------------
  crawlStatus?: string;
  incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
  redirectChain?: string[];
  brokenLinks?: string[];
  etag?: string;
  lastModified?: string;
  contentHash?: string;
  html?: string;
  bytesReceived?: number;
  retries?: number;
  securityError?: string;

  // --- Link-analysis scores --------------------------------------------
  /** Raw PageRank value. */
  pageRank?: number;
  /** PageRank rescaled to the 0-100 range. */
  pageRankScore?: number;
  authorityScore?: number;
  hubScore?: number;
  linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
  orphanScore?: number;

  // --- Duplicate detection and clustering ------------------------------
  duplicateClusterId?: string;
  duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
  isClusterPrimary?: boolean;
  isCollapsed?: boolean;
  collapseInto?: string;
  simhash?: string;
  uniqueTokenRatio?: number;
  clusterId?: number;

  // --- Content quality signals -----------------------------------------
  wordCount?: number;
  thinContentScore?: number;
  externalLinkRatio?: number;
  soft404Score?: number;
  soft404Signals?: string[];

  // --- Crawl-trap detection --------------------------------------------
  crawlTrapFlag?: boolean;
  crawlTrapRisk?: number;
  trapType?: string;
}
44
-
45
/** A directed, weighted link from one page URL to another. */
export interface GraphEdge {
  /** URL of the page the link originates from. */
  source: string;
  /** URL of the page the link points to. */
  target: string;
  /** Weight of the link. */
  weight: number;
}
50
-
51
/** Summary record describing one content cluster. */
export interface ClusterInfo {
  /** Numeric cluster identifier. */
  id: number;
  /** Number of pages counted for the cluster. */
  count: number;
  /** URL recorded as the cluster's primary page. */
  primaryUrl: string;
  /** Risk level assigned to the cluster. */
  risk: 'low' | 'medium' | 'high';
  /** Path prefix shared by the cluster's pages, when one was recorded. */
  sharedPathPrefix?: string;
}
58
-
59
/** Counters describing a crawl session. */
export interface CrawlStats {
  /** Pages counted as fetched. */
  pagesFetched: number;
  /** Pages counted as cached. */
  pagesCached: number;
  /** Pages counted as skipped. */
  pagesSkipped: number;
  /** Total number of pages found. */
  totalFound: number;
}
65
-
66
/**
 * In-memory directed link graph of crawled pages.
 *
 * Nodes are keyed by URL. Edges are keyed by a JSON-encoded
 * `[source, target]` pair and map to a numeric weight, so at most one edge
 * exists per ordered pair of URLs.
 */
export class Graph {
  /** All known nodes, keyed by URL. */
  nodes: Map<string, GraphNode> = new Map();
  /** Edge weights keyed by `Graph.getEdgeKey(source, target)`. */
  edges: Map<string, number> = new Map();
  limitReached: boolean = false;
  sessionStats: CrawlStats = {
    pagesFetched: 0,
    pagesCached: 0,
    pagesSkipped: 0,
    totalFound: 0
  };
  trapClusters: { pattern: string; type: string; count: number }[] = [];
  duplicateClusters: { id: string; type: 'exact' | 'near' | 'template_heavy'; size: number; representative: string; severity: 'low' | 'medium' | 'high' }[] = [];
  contentClusters: ClusterInfo[] = [];

  /**
   * Generates a unique key for an edge.
   *
   * JSON encoding keeps the key unambiguous regardless of which characters
   * the URLs contain.
   */
  static getEdgeKey(source: string, target: string): string {
    return JSON.stringify([source, target]);
  }

  /**
   * Parses an edge key produced by {@link Graph.getEdgeKey} back into its
   * source and target URLs.
   */
  static parseEdgeKey(key: string): { source: string; target: string } {
    const [source, target] = JSON.parse(key);
    return { source, target };
  }

  /**
   * Adds a node to the graph if it doesn't exist.
   *
   * If the node already exists, only its status is updated, and only when
   * the new status is non-zero (meaning the page has now been crawled).
   * Depth is set on creation only (BFS guarantees shortest path first).
   */
  addNode(url: string, depth: number, status: number = 0) {
    const existing = this.nodes.get(url);
    if (!existing) {
      this.nodes.set(url, { url, depth, status, inLinks: 0, outLinks: 0 });
      return;
    }
    // A zero status means "pending"; never overwrite a real status with it.
    if (status !== 0) {
      existing.status = status;
    }
  }

  /**
   * Shallow-merges `data` into an existing node. Unknown URLs are ignored.
   */
  updateNodeData(url: string, data: Partial<GraphNode>) {
    const existing = this.nodes.get(url);
    if (existing) {
      Object.assign(existing, data);
    }
  }

  /**
   * Adds a directed edge between two nodes.
   *
   * Both nodes must already exist in the graph; otherwise the call is a
   * no-op. The first time an edge is seen, the endpoints' `outLinks` /
   * `inLinks` counters are incremented. On a repeat, the higher of the two
   * weights is kept and the counters are left untouched.
   */
  addEdge(source: string, target: string, weight: number = 1.0) {
    const sourceNode = this.nodes.get(source);
    const targetNode = this.nodes.get(target);
    if (!sourceNode || !targetNode) {
      return;
    }

    const edgeKey = Graph.getEdgeKey(source, target);
    const currentWeight = this.edges.get(edgeKey);

    if (currentWeight === undefined) {
      this.edges.set(edgeKey, weight);
      sourceNode.outLinks++;
      targetNode.inLinks++;
    } else if (weight > currentWeight) {
      // Edge already known: keep the strongest relationship seen so far.
      this.edges.set(edgeKey, weight);
    }
  }

  /** Returns all nodes as an array (a fresh array over the same node objects). */
  getNodes(): GraphNode[] {
    return Array.from(this.nodes.values());
  }

  /** Returns all edges decoded into `{ source, target, weight }` records. */
  getEdges(): GraphEdge[] {
    return Array.from(this.edges.entries()).map(([edge, weight]) => {
      const { source, target } = Graph.parseEdgeKey(edge);
      return { source, target, weight };
    });
  }

  /**
   * Serialises nodes, edges and the duplicate/content cluster lists.
   * `limitReached`, `sessionStats` and `trapClusters` are not included.
   */
  toJSON() {
    return {
      nodes: this.getNodes(),
      edges: this.getEdges(),
      duplicateClusters: this.duplicateClusters,
      contentClusters: this.contentClusters
    };
  }

  /**
   * Rebuilds a graph from the shape produced by {@link Graph.toJSON}.
   *
   * Missing sections are skipped, and a `null`/`undefined` payload yields an
   * empty graph. Nodes are shallow-copied. An edge without a weight defaults
   * to `1.0`; an explicit weight of `0` is preserved so that a
   * `toJSON` -> `fromJSON` round-trip is lossless.
   */
  static fromJSON(json: any): Graph {
    const graph = new Graph();
    if (json?.nodes) {
      for (const node of json.nodes) {
        graph.nodes.set(node.url, { ...node });
      }
    }
    if (json?.edges) {
      for (const edge of json.edges) {
        const key = Graph.getEdgeKey(edge.source, edge.target);
        // `??` rather than `||`: only a missing weight falls back to 1.0.
        graph.edges.set(key, edge.weight ?? 1.0);
      }
    }
    if (json?.duplicateClusters) {
      graph.duplicateClusters = json.duplicateClusters;
    }
    if (json?.contentClusters) {
      graph.contentClusters = json.contentClusters;
    }
    return graph;
  }
}
@@ -1,125 +0,0 @@
1
- import { Graph, GraphNode } from './graph.js';
2
-
3
/** Aggregate structural metrics computed over a crawl graph. */
export interface Metrics {
  // --- Size -------------------------------------------------------------
  /** Number of nodes in the graph. */
  totalPages: number;
  /** Number of edges in the graph. */
  totalEdges: number;

  // --- Problem page lists (URLs) ------------------------------------------
  /** Pages with no inbound links at depth greater than 0. */
  orphanPages: string[];
  /** Pages with exactly one inbound link at depth 3 or more. */
  nearOrphans: string[];
  /** Pages at depth 4 or more. */
  deepPages: string[];

  // --- Rankings -----------------------------------------------------------
  /** Up to ten pages with the highest authority value, best first. */
  topAuthorityPages: { url: string; authority: number }[];
  /** Up to ten pages with the highest raw PageRank, best first. */
  topPageRankPages: { url: string; score: number }[];

  // --- Shape of the graph ---------------------------------------------------
  /** Edges divided by pages. */
  averageOutDegree: number;
  /** Largest node depth present. */
  maxDepthFound: number;
  /** Mean node depth. */
  averageDepth: number;
  /** One minus the share of deep pages. */
  crawlEfficiencyScore: number;
  /** Shannon entropy (base 2) of the out-degree distribution. */
  structuralEntropy: number;

  // --- Crawl session ----------------------------------------------------------
  /** Copied from the graph's `limitReached` flag. */
  limitReached: boolean;
  /** Copied from the graph's session counters. */
  sessionStats?: {
    pagesFetched: number;
    pagesCached: number;
    pagesSkipped: number;
    totalFound: number;
  };
}
24
-
25
/**
 * Computes aggregate structural metrics for a crawl graph.
 *
 * Side effect: every node that links to at least one broken page (status
 * `>= 400`, or `0` for never crawled) gets its `brokenLinks` array set to
 * the broken target URLs, in edge order.
 *
 * @param graph - Graph to analyse.
 * @param _maxDepth - Unused; kept for signature compatibility.
 * @returns The computed metrics, including the graph's `limitReached` flag
 *   and session counters.
 */
export function calculateMetrics(graph: Graph, _maxDepth: number): Metrics {
  const nodes = graph.getNodes();
  const edges = graph.getEdges();

  const totalPages = nodes.length;
  const totalEdges = edges.length;

  // Identify broken nodes.
  const brokenNodes = new Set<string>();
  for (const node of nodes) {
    if (node.status >= 400 || node.status === 0) {
      brokenNodes.add(node.url);
    }
  }

  // Group broken targets by source in a single pass over the edges, instead
  // of re-scanning the whole edge list once per node.
  const brokenTargetsBySource = new Map<string, string[]>();
  for (const edge of edges) {
    if (!brokenNodes.has(edge.target)) continue;
    const targets = brokenTargetsBySource.get(edge.source);
    if (targets) {
      targets.push(edge.target);
    } else {
      brokenTargetsBySource.set(edge.source, [edge.target]);
    }
  }

  // Populate brokenLinks per node (only when there is something to report).
  for (const node of nodes) {
    const broken = brokenTargetsBySource.get(node.url);
    if (broken) {
      node.brokenLinks = broken;
    }
  }

  // Fallback authority: log-scaled inbound link count, normalised to [0, 1].
  const maxInLinks = nodes.reduce((max, n) => Math.max(max, n.inLinks), 0);
  const getAuthority = (node: GraphNode) => {
    if (maxInLinks === 0) return 0;
    return Math.log(1 + node.inLinks) / Math.log(1 + maxInLinks);
  };

  // orphanPages: inLinks === 0 && depth > 0
  const orphanPages = nodes
    .filter(n => n.inLinks === 0 && n.depth > 0)
    .map(n => n.url);

  // nearOrphans: inLinks === 1 && depth >= 3
  const nearOrphans = nodes
    .filter(n => n.inLinks === 1 && n.depth >= 3)
    .map(n => n.url);

  // deepPages: depth >= 4
  const deepPages = nodes
    .filter(n => n.depth >= 4)
    .map(n => n.url);

  // crawlEfficiencyScore: 1 - (deepPagesCount / totalPages)
  const crawlEfficiencyScore = totalPages > 0 ? 1 - (deepPages.length / totalPages) : 1;

  // averageDepth: sum(depth) / totalPages
  const sumDepth = nodes.reduce((acc, n) => acc + n.depth, 0);
  const averageDepth = totalPages > 0 ? sumDepth / totalPages : 0;

  // structuralEntropy: Shannon entropy over the out-degree distribution.
  const outDegreeCounts = new Map<number, number>();
  for (const node of nodes) {
    outDegreeCounts.set(node.outLinks, (outDegreeCounts.get(node.outLinks) ?? 0) + 1);
  }

  let structuralEntropy = 0;
  if (totalPages > 0) {
    for (const count of outDegreeCounts.values()) {
      const p = count / totalPages;
      if (p > 0) {
        structuralEntropy -= p * Math.log2(p);
      }
    }
  }

  // topAuthorityPages: top 10, preferring a precomputed authorityScore.
  const topAuthorityPages = nodes
    .map(n => ({ url: n.url, authority: n.authorityScore ?? getAuthority(n) }))
    .sort((a, b) => b.authority - a.authority)
    .slice(0, 10);

  // topPageRankPages: top 10 by raw PageRank, skipping unranked nodes.
  const topPageRankPages = nodes
    .filter(n => n.pageRank !== undefined)
    .map(n => ({ url: n.url, score: n.pageRank! }))
    .sort((a, b) => b.score - a.score)
    .slice(0, 10);

  const averageOutDegree = totalPages > 0 ? totalEdges / totalPages : 0;
  const maxDepthFound = nodes.reduce((max, n) => Math.max(max, n.depth), 0);

  return {
    totalPages,
    totalEdges,
    orphanPages,
    nearOrphans,
    deepPages,
    topAuthorityPages,
    averageOutDegree,
    maxDepthFound,
    crawlEfficiencyScore,
    averageDepth,
    structuralEntropy,
    topPageRankPages,
    limitReached: graph.limitReached,
    sessionStats: graph.sessionStats
  };
}
@@ -1,126 +0,0 @@
1
- import { Graph, GraphNode } from './graph.js';
2
-
3
/** Tuning knobs for the PageRank computation; every field is optional. */
interface PageRankOptions {
  /** Damping factor `d` of the PageRank recurrence. */
  dampingFactor?: number;
  /** Upper bound on the number of power-iteration rounds. */
  maxIterations?: number;
  /** Iteration stops once the largest per-node change drops below this value. */
  convergenceThreshold?: number;
  /** Nodes whose `soft404Score` exceeds this value are excluded from ranking. */
  soft404WeightThreshold?: number;
}
9
-
10
/**
 * Weighted PageRank over the eligible subset of the graph.
 *
 * A node is eligible unless it is `noindex`, collapsed, a likely soft-404
 * (score above `soft404WeightThreshold`), canonicalised to another URL,
 * broken (status `>= 400`) or uncrawled (status `0`). Only edges whose two
 * endpoints are both eligible take part. Rank held by sink nodes (no
 * eligible outbound weight) is redistributed evenly on every iteration.
 *
 * Results are written onto the eligible nodes: `pageRank` receives the raw
 * value and `pageRankScore` the value min-max scaled to 0-100 (100 for all
 * nodes when they are indistinguishable). Ineligible nodes are left
 * untouched. The function returns nothing and does nothing when no node is
 * eligible.
 *
 * @param graph - Graph whose nodes are annotated in place.
 * @param options - Optional tuning; defaults are damping 0.85, 40
 *   iterations, convergence threshold 1e-5, soft-404 threshold 0.8.
 */
export function computePageRank(graph: Graph, options: PageRankOptions = {}): void {
  const d = options.dampingFactor ?? 0.85;
  const maxIterations = options.maxIterations ?? 40;
  const epsilon = options.convergenceThreshold ?? 1e-5;
  const soft404Threshold = options.soft404WeightThreshold ?? 0.8;

  const allNodes = graph.getNodes();
  const allEdges = graph.getEdges();

  // 1. Filter eligible nodes.
  const eligibleNodes = allNodes.filter(node => {
    if (node.noindex) return false;
    if (node.isCollapsed) return false;
    if (node.soft404Score && node.soft404Score > soft404Threshold) return false;
    if (node.canonical && node.canonical !== node.url) return false;
    if (node.status >= 400) return false; // Don't pass rank to broken pages
    if (node.status === 0) return false; // Don't pass rank to uncrawled/external pages
    return true;
  });

  const nodeCount = eligibleNodes.length;
  if (nodeCount === 0) return;

  const nodeUrls = eligibleNodes.map(n => n.url);
  const eligibleUrls = new Set<string>(nodeUrls);

  // Start from a uniform distribution.
  let pr = new Map<string, number>();
  for (const url of nodeUrls) pr.set(url, 1 / nodeCount);

  // Pre-calculate weighted outbound sums and the inverted adjacency list.
  const outWeights = new Map<string, number>();
  const incoming = new Map<string, { source: string; weight: number }[]>();
  for (const url of nodeUrls) outWeights.set(url, 0);

  for (const edge of allEdges) {
    if (!eligibleUrls.has(edge.source) || !eligibleUrls.has(edge.target)) continue;

    // A missing or zero weight is treated as 1.0.
    const weight = edge.weight || 1.0;

    const sources = incoming.get(edge.target);
    if (sources) {
      sources.push({ source: edge.source, weight });
    } else {
      incoming.set(edge.target, [{ source: edge.source, weight }]);
    }

    outWeights.set(edge.source, (outWeights.get(edge.source) || 0) + weight);
  }

  // Sinks: eligible nodes with no eligible outbound weight.
  const sinks = nodeUrls.filter(url => (outWeights.get(url) || 0) === 0);

  // Power iteration.
  for (let i = 0; i < maxIterations; i++) {
    const nextPr = new Map<string, number>();

    // Rank currently parked on sinks is spread evenly over all nodes.
    let sinkRankTotal = 0;
    for (const url of sinks) {
      sinkRankTotal += pr.get(url) || 0;
    }

    const baseRank = (1 - d) / nodeCount + (d * sinkRankTotal / nodeCount);

    let maxDelta = 0;
    for (const url of nodeUrls) {
      let rankFromLinks = 0;
      const sources = incoming.get(url) || [];

      for (const edge of sources) {
        const sourceRank = pr.get(edge.source) || 0;
        const sourceOutWeight = outWeights.get(edge.source) || 1.0;
        rankFromLinks += sourceRank * (edge.weight / sourceOutWeight);
      }

      const newRank = baseRank + d * rankFromLinks;
      nextPr.set(url, newRank);

      // Track the largest change for the convergence check.
      const delta = Math.abs(newRank - pr.get(url)!);
      if (delta > maxDelta) maxDelta = delta;
    }

    pr = nextPr;

    if (maxDelta < epsilon) break;
  }

  // 2. Normalization (0-100).
  // Linear scan instead of Math.min(...ranks) / Math.max(...ranks): spreading
  // a very large array into call arguments throws a RangeError.
  let minPR = Infinity;
  let maxPR = -Infinity;
  for (const rank of pr.values()) {
    if (rank < minPR) minPR = rank;
    if (rank > maxPR) maxPR = rank;
  }
  const range = maxPR - minPR;

  for (const node of eligibleNodes) {
    const rawRank = pr.get(node.url)!;
    node.pageRank = rawRank;

    if (range > 1e-12) {
      node.pageRankScore = 100 * (rawRank - minPR) / range;
    } else {
      // If there's no range, all eligible pages are equally important.
      node.pageRankScore = 100;
    }
  }
}
@@ -1,76 +0,0 @@
1
/**
 * 64-bit SimHash utilities built on a 64-bit FNV-1a token hash.
 *
 * All hashes are represented as non-negative `bigint` values below 2^64.
 */
export class SimHash {
  private static FNV_PRIME = 1099511628211n;
  private static FNV_OFFSET_BASIS = 14695981039346656037n;
  private static MAX_UINT64 = 0xffffffffffffffffn;
  /** Number of bands a hash is split into by {@link SimHash.getBands}. */
  public static readonly BANDS = 4;
  /** Width of one band in bits. */
  public static readonly BAND_WIDTH = 16;

  /**
   * Generates a 64-bit FNV-1a hash for a given string token.
   *
   * The hash is fed one UTF-16 code unit at a time (not UTF-8 bytes), so
   * results only match reference FNV-1a values for ASCII input.
   */
  static fnv1a64(token: string): bigint {
    let hash = SimHash.FNV_OFFSET_BASIS;
    for (let i = 0; i < token.length; i++) {
      hash ^= BigInt(token.charCodeAt(i));
      // Multiplication modulo 2^64.
      hash = (hash * SimHash.FNV_PRIME) & SimHash.MAX_UINT64;
    }
    return hash;
  }

  /**
   * Generates a 64-bit SimHash from an array of tokens.
   *
   * Every token votes +1 or -1 on each of the 64 bit positions according to
   * its FNV-1a hash; a result bit is set when its tally is strictly
   * positive. An empty token list therefore yields `0n`.
   */
  static generate(tokens: string[]): bigint {
    const votes = new Int32Array(64);

    for (const token of tokens) {
      let remaining = SimHash.fnv1a64(token);
      for (let bit = 0; bit < 64; bit++) {
        votes[bit] += (remaining & 1n) === 1n ? 1 : -1;
        remaining >>= 1n;
      }
    }

    let simhash = 0n;
    for (let bit = 0; bit < 64; bit++) {
      if (votes[bit] > 0) {
        simhash |= 1n << BigInt(bit);
      }
    }

    return simhash;
  }

  /**
   * Splits a 64-bit SimHash into `BANDS` chunks of `BAND_WIDTH` bits each,
   * least-significant chunk first.
   */
  static getBands(simhash: bigint): number[] {
    const width = BigInt(SimHash.BAND_WIDTH);
    const mask = (1n << width) - 1n;
    const bands: number[] = [];
    for (let i = 0; i < SimHash.BANDS; i++) {
      bands.push(Number((simhash >> (BigInt(i) * width)) & mask));
    }
    return bands;
  }

  /**
   * Computes the Hamming distance between two 64-bit hashes.
   *
   * The XOR is normalised to an unsigned 64-bit value first, so a hash that
   * arrives as a negative (signed) bigint is compared by its two's-complement
   * bit pattern instead of silently producing a distance of 0.
   */
  static hammingDistance(a: bigint, b: bigint): number {
    let xor = BigInt.asUintN(64, a ^ b);
    let distance = 0;
    while (xor > 0n) {
      // Kernighan's bit counting: each step clears the lowest set bit.
      xor &= xor - 1n;
      distance++;
    }
    return distance;
  }
}
package/src/index.ts DELETED
@@ -1,33 +0,0 @@
1
- export * from './crawler/crawl.js';
2
- export * from './crawler/normalize.js';
3
- export * from './crawler/metricsRunner.js';
4
- export * from './graph/metrics.js';
5
- export * from './report/html.js';
6
- export * from './report/crawl_template.js';
7
- export * from './report/crawlExport.js';
8
- export * from './graph/graph.js';
9
- export * from './diff/compare.js';
10
- export * from './scoring/orphanSeverity.js';
11
- export * from './graph/pagerank.js';
12
- export * from './graph/duplicate.js';
13
- export * from './graph/cluster.js';
14
- export * from './scoring/health.js';
15
- export * from './scoring/hits.js';
16
- export * from './analysis/analyze.js';
17
- export * from './analysis/content.js';
18
- export * from './analysis/seo.js';
19
- export * from './analysis/images.js';
20
- export * from './analysis/links.js';
21
- export * from './audit/index.js';
22
- export * from './audit/types.js';
23
- export * from './db/index.js';
24
- export * from './db/graphLoader.js';
25
- export * from './db/repositories/SiteRepository.js';
26
- export * from './db/repositories/SnapshotRepository.js';
27
- export * from './db/repositories/PageRepository.js';
28
- export * from './db/repositories/EdgeRepository.js';
29
- export * from './db/repositories/MetricsRepository.js';
30
- export * from './lock/lockManager.js';
31
- export * from './lock/hashKey.js';
32
- export * from './utils/version.js';
33
- export * from './events.js';
@@ -1,51 +0,0 @@
1
- import crypto from 'node:crypto';
2
- import { normalizeUrl } from '../crawler/normalize.js';
3
-
4
/**
 * Option names that change the nature of a crawl and therefore take part in
 * the lock key. The order is significant: it fixes the order in which the
 * options are serialised before hashing.
 */
const RELEVANT_FLAGS: string[] = [
  'limit', 'depth', 'output', 'sitemap',
  'incremental', 'detectSoft404', 'detectTraps', 'includeSubdomains',
  'allow', 'deny', 'proxy', 'ua',
  'maxRedirects', 'rate', 'maxBytes', 'concurrency',
];
23
-
24
/**
 * Derives a SHA-256 hex digest that identifies a command invocation by its
 * command name, normalised target URL and the crawl-relevant options.
 *
 * @param commandName - Name of the command being run.
 * @param targetUrl - URL the command operates on; used verbatim when
 *   normalisation yields a falsy result.
 * @param options - Option bag; only keys listed in `RELEVANT_FLAGS` whose
 *   value is not `undefined` contribute to the key.
 * @returns Hex-encoded SHA-256 digest of the serialised composite key.
 */
export function generateLockKey(commandName: string, targetUrl: string, options: any): string {
  // The query string is stripped unless `options.query` is set, consistent
  // with the crawl logic.
  const target = normalizeUrl(targetUrl, '', { stripQuery: !options.query }) || targetUrl;

  // Walking RELEVANT_FLAGS (not the caller's object) fixes the key order, so
  // JSON.stringify yields a stable string in V8/Node.js.
  const lockOptions: Record<string, any> = Object.fromEntries(
    RELEVANT_FLAGS
      .filter(flag => options[flag] !== undefined)
      .map(flag => [flag, options[flag]])
  );

  const serialized = JSON.stringify({
    command: commandName,
    target,
    options: lockOptions
  });

  return crypto.createHash('sha256').update(serialized).digest('hex');
}