@crawlith/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. package/CHANGELOG.md +7 -0
  2. package/dist/analysis/analyze.d.ts +70 -0
  3. package/dist/analysis/analyze.js +436 -0
  4. package/dist/analysis/content.d.ts +12 -0
  5. package/dist/analysis/content.js +33 -0
  6. package/dist/analysis/images.d.ts +6 -0
  7. package/dist/analysis/images.js +18 -0
  8. package/dist/analysis/links.d.ts +7 -0
  9. package/dist/analysis/links.js +30 -0
  10. package/dist/analysis/scoring.d.ts +9 -0
  11. package/dist/analysis/scoring.js +42 -0
  12. package/dist/analysis/seo.d.ts +15 -0
  13. package/dist/analysis/seo.js +64 -0
  14. package/dist/analysis/structuredData.d.ts +6 -0
  15. package/dist/analysis/structuredData.js +51 -0
  16. package/dist/audit/dns.d.ts +2 -0
  17. package/dist/audit/dns.js +42 -0
  18. package/dist/audit/headers.d.ts +2 -0
  19. package/dist/audit/headers.js +95 -0
  20. package/dist/audit/index.d.ts +2 -0
  21. package/dist/audit/index.js +50 -0
  22. package/dist/audit/scoring.d.ts +14 -0
  23. package/dist/audit/scoring.js +214 -0
  24. package/dist/audit/transport.d.ts +6 -0
  25. package/dist/audit/transport.js +207 -0
  26. package/dist/audit/types.d.ts +88 -0
  27. package/dist/audit/types.js +1 -0
  28. package/dist/core/network/proxyAdapter.d.ts +6 -0
  29. package/dist/core/network/proxyAdapter.js +19 -0
  30. package/dist/core/network/rateLimiter.d.ts +6 -0
  31. package/dist/core/network/rateLimiter.js +31 -0
  32. package/dist/core/network/redirectController.d.ts +13 -0
  33. package/dist/core/network/redirectController.js +41 -0
  34. package/dist/core/network/responseLimiter.d.ts +4 -0
  35. package/dist/core/network/responseLimiter.js +26 -0
  36. package/dist/core/network/retryPolicy.d.ts +10 -0
  37. package/dist/core/network/retryPolicy.js +41 -0
  38. package/dist/core/scope/domainFilter.d.ts +11 -0
  39. package/dist/core/scope/domainFilter.js +40 -0
  40. package/dist/core/scope/scopeManager.d.ts +14 -0
  41. package/dist/core/scope/scopeManager.js +39 -0
  42. package/dist/core/scope/subdomainPolicy.d.ts +6 -0
  43. package/dist/core/scope/subdomainPolicy.js +35 -0
  44. package/dist/core/security/ipGuard.d.ts +11 -0
  45. package/dist/core/security/ipGuard.js +84 -0
  46. package/dist/crawler/crawl.d.ts +22 -0
  47. package/dist/crawler/crawl.js +336 -0
  48. package/dist/crawler/extract.d.ts +5 -0
  49. package/dist/crawler/extract.js +33 -0
  50. package/dist/crawler/fetcher.d.ts +40 -0
  51. package/dist/crawler/fetcher.js +161 -0
  52. package/dist/crawler/metricsRunner.d.ts +1 -0
  53. package/dist/crawler/metricsRunner.js +108 -0
  54. package/dist/crawler/normalize.d.ts +7 -0
  55. package/dist/crawler/normalize.js +88 -0
  56. package/dist/crawler/parser.d.ts +22 -0
  57. package/dist/crawler/parser.js +158 -0
  58. package/dist/crawler/sitemap.d.ts +8 -0
  59. package/dist/crawler/sitemap.js +70 -0
  60. package/dist/crawler/trap.d.ts +24 -0
  61. package/dist/crawler/trap.js +78 -0
  62. package/dist/db/graphLoader.d.ts +2 -0
  63. package/dist/db/graphLoader.js +96 -0
  64. package/dist/db/index.d.ts +4 -0
  65. package/dist/db/index.js +61 -0
  66. package/dist/db/repositories/EdgeRepository.d.ts +16 -0
  67. package/dist/db/repositories/EdgeRepository.js +17 -0
  68. package/dist/db/repositories/MetricsRepository.d.ts +26 -0
  69. package/dist/db/repositories/MetricsRepository.js +27 -0
  70. package/dist/db/repositories/PageRepository.d.ts +47 -0
  71. package/dist/db/repositories/PageRepository.js +93 -0
  72. package/dist/db/repositories/SiteRepository.d.ts +15 -0
  73. package/dist/db/repositories/SiteRepository.js +22 -0
  74. package/dist/db/repositories/SnapshotRepository.d.ts +22 -0
  75. package/dist/db/repositories/SnapshotRepository.js +55 -0
  76. package/dist/db/schema.d.ts +2 -0
  77. package/dist/db/schema.js +169 -0
  78. package/dist/diff/compare.d.ts +26 -0
  79. package/dist/diff/compare.js +64 -0
  80. package/dist/graph/cluster.d.ts +6 -0
  81. package/dist/graph/cluster.js +173 -0
  82. package/dist/graph/duplicate.d.ts +10 -0
  83. package/dist/graph/duplicate.js +251 -0
  84. package/dist/graph/graph.d.ts +103 -0
  85. package/dist/graph/graph.js +106 -0
  86. package/dist/graph/metrics.d.ts +29 -0
  87. package/dist/graph/metrics.js +74 -0
  88. package/dist/graph/pagerank.d.ts +12 -0
  89. package/dist/graph/pagerank.js +102 -0
  90. package/dist/graph/simhash.d.ts +17 -0
  91. package/dist/graph/simhash.js +56 -0
  92. package/dist/index.d.ts +30 -0
  93. package/dist/index.js +30 -0
  94. package/dist/lock/hashKey.d.ts +1 -0
  95. package/dist/lock/hashKey.js +44 -0
  96. package/dist/lock/lockManager.d.ts +7 -0
  97. package/dist/lock/lockManager.js +112 -0
  98. package/dist/lock/pidCheck.d.ts +1 -0
  99. package/dist/lock/pidCheck.js +14 -0
  100. package/dist/report/html.d.ts +2 -0
  101. package/dist/report/html.js +223 -0
  102. package/dist/report/sitegraphExport.d.ts +3 -0
  103. package/dist/report/sitegraphExport.js +52 -0
  104. package/dist/report/sitegraph_template.d.ts +1 -0
  105. package/dist/report/sitegraph_template.js +630 -0
  106. package/dist/scoring/hits.d.ts +9 -0
  107. package/dist/scoring/hits.js +111 -0
  108. package/dist/scoring/orphanSeverity.d.ts +39 -0
  109. package/dist/scoring/orphanSeverity.js +125 -0
  110. package/dist/utils/version.d.ts +2 -0
  111. package/dist/utils/version.js +15 -0
  112. package/package.json +33 -0
  113. package/src/analysis/analyze.ts +548 -0
  114. package/src/analysis/content.ts +62 -0
  115. package/src/analysis/images.ts +28 -0
  116. package/src/analysis/links.ts +41 -0
  117. package/src/analysis/scoring.ts +59 -0
  118. package/src/analysis/seo.ts +82 -0
  119. package/src/analysis/structuredData.ts +62 -0
  120. package/src/audit/dns.ts +49 -0
  121. package/src/audit/headers.ts +98 -0
  122. package/src/audit/index.ts +66 -0
  123. package/src/audit/scoring.ts +232 -0
  124. package/src/audit/transport.ts +258 -0
  125. package/src/audit/types.ts +102 -0
  126. package/src/core/network/proxyAdapter.ts +21 -0
  127. package/src/core/network/rateLimiter.ts +39 -0
  128. package/src/core/network/redirectController.ts +47 -0
  129. package/src/core/network/responseLimiter.ts +34 -0
  130. package/src/core/network/retryPolicy.ts +57 -0
  131. package/src/core/scope/domainFilter.ts +45 -0
  132. package/src/core/scope/scopeManager.ts +52 -0
  133. package/src/core/scope/subdomainPolicy.ts +39 -0
  134. package/src/core/security/ipGuard.ts +92 -0
  135. package/src/crawler/crawl.ts +382 -0
  136. package/src/crawler/extract.ts +34 -0
  137. package/src/crawler/fetcher.ts +233 -0
  138. package/src/crawler/metricsRunner.ts +124 -0
  139. package/src/crawler/normalize.ts +108 -0
  140. package/src/crawler/parser.ts +190 -0
  141. package/src/crawler/sitemap.ts +73 -0
  142. package/src/crawler/trap.ts +96 -0
  143. package/src/db/graphLoader.ts +105 -0
  144. package/src/db/index.ts +70 -0
  145. package/src/db/repositories/EdgeRepository.ts +29 -0
  146. package/src/db/repositories/MetricsRepository.ts +49 -0
  147. package/src/db/repositories/PageRepository.ts +128 -0
  148. package/src/db/repositories/SiteRepository.ts +32 -0
  149. package/src/db/repositories/SnapshotRepository.ts +74 -0
  150. package/src/db/schema.ts +177 -0
  151. package/src/diff/compare.ts +84 -0
  152. package/src/graph/cluster.ts +192 -0
  153. package/src/graph/duplicate.ts +286 -0
  154. package/src/graph/graph.ts +172 -0
  155. package/src/graph/metrics.ts +110 -0
  156. package/src/graph/pagerank.ts +125 -0
  157. package/src/graph/simhash.ts +61 -0
  158. package/src/index.ts +30 -0
  159. package/src/lock/hashKey.ts +51 -0
  160. package/src/lock/lockManager.ts +124 -0
  161. package/src/lock/pidCheck.ts +13 -0
  162. package/src/report/html.ts +227 -0
  163. package/src/report/sitegraphExport.ts +58 -0
  164. package/src/report/sitegraph_template.ts +630 -0
  165. package/src/scoring/hits.ts +131 -0
  166. package/src/scoring/orphanSeverity.ts +176 -0
  167. package/src/utils/version.ts +18 -0
  168. package/tests/__snapshots__/orphanSeverity.test.ts.snap +49 -0
  169. package/tests/analysis.unit.test.ts +98 -0
  170. package/tests/analyze.integration.test.ts +98 -0
  171. package/tests/audit/dns.test.ts +31 -0
  172. package/tests/audit/headers.test.ts +45 -0
  173. package/tests/audit/scoring.test.ts +133 -0
  174. package/tests/audit/security.test.ts +12 -0
  175. package/tests/audit/transport.test.ts +112 -0
  176. package/tests/clustering.test.ts +118 -0
  177. package/tests/crawler.test.ts +358 -0
  178. package/tests/db.test.ts +159 -0
  179. package/tests/diff.test.ts +67 -0
  180. package/tests/duplicate.test.ts +110 -0
  181. package/tests/fetcher.test.ts +106 -0
  182. package/tests/fetcher_safety.test.ts +85 -0
  183. package/tests/fixtures/analyze-crawl.json +26 -0
  184. package/tests/hits.test.ts +134 -0
  185. package/tests/html_report.test.ts +58 -0
  186. package/tests/lock/lockManager.test.ts +138 -0
  187. package/tests/metrics.test.ts +196 -0
  188. package/tests/normalize.test.ts +101 -0
  189. package/tests/orphanSeverity.test.ts +160 -0
  190. package/tests/pagerank.test.ts +98 -0
  191. package/tests/parser.test.ts +117 -0
  192. package/tests/proxy_safety.test.ts +57 -0
  193. package/tests/redirect_safety.test.ts +73 -0
  194. package/tests/safety.test.ts +114 -0
  195. package/tests/scope.test.ts +66 -0
  196. package/tests/scoring.test.ts +59 -0
  197. package/tests/sitemap.test.ts +88 -0
  198. package/tests/soft404.test.ts +41 -0
  199. package/tests/trap.test.ts +39 -0
  200. package/tests/visualization_data.test.ts +46 -0
  201. package/tsconfig.json +11 -0
@@ -0,0 +1,173 @@
1
+ import { SimHash } from './simhash.js';
2
+ /**
3
+ * Detects content clusters using 64-bit SimHash and Hamming Distance.
4
+ * Uses band optimization to reduce O(n^2) comparisons.
5
+ */
6
+ export function detectContentClusters(graph, threshold = 10, minSize = 3) {
7
+ const nodes = graph.getNodes().filter(n => n.simhash && n.status === 200);
8
+ if (nodes.length === 0)
9
+ return [];
10
+ const adjacency = new Map();
11
+ // Banding Optimization (4 bands of 16 bits)
12
+ // Note: For threshold > 3, this is a heuristic and may miss some pairs,
13
+ // but it dramatically reduces the search space as requested.
14
+ const bands = 4;
15
+ const bandWidth = 16;
16
+ const buckets = Array.from({ length: bands }, () => new Map());
17
+ for (const node of nodes) {
18
+ const hash = BigInt(node.simhash);
19
+ for (let b = 0; b < bands; b++) {
20
+ const bandValue = Number((hash >> BigInt(b * bandWidth)) & 0xffffn);
21
+ if (!buckets[b].has(bandValue)) {
22
+ buckets[b].set(bandValue, new Set());
23
+ }
24
+ buckets[b].get(bandValue).add(node.url);
25
+ }
26
+ }
27
+ const checkedPairs = new Set();
28
+ for (let b = 0; b < bands; b++) {
29
+ for (const bucket of buckets[b].values()) {
30
+ if (bucket.size < 2)
31
+ continue;
32
+ const bucketNodes = Array.from(bucket);
33
+ for (let i = 0; i < bucketNodes.length; i++) {
34
+ for (let j = i + 1; j < bucketNodes.length; j++) {
35
+ const u1 = bucketNodes[i];
36
+ const u2 = bucketNodes[j];
37
+ if (u1 === u2)
38
+ continue;
39
+ const pairKey = u1 < u2 ? `${u1}|${u2}` : `${u2}|${u1}`;
40
+ if (checkedPairs.has(pairKey))
41
+ continue;
42
+ checkedPairs.add(pairKey);
43
+ const n1 = graph.nodes.get(u1);
44
+ const n2 = graph.nodes.get(u2);
45
+ const dist = SimHash.hammingDistance(BigInt(n1.simhash), BigInt(n2.simhash));
46
+ if (dist <= threshold) {
47
+ if (!adjacency.has(u1))
48
+ adjacency.set(u1, new Set());
49
+ if (!adjacency.has(u2))
50
+ adjacency.set(u2, new Set());
51
+ adjacency.get(u1).add(u2);
52
+ adjacency.get(u2).add(u1);
53
+ }
54
+ }
55
+ }
56
+ }
57
+ }
58
+ // Find connected components (Clusters)
59
+ const visited = new Set();
60
+ const clusters = [];
61
+ for (const node of nodes) {
62
+ if (visited.has(node.url))
63
+ continue;
64
+ const component = [];
65
+ const queue = [node.url];
66
+ visited.add(node.url);
67
+ while (queue.length > 0) {
68
+ const current = queue.shift();
69
+ component.push(current);
70
+ const neighbors = adjacency.get(current);
71
+ if (neighbors) {
72
+ for (const neighbor of neighbors) {
73
+ if (!visited.has(neighbor)) {
74
+ visited.add(neighbor);
75
+ queue.push(neighbor);
76
+ }
77
+ }
78
+ }
79
+ }
80
+ if (component.length >= minSize) {
81
+ clusters.push(component);
82
+ }
83
+ }
84
+ // Sort clusters by size (descending) then by primary URL (ascending) for deterministic IDs
85
+ clusters.sort((a, b) => {
86
+ if (b.length !== a.length)
87
+ return b.length - a.length;
88
+ const aPrimary = selectPrimaryUrl(a, graph);
89
+ const bPrimary = selectPrimaryUrl(b, graph);
90
+ return aPrimary.localeCompare(bPrimary);
91
+ });
92
+ const clusterInfos = [];
93
+ clusters.forEach((memberUrls, index) => {
94
+ const clusterId = index + 1;
95
+ const clusterNodes = memberUrls.map(url => graph.nodes.get(url));
96
+ for (const node of clusterNodes) {
97
+ node.clusterId = clusterId;
98
+ }
99
+ const primaryUrl = selectPrimaryUrl(memberUrls, graph);
100
+ const risk = calculateClusterRisk(clusterNodes);
101
+ const sharedPathPrefix = findSharedPathPrefix(memberUrls);
102
+ clusterInfos.push({
103
+ id: clusterId,
104
+ count: memberUrls.length,
105
+ primaryUrl,
106
+ risk,
107
+ sharedPathPrefix
108
+ });
109
+ });
110
+ graph.contentClusters = clusterInfos;
111
+ return clusterInfos;
112
+ }
113
/**
 * Selects the primary URL for a cluster based on:
 * 1. Highest PageRank
 * 2. Shortest URL
 * 3. Lexicographic fallback
 */
function selectPrimaryUrl(urls, graph) {
    return urls.reduce((best, current) => {
        const nBest = graph.nodes.get(best);
        const nCurrent = graph.nodes.get(current);
        if ((nCurrent.pageRank || 0) > (nBest.pageRank || 0))
            return current;
        if ((nCurrent.pageRank || 0) < (nBest.pageRank || 0))
            return best;
        if (current.length < best.length)
            return current;
        if (current.length > best.length)
            return best;
        return current.localeCompare(best) < 0 ? current : best;
    });
}
134
+ /**
135
+ * Calculates cannibalization risk based on title and H1 similarity within the cluster.
136
+ */
137
+ function calculateClusterRisk(nodes) {
138
+ // Logic: Check if there's significant overlap in Titles or H1s among cluster members.
139
+ // This is a heuristic as requested.
140
+ // Simplified heuristic: risk is based on cluster density and size
141
+ // Large clusters of highly similar content are high risk.
142
+ // Fallback to a safe categorization
143
+ if (nodes.length > 5)
144
+ return 'high';
145
+ if (nodes.length > 2)
146
+ return 'medium';
147
+ return 'low';
148
+ }
149
+ /**
150
+ * Finds the common path prefix among a set of URLs.
151
+ */
152
+ function findSharedPathPrefix(urls) {
153
+ if (urls.length < 2)
154
+ return undefined;
155
+ try {
156
+ const paths = urls.map(u => new URL(u).pathname.split('/').filter(Boolean));
157
+ const first = paths[0];
158
+ const common = [];
159
+ for (let i = 0; i < first.length; i++) {
160
+ const segment = first[i];
161
+ if (paths.every(p => p[i] === segment)) {
162
+ common.push(segment);
163
+ }
164
+ else {
165
+ break;
166
+ }
167
+ }
168
+ return common.length > 0 ? '/' + common.join('/') : undefined;
169
+ }
170
+ catch {
171
+ return undefined;
172
+ }
173
+ }
@@ -0,0 +1,10 @@
1
+ import { Graph } from './graph.js';
2
+ export interface DuplicateOptions {
3
+ collapse?: boolean;
4
+ simhashThreshold?: number;
5
+ }
6
+ /**
7
+ * Detects exact and near duplicates, identifies canonical conflicts,
8
+ * and performs non-destructive collapse of edges.
9
+ */
10
+ export declare function detectDuplicates(graph: Graph, options?: DuplicateOptions): void;
@@ -0,0 +1,251 @@
1
+ import { SimHash } from './simhash.js';
2
+ /**
3
+ * Detects exact and near duplicates, identifies canonical conflicts,
4
+ * and performs non-destructive collapse of edges.
5
+ */
6
+ export function detectDuplicates(graph, options = {}) {
7
+ const collapse = options.collapse !== false; // Default to true
8
+ const threshold = options.simhashThreshold ?? 3;
9
+ const exactClusters = [];
10
+ const nearClusters = [];
11
+ const nodes = graph.getNodes();
12
+ // Phase 1 & 2: Exact Duplicate Detection
13
+ const exactMap = new Map();
14
+ for (const node of nodes) {
15
+ if (!node.contentHash || node.status !== 200)
16
+ continue;
17
+ // Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
18
+ let arr = exactMap.get(node.contentHash);
19
+ if (!arr) {
20
+ arr = [];
21
+ exactMap.set(node.contentHash, arr);
22
+ }
23
+ arr.push(node);
24
+ }
25
+ // Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks
26
+ const nearCandidates = [];
27
+ let clusterCounter = 1;
28
+ for (const [_hash, group] of exactMap.entries()) {
29
+ if (group.length > 1) {
30
+ const id = `cluster_exact_${clusterCounter++}`;
31
+ exactClusters.push({ id, type: 'exact', nodes: group });
32
+ // Mark nodes
33
+ for (const n of group) {
34
+ n.duplicateClusterId = id;
35
+ n.duplicateType = 'exact';
36
+ }
37
+ }
38
+ else {
39
+ nearCandidates.push(group[0]);
40
+ }
41
+ }
42
+ // Phase 3: Near Duplicate Detection (SimHash with Bands)
43
+ // 64-bit simhash -> split into 4 bands of 16 bits.
44
+ const bandsMaps = [
45
+ new Map(),
46
+ new Map(),
47
+ new Map(),
48
+ new Map()
49
+ ];
50
+ for (const node of nearCandidates) {
51
+ if (!node.simhash)
52
+ continue;
53
+ const simhash = BigInt(node.simhash);
54
+ // Extract 16 bit bands
55
+ const b0 = Number(simhash & 0xffffn);
56
+ const b1 = Number((simhash >> 16n) & 0xffffn);
57
+ const b2 = Number((simhash >> 32n) & 0xffffn);
58
+ const b3 = Number((simhash >> 48n) & 0xffffn);
59
+ const bands = [b0, b1, b2, b3];
60
+ for (let i = 0; i < 4; i++) {
61
+ let arr = bandsMaps[i].get(bands[i]);
62
+ if (!arr) {
63
+ arr = [];
64
+ bandsMaps[i].set(bands[i], arr);
65
+ }
66
+ arr.push(node);
67
+ }
68
+ }
69
+ // Find candidate pairs
70
+ const nearGroupMap = new Map(); // node.url -> cluster set
71
+ const checkedPairs = new Set();
72
+ for (let i = 0; i < 4; i++) {
73
+ for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) {
74
+ if (bucketNodes.length < 2)
75
+ continue; // nothing to compare
76
+ // Compare all nodes in this bucket
77
+ for (let j = 0; j < bucketNodes.length; j++) {
78
+ for (let k = j + 1; k < bucketNodes.length; k++) {
79
+ const n1 = bucketNodes[j];
80
+ const n2 = bucketNodes[k];
81
+ // Ensure n1 < n2 lexicographically to avoid duplicate pairs
82
+ const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1];
83
+ const pairKey = `${a.url}|${b.url}`;
84
+ if (checkedPairs.has(pairKey))
85
+ continue;
86
+ checkedPairs.add(pairKey);
87
+ const dist = SimHash.hammingDistance(BigInt(a.simhash), BigInt(b.simhash));
88
+ if (dist <= threshold) {
89
+ // They are near duplicates.
90
+ // Find or create their cluster set using union-find or reference propagation
91
+ const setA = nearGroupMap.get(a.url);
92
+ const setB = nearGroupMap.get(b.url);
93
+ if (!setA && !setB) {
94
+ const newSet = new Set([a, b]);
95
+ nearGroupMap.set(a.url, newSet);
96
+ nearGroupMap.set(b.url, newSet);
97
+ }
98
+ else if (setA && !setB) {
99
+ setA.add(b);
100
+ nearGroupMap.set(b.url, setA);
101
+ }
102
+ else if (setB && !setA) {
103
+ setB.add(a);
104
+ nearGroupMap.set(a.url, setB);
105
+ }
106
+ else if (setA && setB && setA !== setB) {
107
+ // Merge sets
108
+ for (const node of setB) {
109
+ setA.add(node);
110
+ nearGroupMap.set(node.url, setA);
111
+ }
112
+ }
113
+ }
114
+ }
115
+ }
116
+ }
117
+ }
118
+ // Compile near duplicate clusters (deduplicated by Set reference)
119
+ const uniqueNearSets = new Set();
120
+ for (const group of nearGroupMap.values()) {
121
+ uniqueNearSets.add(group);
122
+ }
123
+ for (const groupSet of uniqueNearSets) {
124
+ if (groupSet.size > 1) {
125
+ const id = `cluster_near_${clusterCounter++}`;
126
+ const groupArr = Array.from(groupSet);
127
+ nearClusters.push({ id, type: 'near', nodes: groupArr });
128
+ for (const n of groupArr) {
129
+ n.duplicateClusterId = id;
130
+ n.duplicateType = 'near';
131
+ }
132
+ }
133
+ }
134
+ const allClusters = [...exactClusters, ...nearClusters];
135
+ // Phase 4: Template-Heavy Detection
136
+ // Mark classes as 'template_heavy' if ratio < 0.3
137
+ for (const cluster of allClusters) {
138
+ const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
139
+ if (avgRatio < 0.3) {
140
+ cluster.type = 'template_heavy';
141
+ cluster.nodes.forEach(n => n.duplicateType = 'template_heavy');
142
+ }
143
+ }
144
+ // Phase 5: Canonical Conflict & Representative Selection
145
+ for (const cluster of allClusters) {
146
+ const canonicals = new Set();
147
+ let hasMissing = false;
148
+ for (const n of cluster.nodes) {
149
+ if (!n.canonical)
150
+ hasMissing = true;
151
+ // We compare full absolute canonical URLs (assuming they are normalized during crawl)
152
+ else
153
+ canonicals.add(n.canonical);
154
+ }
155
+ if (hasMissing || canonicals.size > 1) {
156
+ cluster.severity = 'high';
157
+ }
158
+ else if (cluster.type === 'near') {
159
+ cluster.severity = 'medium';
160
+ }
161
+ else {
162
+ cluster.severity = 'low';
163
+ }
164
+ // Phase 6: Select Representative
165
+ // 1. Valid Canonical target in cluster
166
+ // 2. Highest internal in-degree
167
+ // 3. Shortest URL
168
+ // 4. First discovered (relying on array order, which is from BFS map roughly)
169
+ let representativeNode = cluster.nodes[0];
170
+ // Evaluate best rep
171
+ const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
172
+ const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);
173
+ if (validCanonicals.length > 0) {
174
+ representativeNode = validCanonicals[0]; // If multiple, just pick first matching self
175
+ }
176
+ else {
177
+ representativeNode = cluster.nodes.reduce((best, current) => {
178
+ if (current.inLinks > best.inLinks)
179
+ return current;
180
+ if (current.inLinks < best.inLinks)
181
+ return best;
182
+ if (current.url.length < best.url.length)
183
+ return current;
184
+ return best;
185
+ });
186
+ }
187
+ cluster.representative = representativeNode.url;
188
+ cluster.nodes.forEach(n => {
189
+ n.isClusterPrimary = n.url === representativeNode.url;
190
+ n.isCollapsed = false; // default for JSON
191
+ n.collapseInto = undefined;
192
+ });
193
+ // Push to Graph's final cluster list
194
+ graph.duplicateClusters.push({
195
+ id: cluster.id,
196
+ type: cluster.type,
197
+ size: cluster.nodes.length,
198
+ representative: representativeNode.url,
199
+ severity: cluster.severity
200
+ });
201
+ // Controlled Collapse
202
+ if (collapse) {
203
+ for (const n of cluster.nodes) {
204
+ if (n.url !== representativeNode.url) {
205
+ n.isCollapsed = true;
206
+ n.collapseInto = representativeNode.url;
207
+ }
208
+ }
209
+ }
210
+ }
211
+ // Final Edge Transfer if Collapsing
212
+ if (collapse) {
213
+ const edges = graph.getEdges();
214
+ const updatedEdges = new Map();
215
+ for (const edge of edges) {
216
+ const sourceNode = graph.nodes.get(edge.source);
217
+ const targetNode = graph.nodes.get(edge.target);
218
+ if (!sourceNode || !targetNode)
219
+ continue;
220
+ // We do NOT modify source structure for out-bound edges of collapsed nodes?
221
+ // Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
222
+ // Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
223
+ // If a collapsed node links to X, we ignore it (PageRank will filter it out).
224
+ const actualSource = edge.source;
225
+ // repoint target
226
+ const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
227
+ // Skip self-referential edges caused by repointing
228
+ if (actualSource === actualTarget)
229
+ continue;
230
+ const edgeKey = `${actualSource}|${actualTarget}`;
231
+ const existingWeight = updatedEdges.get(edgeKey) || 0;
232
+ updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate
233
+ }
234
+ // Update graph edges in-place
235
+ graph.edges = updatedEdges;
236
+ // Re-calculate inLinks and outLinks based on collapsed edges
237
+ for (const node of graph.getNodes()) {
238
+ node.inLinks = 0;
239
+ node.outLinks = 0;
240
+ }
241
+ for (const [edgeKey, _weight] of updatedEdges.entries()) {
242
+ const [src, tgt] = edgeKey.split('|');
243
+ const sn = graph.nodes.get(src);
244
+ const tn = graph.nodes.get(tgt);
245
+ if (sn)
246
+ sn.outLinks++;
247
+ if (tn)
248
+ tn.inLinks++;
249
+ }
250
+ }
251
+ }
@@ -0,0 +1,103 @@
1
+ export interface GraphNode {
2
+ url: string;
3
+ depth: number;
4
+ inLinks: number;
5
+ outLinks: number;
6
+ status: number;
7
+ canonical?: string;
8
+ noindex?: boolean;
9
+ nofollow?: boolean;
10
+ brokenLinks?: string[];
11
+ redirectChain?: string[];
12
+ incrementalStatus?: 'new' | 'changed' | 'unchanged' | 'deleted';
13
+ etag?: string;
14
+ lastModified?: string;
15
+ contentHash?: string;
16
+ html?: string;
17
+ pageRank?: number;
18
+ pageRankScore?: number;
19
+ authorityScore?: number;
20
+ hubScore?: number;
21
+ duplicateClusterId?: string;
22
+ duplicateType?: 'exact' | 'near' | 'template_heavy' | 'none';
23
+ isClusterPrimary?: boolean;
24
+ isCollapsed?: boolean;
25
+ collapseInto?: string;
26
+ simhash?: string;
27
+ uniqueTokenRatio?: number;
28
+ soft404Score?: number;
29
+ soft404Signals?: string[];
30
+ crawlTrapFlag?: boolean;
31
+ crawlTrapRisk?: number;
32
+ trapType?: string;
33
+ securityError?: string;
34
+ retries?: number;
35
+ clusterId?: number;
36
+ bytesReceived?: number;
37
+ linkRole?: 'hub' | 'authority' | 'power' | 'balanced' | 'peripheral';
38
+ }
39
+ export interface GraphEdge {
40
+ source: string;
41
+ target: string;
42
+ weight: number;
43
+ }
44
+ export interface ClusterInfo {
45
+ id: number;
46
+ count: number;
47
+ primaryUrl: string;
48
+ risk: 'low' | 'medium' | 'high';
49
+ sharedPathPrefix?: string;
50
+ }
51
+ export interface CrawlStats {
52
+ pagesFetched: number;
53
+ pagesCached: number;
54
+ pagesSkipped: number;
55
+ totalFound: number;
56
+ }
57
+ export declare class Graph {
58
+ nodes: Map<string, GraphNode>;
59
+ edges: Map<string, number>;
60
+ limitReached: boolean;
61
+ sessionStats: CrawlStats;
62
+ trapClusters: {
63
+ pattern: string;
64
+ type: string;
65
+ count: number;
66
+ }[];
67
+ duplicateClusters: {
68
+ id: string;
69
+ type: 'exact' | 'near' | 'template_heavy';
70
+ size: number;
71
+ representative: string;
72
+ severity: 'low' | 'medium' | 'high';
73
+ }[];
74
+ contentClusters: ClusterInfo[];
75
+ /**
76
+ * Adds a node to the graph if it doesn't exist.
77
+ * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
78
+ * Depth is only set on creation (BFS guarantees shortest path first).
79
+ */
80
+ addNode(url: string, depth: number, status?: number): void;
81
+ updateNodeData(url: string, data: Partial<GraphNode>): void;
82
+ /**
83
+ * Adds a directed edge between two nodes.
84
+ * Both nodes must exist in the graph.
85
+ * Updates inLinks and outLinks counts.
86
+ */
87
+ addEdge(source: string, target: string, weight?: number): void;
88
+ getNodes(): GraphNode[];
89
+ getEdges(): GraphEdge[];
90
+ toJSON(): {
91
+ nodes: GraphNode[];
92
+ edges: GraphEdge[];
93
+ duplicateClusters: {
94
+ id: string;
95
+ type: "exact" | "near" | "template_heavy";
96
+ size: number;
97
+ representative: string;
98
+ severity: "low" | "medium" | "high";
99
+ }[];
100
+ contentClusters: ClusterInfo[];
101
+ };
102
+ static fromJSON(json: any): Graph;
103
+ }
@@ -0,0 +1,106 @@
1
+ export class Graph {
2
+ nodes = new Map();
3
+ // Using string "source|target" to ensure uniqueness efficiently. Mapping to weight.
4
+ edges = new Map();
5
+ limitReached = false;
6
+ sessionStats = {
7
+ pagesFetched: 0,
8
+ pagesCached: 0,
9
+ pagesSkipped: 0,
10
+ totalFound: 0
11
+ };
12
+ trapClusters = [];
13
+ duplicateClusters = [];
14
+ contentClusters = [];
15
+ /**
16
+ * Adds a node to the graph if it doesn't exist.
17
+ * If it exists, updates the status if the new status is non-zero (meaning we crawled it).
18
+ * Depth is only set on creation (BFS guarantees shortest path first).
19
+ */
20
+ addNode(url, depth, status = 0) {
21
+ const existing = this.nodes.get(url);
22
+ if (!existing) {
23
+ this.nodes.set(url, {
24
+ url,
25
+ depth,
26
+ status,
27
+ inLinks: 0,
28
+ outLinks: 0
29
+ });
30
+ }
31
+ else {
32
+ // Update status if we have a real one now (e.g. was 0/pending, now crawled)
33
+ if (status !== 0) {
34
+ existing.status = status;
35
+ }
36
+ }
37
+ }
38
+ updateNodeData(url, data) {
39
+ const existing = this.nodes.get(url);
40
+ if (existing) {
41
+ Object.assign(existing, data);
42
+ }
43
+ }
44
+ /**
45
+ * Adds a directed edge between two nodes.
46
+ * Both nodes must exist in the graph.
47
+ * Updates inLinks and outLinks counts.
48
+ */
49
+ addEdge(source, target, weight = 1.0) {
50
+ const sourceNode = this.nodes.get(source);
51
+ const targetNode = this.nodes.get(target);
52
+ if (sourceNode && targetNode) {
53
+ const edgeKey = `${source}|${target}`;
54
+ if (!this.edges.has(edgeKey)) {
55
+ this.edges.set(edgeKey, weight);
56
+ sourceNode.outLinks++;
57
+ targetNode.inLinks++;
58
+ }
59
+ else {
60
+ // If edge exists, keep highest weight (or could sum, but usually we just want the 'best' relationship)
61
+ const currentWeight = this.edges.get(edgeKey) || 0;
62
+ if (weight > currentWeight) {
63
+ this.edges.set(edgeKey, weight);
64
+ }
65
+ }
66
+ }
67
+ }
68
+ getNodes() {
69
+ return Array.from(this.nodes.values());
70
+ }
71
+ getEdges() {
72
+ return Array.from(this.edges.entries()).map(([edge, weight]) => {
73
+ const [source, target] = edge.split('|');
74
+ return { source, target, weight };
75
+ });
76
+ }
77
+ toJSON() {
78
+ return {
79
+ nodes: this.getNodes(),
80
+ edges: this.getEdges(),
81
+ duplicateClusters: this.duplicateClusters,
82
+ contentClusters: this.contentClusters
83
+ };
84
+ }
85
+ static fromJSON(json) {
86
+ const graph = new Graph();
87
+ if (json.nodes) {
88
+ for (const node of json.nodes) {
89
+ graph.nodes.set(node.url, { ...node });
90
+ }
91
+ }
92
+ if (json.edges) {
93
+ for (const edge of json.edges) {
94
+ const key = `${edge.source}|${edge.target}`;
95
+ graph.edges.set(key, edge.weight || 1.0);
96
+ }
97
+ }
98
+ if (json.duplicateClusters) {
99
+ graph.duplicateClusters = json.duplicateClusters;
100
+ }
101
+ if (json.contentClusters) {
102
+ graph.contentClusters = json.contentClusters;
103
+ }
104
+ return graph;
105
+ }
106
+ }
@@ -0,0 +1,29 @@
1
+ import { Graph } from './graph.js';
2
+ export interface Metrics {
3
+ totalPages: number;
4
+ totalEdges: number;
5
+ orphanPages: string[];
6
+ nearOrphans: string[];
7
+ deepPages: string[];
8
+ topAuthorityPages: {
9
+ url: string;
10
+ authority: number;
11
+ }[];
12
+ averageOutDegree: number;
13
+ maxDepthFound: number;
14
+ crawlEfficiencyScore: number;
15
+ averageDepth: number;
16
+ structuralEntropy: number;
17
+ topPageRankPages: {
18
+ url: string;
19
+ score: number;
20
+ }[];
21
+ limitReached: boolean;
22
+ sessionStats?: {
23
+ pagesFetched: number;
24
+ pagesCached: number;
25
+ pagesSkipped: number;
26
+ totalFound: number;
27
+ };
28
+ }
29
+ export declare function calculateMetrics(graph: Graph, _maxDepth: number): Metrics;