solidity-argus 0.1.8 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/AGENTS.md +3 -3
  2. package/README.md +229 -13
  3. package/package.json +37 -8
  4. package/skills/INVENTORY.md +88 -57
  5. package/skills/README.md +72 -6
  6. package/skills/case-studies/beanstalk-governance/SKILL.md +52 -0
  7. package/skills/case-studies/bzx-flash-loan/SKILL.md +53 -0
  8. package/skills/case-studies/cream-finance/SKILL.md +52 -0
  9. package/skills/case-studies/curve-reentrancy/SKILL.md +52 -0
  10. package/skills/case-studies/dao-hack/SKILL.md +51 -0
  11. package/skills/case-studies/euler-finance/SKILL.md +52 -0
  12. package/skills/case-studies/harvest-finance/SKILL.md +52 -0
  13. package/skills/case-studies/level-finance/SKILL.md +51 -0
  14. package/skills/case-studies/mango-markets/SKILL.md +53 -0
  15. package/skills/case-studies/nomad-bridge/SKILL.md +51 -0
  16. package/skills/case-studies/parity-multisig/SKILL.md +55 -0
  17. package/skills/case-studies/poly-network/SKILL.md +51 -0
  18. package/skills/case-studies/rari-fuse/SKILL.md +51 -0
  19. package/skills/case-studies/ronin-bridge/SKILL.md +52 -0
  20. package/skills/case-studies/wormhole-bridge/SKILL.md +51 -0
  21. package/skills/checklists/cyfrin-defi-core/SKILL.md +3 -0
  22. package/skills/manifests/cyfrin.json +16 -0
  23. package/skills/manifests/defifofum.json +25 -0
  24. package/skills/manifests/kadenzipfel.json +48 -0
  25. package/skills/manifests/scvd.json +9 -0
  26. package/skills/manifests/smartbugs.json +9 -0
  27. package/skills/manifests/solodit.json +9 -0
  28. package/skills/manifests/sunweb3sec.json +9 -0
  29. package/skills/manifests/trailofbits.json +9 -0
  30. package/skills/methodology/audit-workflow/SKILL.md +3 -0
  31. package/skills/protocol-patterns/amm-dex/SKILL.md +3 -0
  32. package/skills/references/exploit-reference/SKILL.md +3 -0
  33. package/skills/vulnerability-patterns/access-control/SKILL.md +27 -0
  34. package/skills/vulnerability-patterns/arbitrary-storage-location/SKILL.md +13 -1
  35. package/skills/vulnerability-patterns/assert-violation/SKILL.md +8 -1
  36. package/skills/vulnerability-patterns/asserting-contract-from-code-size/SKILL.md +12 -1
  37. package/skills/vulnerability-patterns/authorization-txorigin/SKILL.md +8 -1
  38. package/skills/vulnerability-patterns/cross-chain-bridge-vulnerabilities/SKILL.md +217 -0
  39. package/skills/vulnerability-patterns/default-visibility/SKILL.md +13 -1
  40. package/skills/vulnerability-patterns/delegatecall-untrusted-callee/SKILL.md +8 -1
  41. package/skills/vulnerability-patterns/dos-gas-limit/SKILL.md +8 -1
  42. package/skills/vulnerability-patterns/dos-revert/SKILL.md +14 -1
  43. package/skills/vulnerability-patterns/erc4626-exchange-rate-manipulation/SKILL.md +64 -0
  44. package/skills/vulnerability-patterns/fee-on-transfer-tokens/SKILL.md +93 -0
  45. package/skills/vulnerability-patterns/flash-loan-attacks/SKILL.md +13 -0
  46. package/skills/vulnerability-patterns/floating-pragma/SKILL.md +8 -1
  47. package/skills/vulnerability-patterns/front-running-attacks/SKILL.md +209 -0
  48. package/skills/vulnerability-patterns/gas-optimization-patterns/SKILL.md +203 -0
  49. package/skills/vulnerability-patterns/governance-attacks/SKILL.md +208 -0
  50. package/skills/vulnerability-patterns/hash-collision/SKILL.md +8 -1
  51. package/skills/vulnerability-patterns/inadherence-to-standards/SKILL.md +12 -1
  52. package/skills/vulnerability-patterns/incorrect-constructor/SKILL.md +8 -1
  53. package/skills/vulnerability-patterns/incorrect-inheritance-order/SKILL.md +8 -1
  54. package/skills/vulnerability-patterns/insufficient-gas-griefing/SKILL.md +12 -1
  55. package/skills/vulnerability-patterns/lack-of-precision/SKILL.md +7 -1
  56. package/skills/vulnerability-patterns/logic-errors/SKILL.md +10 -0
  57. package/skills/vulnerability-patterns/missing-parameter-bounds/SKILL.md +44 -0
  58. package/skills/vulnerability-patterns/missing-protection-signature-replay/SKILL.md +17 -1
  59. package/skills/vulnerability-patterns/msgvalue-loop/SKILL.md +12 -1
  60. package/skills/vulnerability-patterns/off-by-one/SKILL.md +7 -1
  61. package/skills/vulnerability-patterns/oracle-manipulation/SKILL.md +22 -0
  62. package/skills/vulnerability-patterns/outdated-compiler-version/SKILL.md +8 -1
  63. package/skills/vulnerability-patterns/overflow-underflow/SKILL.md +11 -1
  64. package/skills/vulnerability-patterns/proxy-vulnerabilities/SKILL.md +209 -0
  65. package/skills/vulnerability-patterns/reentrancy/SKILL.md +22 -0
  66. package/skills/vulnerability-patterns/shadowing-state-variables/SKILL.md +8 -1
  67. package/skills/vulnerability-patterns/share-accounting-desynchronization/SKILL.md +44 -0
  68. package/skills/vulnerability-patterns/signature-malleability/SKILL.md +11 -1
  69. package/skills/vulnerability-patterns/stateful-parameter-update-drift/SKILL.md +44 -0
  70. package/skills/vulnerability-patterns/unbounded-return-data/SKILL.md +12 -1
  71. package/skills/vulnerability-patterns/unchecked-return-values/SKILL.md +13 -1
  72. package/skills/vulnerability-patterns/unencrypted-private-data-on-chain/SKILL.md +8 -1
  73. package/skills/vulnerability-patterns/unexpected-ecrecover-null-address/SKILL.md +8 -1
  74. package/skills/vulnerability-patterns/uninitialized-storage-pointer/SKILL.md +8 -1
  75. package/skills/vulnerability-patterns/unsafe-erc20-transfers/SKILL.md +132 -0
  76. package/skills/vulnerability-patterns/unsafe-low-level-call/SKILL.md +12 -1
  77. package/skills/vulnerability-patterns/unsecure-signatures/SKILL.md +12 -1
  78. package/skills/vulnerability-patterns/unsupported-opcodes/SKILL.md +11 -1
  79. package/skills/vulnerability-patterns/unused-variables/SKILL.md +8 -1
  80. package/skills/vulnerability-patterns/use-of-deprecated-functions/SKILL.md +8 -1
  81. package/skills/vulnerability-patterns/weak-sources-randomness/SKILL.md +8 -1
  82. package/skills/vulnerability-patterns/weird-tokens/SKILL.md +10 -0
  83. package/skills/vulnerability-patterns/zero-address-misconfiguration/SKILL.md +48 -0
  84. package/src/agents/argus-prompt.ts +27 -10
  85. package/src/agents/pythia-prompt.ts +7 -8
  86. package/src/agents/scribe-prompt.ts +10 -5
  87. package/src/agents/sentinel-prompt.ts +36 -7
  88. package/src/cli/cli-output.ts +16 -0
  89. package/src/cli/cli-program.ts +29 -22
  90. package/src/cli/commands/check-skills.ts +135 -0
  91. package/src/cli/commands/doctor.ts +303 -23
  92. package/src/cli/commands/init.ts +8 -6
  93. package/src/cli/commands/install.ts +10 -8
  94. package/src/cli/commands/lint-skills.ts +118 -0
  95. package/src/cli/index.ts +5 -5
  96. package/src/cli/tui-prompts.ts +4 -2
  97. package/src/cli/types.ts +3 -3
  98. package/src/config/index.ts +1 -1
  99. package/src/config/loader.ts +4 -6
  100. package/src/config/schema.ts +6 -5
  101. package/src/config/types.ts +2 -2
  102. package/src/constants/defaults.ts +2 -0
  103. package/src/create-hooks.ts +225 -29
  104. package/src/create-managers.ts +10 -8
  105. package/src/create-tools.ts +14 -8
  106. package/src/features/background-agent/background-manager.ts +93 -87
  107. package/src/features/background-agent/index.ts +1 -1
  108. package/src/features/context-monitor/context-monitor.ts +3 -3
  109. package/src/features/context-monitor/index.ts +2 -2
  110. package/src/features/error-recovery/session-recovery.ts +2 -4
  111. package/src/features/error-recovery/tool-error-recovery.ts +79 -19
  112. package/src/features/index.ts +5 -5
  113. package/src/features/persistent-state/audit-state-manager.ts +158 -52
  114. package/src/features/persistent-state/global-run-index.ts +38 -0
  115. package/src/features/persistent-state/index.ts +1 -1
  116. package/src/features/persistent-state/run-journal.ts +86 -0
  117. package/src/hooks/agent-tracker.ts +53 -0
  118. package/src/hooks/compaction-hook.ts +46 -37
  119. package/src/hooks/config-handler.ts +31 -11
  120. package/src/hooks/context-budget.ts +42 -0
  121. package/src/hooks/event-hook.ts +48 -23
  122. package/src/hooks/hook-system.ts +4 -4
  123. package/src/hooks/index.ts +5 -5
  124. package/src/hooks/knowledge-sync-hook.ts +19 -21
  125. package/src/hooks/recon-context-builder.ts +66 -0
  126. package/src/hooks/safe-create-hook.ts +9 -11
  127. package/src/hooks/system-prompt-hook.ts +128 -0
  128. package/src/hooks/tool-tracking-hook.ts +162 -29
  129. package/src/hooks/types.ts +2 -1
  130. package/src/index.ts +23 -13
  131. package/src/knowledge/retry.ts +53 -0
  132. package/src/knowledge/scvd-client.ts +103 -83
  133. package/src/knowledge/scvd-errors.ts +89 -0
  134. package/src/knowledge/scvd-index.ts +110 -62
  135. package/src/knowledge/scvd-sync.ts +223 -47
  136. package/src/knowledge/source-manifest.ts +102 -0
  137. package/src/managers/index.ts +1 -1
  138. package/src/managers/types.ts +19 -14
  139. package/src/plugin-interface.ts +19 -8
  140. package/src/shared/binary-utils.ts +44 -34
  141. package/src/shared/deep-merge.ts +55 -36
  142. package/src/shared/file-utils.ts +21 -19
  143. package/src/shared/index.ts +11 -5
  144. package/src/shared/jsonc-parser.ts +123 -28
  145. package/src/shared/logger.ts +91 -17
  146. package/src/shared/project-utils.ts +30 -0
  147. package/src/skills/analysis/cluster.ts +414 -0
  148. package/src/skills/analysis/gates.ts +227 -0
  149. package/src/skills/analysis/index.ts +33 -0
  150. package/src/skills/analysis/normalize.ts +217 -0
  151. package/src/skills/analysis/similarity.ts +224 -0
  152. package/src/skills/argus-skill-resolver.ts +237 -0
  153. package/src/skills/skill-schema.ts +99 -0
  154. package/src/solodit-lifecycle.ts +202 -0
  155. package/src/state/audit-state.ts +10 -8
  156. package/src/state/finding-store.ts +68 -55
  157. package/src/state/types.ts +96 -44
  158. package/src/tools/argus-skill-load-tool.ts +78 -0
  159. package/src/tools/contract-analyzer-tool.ts +60 -77
  160. package/src/tools/forge-coverage-tool.ts +226 -0
  161. package/src/tools/forge-fuzz-tool.ts +127 -127
  162. package/src/tools/forge-test-tool.ts +153 -157
  163. package/src/tools/gas-analysis-tool.ts +264 -0
  164. package/src/tools/pattern-checker-tool.ts +206 -167
  165. package/src/tools/pattern-loader.ts +77 -0
  166. package/src/tools/pattern-schema.ts +51 -0
  167. package/src/tools/proxy-detection-tool.ts +224 -0
  168. package/src/tools/report-generator-tool.ts +333 -142
  169. package/src/tools/slither-tool.ts +300 -210
  170. package/src/tools/solodit-search-tool.ts +255 -80
  171. package/src/tools/sync-knowledge-tool.ts +7 -11
  172. package/src/utils/audit-artifact-detector.ts +118 -0
  173. package/src/utils/dependency-scanner.ts +93 -0
  174. package/src/utils/project-detector.ts +175 -86
  175. package/src/utils/solidity-parser.ts +112 -67
  176. package/src/utils/solodit-health.ts +29 -0
  177. package/src/hooks/event-hook-v2.ts +0 -99
  178. package/src/state/plugin-state.ts +0 -14
@@ -0,0 +1,414 @@
1
+ import { tokenJaccard } from "./similarity"
2
+
3
/**
 * One finding fed into the clustering step, as emitted by the PDF
 * extraction pipeline.
 */
export interface ClusterFinding {
  /** Headline of the finding; tokenized together with `description`. */
  title: string
  /** Severity label as reported by the source. */
  severity: string
  /** Free-text body; tokenized together with `title`. */
  description: string
  /** Category name; findings are only ever clustered within one category. */
  category: string
  /** Identifier of the PDF the finding was extracted from. */
  source_pdf: string
  /** Optional name for the source. */
  source_name?: string
}
12
+
13
/**
 * A group of related findings from a single category, as produced by
 * `clusterFindings`.
 */
export interface FindingCluster {
  /** Sequential cluster number; `clusterFindings` starts numbering at 1. */
  id: number
  /** Category shared by every member. */
  category: string
  /** Findings that survived outlier peeling. */
  members: ClusterFinding[]
  /** Member with the highest average similarity to the other members. */
  medoid: ClusterFinding
  /** Position of `medoid` inside `members`. */
  medoidIndex: number
  /** Up to ten tokens that occur in the most members. */
  topTokens: string[]
  /** Mean pairwise similarity across all member pairs. */
  avgInternalSimilarity: number
  /** Number of members (`members.length`). */
  size: number
}
24
+
25
/** Tunable thresholds for `clusterFindings`. */
export interface ClusterConfig {
  /** Two findings are linked when their similarity is at least this value. */
  linkThreshold: number
  /**
   * Minimum similarity to the component's initial medoid a finding needs in
   * order to stay in the cluster; anything lower is peeled off as a singleton.
   */
  cohesionMinSimilarity: number
  /** Buckets, components and peeled clusters smaller than this become singletons. */
  minClusterSize: number
}
31
+
32
/** Everything `clusterFindings` returns: clusters, leftovers and summary numbers. */
export interface ClusterResult {
  /** Clusters that met the minimum size after outlier peeling. */
  clusters: FindingCluster[]
  /** Findings that did not end up in any cluster. */
  singletons: ClusterFinding[]
  /** Aggregate numbers describing the run. */
  stats: {
    /** Length of the input array. */
    totalFindings: number
    /** Number of entries in `clusters`. */
    totalClusters: number
    /** Number of entries in `singletons`. */
    totalSingletons: number
    /** Input findings per category name. */
    categoryCounts: Record<string, number>
    /** Size of the biggest cluster, or 0 when there are none. */
    largestCluster: number
    /** Mean cluster size, or 0 when there are none. */
    avgClusterSize: number
  }
}
45
+
46
/**
 * Thresholds used by `clusterFindings` when the caller passes no config:
 * link at similarity >= 0.6, keep members at >= 0.65 similarity to the
 * medoid, and require at least two members per cluster.
 */
export const DEFAULT_CLUSTER_CONFIG: ClusterConfig = {
  linkThreshold: 0.6,
  cohesionMinSimilarity: 0.65,
  minClusterSize: 2,
}
51
+
52
// Auxiliary and modal verbs plus articles.
const STOPWORDS_VERBS_AND_ARTICLES = [
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "shall",
  "should", "may", "might", "can", "could",
]

// Prepositions and positional words.
const STOPWORDS_PREPOSITIONS = [
  "of", "in", "to", "for", "with", "on", "at", "by", "from", "as", "into",
  "through", "during", "before", "after", "above", "below", "between",
  "out", "off", "over", "under", "again", "further",
]

// Adverbs, quantifiers and negations.
const STOPWORDS_QUANTIFIERS = [
  "then", "once", "here", "there", "where", "when", "how", "all", "each",
  "every", "both", "few", "more", "most", "other", "some", "such", "no",
  "nor", "not", "only", "own", "same", "than", "too", "very",
]

// Conjunctions and pronouns.
const STOPWORDS_CONJUNCTIONS = [
  "and", "but", "or", "if", "this", "that", "these", "those", "it", "its",
]

// Solidity / audit vocabulary.
const STOPWORDS_SOLIDITY = [
  "contract", "function", "solidity", "smart", "vulnerability", "attack",
  "attacker", "token", "address", "value", "state", "require", "modifier",
  "external", "internal", "public", "private", "mapping", "uint256", "bool",
  "returns", "event", "emit",
]

/** Lower-case tokens that `tokenize` drops before similarity is computed. */
const STOPWORDS = new Set([
  ...STOPWORDS_VERBS_AND_ARTICLES,
  ...STOPWORDS_PREPOSITIONS,
  ...STOPWORDS_QUANTIFIERS,
  ...STOPWORDS_CONJUNCTIONS,
  ...STOPWORDS_SOLIDITY,
])
161
+
162
/**
 * Disjoint-set forest over the integers `0..n-1` with path compression in
 * `find` and union by rank in `union`.
 */
class UnionFind {
  /** `parent[i]` is the parent of node `i`; a root is its own parent. */
  parent: number[]
  /** Rank per node, consulted only for roots when merging. */
  rank: number[]

  /** Creates `n` singleton sets. */
  constructor(n: number) {
    this.parent = []
    this.rank = []
    for (let node = 0; node < n; node += 1) {
      this.parent.push(node)
      this.rank.push(0)
    }
  }

  /**
   * Returns the representative of the set containing `x` and re-points every
   * node on the walked path directly at it. An `x` with no parent entry is
   * treated as its own root.
   */
  find(x: number): number {
    // Pass 1: climb to the root.
    let root = x
    for (;;) {
      const up = this.parent[root] ?? root
      if (up === root) break
      root = up
    }

    // Pass 2: compress the path just walked.
    let cursor = x
    while (cursor !== root) {
      const up = this.parent[cursor] ?? root
      this.parent[cursor] = root
      cursor = up
    }
    return root
  }

  /**
   * Merges the sets containing `x` and `y`.
   * @returns `false` when both were already in the same set, otherwise `true`.
   */
  union(x: number, y: number): boolean {
    const a = this.find(x)
    const b = this.find(y)
    if (a === b) return false

    const rankA = this.rank[a] ?? 0
    const rankB = this.rank[b] ?? 0
    if (rankA < rankB) {
      this.parent[a] = b
    } else {
      // On equal rank `a` stays the root and grows by one.
      this.parent[b] = a
      if (rankA === rankB) this.rank[a] = rankA + 1
    }
    return true
  }

  /**
   * Returns every set keyed by its root. Nodes inside a set, and the sets
   * themselves, appear in ascending order of first node index.
   */
  components(): Map<number, number[]> {
    const byRoot = new Map<number, number[]>()
    this.parent.forEach((_, node) => {
      const root = this.find(node)
      const bucket = byRoot.get(root)
      if (bucket) {
        bucket.push(node)
      } else {
        byRoot.set(root, [node])
      }
    })
    return byRoot
  }
}
207
+
208
/**
 * Splits `text` into distinct lower-case alphanumeric tokens.
 *
 * Tokens shorter than three characters and tokens listed in `STOPWORDS` are
 * dropped. The result keeps first-occurrence order and has no duplicates.
 * Empty input yields an empty array.
 */
function tokenize(text: string): string[] {
  if (!text) return []

  const words = text.toLowerCase().split(/[^a-z0-9]+/g)
  const kept = words.filter((word) => word.length >= 3 && !STOPWORDS.has(word))
  return [...new Set(kept)]
}
218
+
219
/**
 * Builds one token set per finding from its title and description joined by
 * a space. The result is index-aligned with `findings`.
 */
function computeTokenSets(findings: ClusterFinding[]): Set<string>[] {
  return findings.map((finding) => {
    const text = `${finding.title} ${finding.description}`
    return new Set(tokenize(text))
  })
}
222
+
223
/**
 * Computes the symmetric pairwise `tokenJaccard` matrix for `tokenSets`.
 *
 * The diagonal is set to 1, and each unordered pair is scored once and
 * mirrored into both triangles.
 */
function buildSimilarityMatrix(tokenSets: Set<string>[]): number[][] {
  const size = tokenSets.length
  const tokenLists = tokenSets.map((tokens) => Array.from(tokens))
  const matrix: number[][] = []
  for (let row = 0; row < size; row += 1) {
    matrix.push(new Array<number>(size).fill(0))
  }

  for (let row = 0; row < size; row += 1) {
    const upper = matrix[row]
    if (upper) upper[row] = 1

    for (let col = row + 1; col < size; col += 1) {
      const score = tokenJaccard(tokenLists[row] ?? [], tokenLists[col] ?? [])
      if (upper) upper[col] = score
      const lower = matrix[col]
      if (lower) lower[row] = score
    }
  }
  return matrix
}
240
+
241
/**
 * Picks the member whose average similarity to all other members is highest.
 *
 * @param memberIndices row/column indices into `similarityMatrix`
 * @param similarityMatrix pairwise similarity lookup; missing cells count as 0
 * @returns the winning index, the only index for a single member, or -1 for
 *   an empty list. On a tie the earliest member in `memberIndices` wins.
 */
function medoidForMembers(memberIndices: number[], similarityMatrix: number[][]): number {
  const first = memberIndices[0] ?? -1
  if (memberIndices.length === 0) return -1
  if (memberIndices.length === 1) return first

  const peerCount = memberIndices.length - 1
  let winner = first
  let winnerMean = -1
  for (const candidate of memberIndices) {
    const row = similarityMatrix[candidate]
    let sum = 0
    for (const peer of memberIndices) {
      if (peer !== candidate) sum += row?.[peer] ?? 0
    }

    const mean = sum / peerCount
    // Strict comparison keeps the earliest candidate on ties.
    if (mean > winnerMean) {
      winnerMean = mean
      winner = candidate
    }
  }
  return winner
}
261
+
262
/**
 * Mean similarity over every unordered pair of members.
 *
 * @returns 0 when there are fewer than two members. Cells missing from
 *   `similarityMatrix` contribute 0 to the sum.
 */
function averageInternalSimilarity(memberIndices: number[], similarityMatrix: number[][]): number {
  const count = memberIndices.length
  if (count < 2) return 0

  let sum = 0
  for (let a = 0; a < count; a += 1) {
    const row = similarityMatrix[memberIndices[a] ?? -1]
    for (let b = a + 1; b < count; b += 1) {
      sum += row?.[memberIndices[b] ?? -1] ?? 0
    }
  }

  // Number of unordered pairs among `count` members.
  const pairCount = (count * (count - 1)) / 2
  return sum / pairCount
}
277
+
278
/**
 * Returns up to ten tokens ranked by how many members contain them.
 *
 * Ties are broken by `localeCompare` on the token. Member indices with no
 * entry in `tokenSets` are ignored.
 */
function topTokensForMembers(memberIndices: number[], tokenSets: Set<string>[]): string[] {
  const frequency = new Map<string, number>()
  for (const memberIndex of memberIndices) {
    tokenSets[memberIndex]?.forEach((token) => {
      frequency.set(token, (frequency.get(token) ?? 0) + 1)
    })
  }

  const ranked = [...frequency].sort(
    ([tokenA, countA], [tokenB, countB]) => countB - countA || tokenA.localeCompare(tokenB),
  )
  return ranked.slice(0, 10).map((entry) => entry[0])
}
296
+
297
/**
 * Appends `bucket[index]` to `target` for each of `indices`, in order,
 * skipping positions that hold no finding.
 */
function pushSingletons(
  target: ClusterFinding[],
  bucket: ClusterFinding[],
  indices: number[],
): void {
  indices.forEach((index) => {
    const finding = bucket[index]
    if (!finding) return
    target.push(finding)
  })
}
307
+
308
/**
 * Groups related findings per category using token Jaccard links and
 * union-find connected components, then peels low-cohesion outliers.
 *
 * Steps, per category (categories are visited in `localeCompare` order):
 * 1. Categories with fewer than `minClusterSize` findings go to singletons.
 * 2. Findings whose pairwise similarity is at least `linkThreshold` are
 *    linked; each connected component is a candidate cluster.
 * 3. Inside a component, members whose similarity to the component's medoid
 *    is below `cohesionMinSimilarity` are peeled off as singletons.
 * 4. If at least `minClusterSize` members remain, a cluster is emitted with a
 *    medoid recomputed over the kept members; otherwise they become
 *    singletons too.
 *
 * @param findings findings to cluster; the array is not modified
 * @param config thresholds; defaults to `DEFAULT_CLUSTER_CONFIG`
 * @returns clusters (ids numbered from 1), leftover singletons and summary stats
 * @throws Error if the computed medoid position is not a valid member index
 */
export function clusterFindings(
  findings: ClusterFinding[],
  config: ClusterConfig = DEFAULT_CLUSTER_CONFIG,
): ClusterResult {
  const clusters: FindingCluster[] = []
  const singletons: ClusterFinding[] = []

  // Bucket findings by category. A Map is safe for arbitrary key strings.
  const buckets = new Map<string, ClusterFinding[]>()
  for (const finding of findings) {
    const bucket = buckets.get(finding.category)
    if (bucket) {
      bucket.push(finding)
    } else {
      buckets.set(finding.category, [finding])
    }
  }

  // Derive the per-category counts from the buckets instead of incrementing a
  // plain object. Object.fromEntries defines own data properties, so category
  // names such as "constructor" or "__proto__" are counted correctly rather
  // than colliding with members inherited from Object.prototype.
  const categoryCounts: Record<string, number> = Object.fromEntries(
    Array.from(buckets, ([category, bucket]): [string, number] => [category, bucket.length]),
  )

  const orderedCategories = Array.from(buckets.keys()).sort((left, right) =>
    left.localeCompare(right),
  )

  let nextClusterId = 1
  for (const category of orderedCategories) {
    const bucket = buckets.get(category) ?? []
    if (bucket.length < config.minClusterSize) {
      singletons.push(...bucket)
      continue
    }

    const tokenSets = computeTokenSets(bucket)
    const similarityMatrix = buildSimilarityMatrix(tokenSets)

    // Link every sufficiently similar pair; components are candidate clusters.
    const uf = new UnionFind(bucket.length)
    for (let i = 0; i < bucket.length; i += 1) {
      for (let j = i + 1; j < bucket.length; j += 1) {
        if ((similarityMatrix[i]?.[j] ?? 0) < config.linkThreshold) continue
        uf.union(i, j)
      }
    }

    for (const memberIndices of uf.components().values()) {
      if (memberIndices.length < config.minClusterSize) {
        pushSingletons(singletons, bucket, memberIndices)
        continue
      }

      // Peel members that are only transitively connected and not actually
      // close to the centre of the component.
      const initialMedoid = medoidForMembers(memberIndices, similarityMatrix)
      const keptIndices: number[] = []
      const peeledIndices: number[] = []
      for (const index of memberIndices) {
        if (index === initialMedoid) {
          keptIndices.push(index)
          continue
        }
        const similarityToMedoid = similarityMatrix[initialMedoid]?.[index] ?? 0
        if (similarityToMedoid >= config.cohesionMinSimilarity) {
          keptIndices.push(index)
        } else {
          peeledIndices.push(index)
        }
      }

      pushSingletons(singletons, bucket, peeledIndices)
      if (keptIndices.length < config.minClusterSize) {
        pushSingletons(singletons, bucket, keptIndices)
        continue
      }

      // The medoid may shift once outliers are gone, so recompute it.
      const finalMedoid = medoidForMembers(keptIndices, similarityMatrix)
      const members = keptIndices
        .map((index) => bucket[index])
        .filter((finding): finding is ClusterFinding => Boolean(finding))
      if (members.length < config.minClusterSize) {
        singletons.push(...members)
        continue
      }

      const medoidIndex = Math.max(0, keptIndices.indexOf(finalMedoid))
      const medoid = members.at(medoidIndex)
      if (!medoid) throw new Error("Medoid index out of bounds — this should not happen")

      clusters.push({
        id: nextClusterId,
        category,
        members,
        medoid,
        medoidIndex,
        topTokens: topTokensForMembers(keptIndices, tokenSets),
        avgInternalSimilarity: averageInternalSimilarity(keptIndices, similarityMatrix),
        size: members.length,
      })
      nextClusterId += 1
    }
  }

  const largestCluster = clusters.reduce((max, cluster) => Math.max(max, cluster.size), 0)
  const avgClusterSize =
    clusters.length === 0
      ? 0
      : clusters.reduce((total, cluster) => total + cluster.size, 0) / clusters.length

  return {
    clusters,
    singletons,
    stats: {
      totalFindings: findings.length,
      totalClusters: clusters.length,
      totalSingletons: singletons.length,
      categoryCounts,
      largestCluster,
      avgClusterSize,
    },
  }
}
@@ -0,0 +1,227 @@
1
+ import type { SkillDoc } from "./normalize"
2
+ import type { SimilarityPair, SimilarityScore } from "./similarity"
3
+
4
/** Gate outcome for a skill pair. `LEVEL_ORDER` ranks `block` first and `pass` last. */
export type GateLevel =
  | "block"
  | "warn"
  | "info"
  | "pass"
5
+
6
/** Result of gating one skill pair. */
export interface GateVerdict {
  /** Outcome assigned to the pair. */
  level: GateLevel
  /** Human-readable explanation of why that level was chosen. */
  reason: string
}
10
+
11
/** Thresholds and switches that control skill-pair gating. */
export interface GateConfig {
  /** Composite score at or above which a pair is a `block`. */
  blockThreshold: number
  /** Composite score at or above which a pair is a `warn`. */
  warnThreshold: number
  /** Composite score at or above which a pair is an `info`. */
  infoThreshold: number
  /** When true, `generateReport` blocks pairs that share an identical detection rule. */
  blockExactRegexConflict: boolean
}
17
+
18
/** Output of `generateReport`: flagged skill pairs plus per-level counts. */
export interface SkillReport {
  /** Number of skill documents that were analysed. */
  totalSkills: number
  /** Flagged pairs; `generateReport` sorts them by level, then by composite score descending. */
  findings: Array<{
    /** Name of the first skill in the pair. */
    skillA: string
    /** Name of the second skill in the pair. */
    skillB: string
    /** Similarity breakdown for the pair. */
    score: SimilarityScore
    /** Gate decision for the pair. */
    verdict: GateVerdict
  }>
  /** How many findings carry each non-pass level. */
  summary: { block: number; warn: number; info: number }
}
28
+
29
/**
 * Gate settings used when a caller supplies none: block at composite >= 0.9,
 * warn at >= 0.78, info at >= 0.65, and block on identical detection rules.
 */
export const DEFAULT_GATE_CONFIG: GateConfig = {
  blockThreshold: 0.9,
  warnThreshold: 0.78,
  infoThreshold: 0.65,
  blockExactRegexConflict: true,
}
35
+
36
/** Sort rank per gate level; a lower number sorts earlier in the report. */
const LEVEL_ORDER: Record<GateLevel, number> = {
  block: 0,
  warn: 1,
  info: 2,
  pass: 3,
}
42
+
43
/** Renders a score with exactly two decimal places. */
function formatScore(score: number): string {
  const decimals = 2
  return score.toFixed(decimals)
}
46
+
47
/**
 * Canonicalizes a detection rule for comparison: each run of whitespace
 * becomes a single space and leading/trailing whitespace is removed.
 */
function normalizeRegex(rule: string): string {
  return rule
    .split(/\s+/)
    .filter((piece) => piece.length > 0)
    .join(" ")
}
50
+
51
/**
 * Builds an order-independent lookup key for a skill pair: the two names,
 * smaller first by `<` comparison, joined with `|||`.
 */
function pairKey(skillA: string, skillB: string): string {
  const [first, second] = skillA < skillB ? [skillA, skillB] : [skillB, skillA]
  return `${first}|||${second}`
}
54
+
55
/**
 * Describes the two strongest component signals of `score` as
 * `"<label> <value>, <label> <value>"`, strongest first.
 */
function topSignals(score: SimilarityScore): string {
  const ranked: Array<[string, number]> = [
    ["body TF-IDF", score.bodyTfidf],
    ["body shingles", score.bodyShingle],
    ["name/description", score.nameDesc],
    ["detection rules", score.detectionRules],
  ]
  ranked.sort(([, valueA], [, valueB]) => valueB - valueA)

  const parts: string[] = []
  for (const [label, value] of ranked.slice(0, 2)) {
    parts.push(`${label} ${formatScore(value)}`)
  }
  return parts.join(", ")
}
69
+
70
/**
 * Returns `score` unchanged when one is supplied. Otherwise it returns a
 * placeholder with `composite` and `detectionRules` at 1 and the other
 * signals at 0, used for rule conflicts with no computed similarity.
 */
function scoreForConflict(score: SimilarityScore | undefined): SimilarityScore {
  if (!score) {
    const placeholder: SimilarityScore = {
      composite: 1,
      bodyTfidf: 0,
      bodyShingle: 0,
      nameDesc: 0,
      detectionRules: 1,
    }
    return placeholder
  }
  return score
}
81
+
82
/**
 * Maps a pair's composite similarity onto a gate level.
 *
 * Thresholds are tested from strictest to loosest (block, warn, info) and
 * the first one the composite meets wins; otherwise the pair passes. The
 * reason always embeds the composite score and the two strongest signals.
 *
 * @param pair similarity result for two skills
 * @param config thresholds; defaults to `DEFAULT_GATE_CONFIG`
 */
export function evaluatePair(
  pair: SimilarityPair,
  config: GateConfig = DEFAULT_GATE_CONFIG,
): GateVerdict {
  const { composite } = pair.score
  const detail = `composite ${formatScore(composite)}; top signals: ${topSignals(pair.score)}`

  const tiers: Array<{ level: GateLevel; floor: number; label: string }> = [
    { level: "block", floor: config.blockThreshold, label: "Duplicate risk" },
    { level: "warn", floor: config.warnThreshold, label: "Near-duplicate risk" },
    { level: "info", floor: config.infoThreshold, label: "Related skills" },
  ]

  for (const tier of tiers) {
    if (composite >= tier.floor) {
      return { level: tier.level, reason: `${tier.label}: ${detail}` }
    }
  }
  return { level: "pass", reason: `Below thresholds: ${detail}` }
}
104
+
105
/**
 * Finds detection rules that two differently named skills share verbatim
 * after whitespace normalization.
 *
 * Each document's rules are normalized into a set once up front, rather
 * than once per compared pair. Empty rules are ignored, and documents with
 * the same name are never reported against each other.
 *
 * @param docs skill documents to compare pairwise
 * @returns one entry per (pair, shared rule), ordered by document position
 *   and then by the rule's first appearance in the earlier document
 */
export function checkExactRegexConflicts(
  docs: SkillDoc[],
): Array<{ skillA: string; skillB: string; sharedRegex: string }> {
  const conflicts: Array<{ skillA: string; skillB: string; sharedRegex: string }> = []

  // Normalize every document's rules exactly once.
  const ruleSets = docs.map((doc) =>
    doc
      ? new Set(doc.detectionRules.map(normalizeRegex).filter((rule) => rule.length > 0))
      : undefined,
  )

  for (let i = 0; i < docs.length; i += 1) {
    const docA = docs[i]
    const rulesA = ruleSets[i]
    if (!docA || !rulesA) continue

    for (let j = i + 1; j < docs.length; j += 1) {
      const docB = docs[j]
      const rulesB = ruleSets[j]
      if (!docB || !rulesB) continue
      if (docA.name === docB.name) continue

      for (const sharedRegex of rulesA) {
        if (!rulesB.has(sharedRegex)) continue
        conflicts.push({
          skillA: docA.name,
          skillB: docB.name,
          sharedRegex,
        })
      }
    }
  }

  return conflicts
}
139
+
140
/**
 * Builds the duplicate-detection report for a set of skills.
 *
 * Every similarity pair is gated with `evaluatePair`, and pairs that pass
 * are left out. When `config.blockExactRegexConflict` is set, each exact
 * detection-rule conflict forces its pair to `block`. An already-listed pair
 * has its verdict replaced, so a later conflict for the same pair overwrites
 * the earlier reason. An unlisted pair is appended with its known score, or
 * the placeholder from `scoreForConflict`. Findings are then sorted by level
 * and, within a level, by composite score descending.
 *
 * @param docs skill documents, used for rule-conflict detection and the total
 * @param pairs precomputed similarity pairs
 * @param config thresholds and switches; defaults to `DEFAULT_GATE_CONFIG`
 */
export function generateReport(
  docs: SkillDoc[],
  pairs: SimilarityPair[],
  config: GateConfig = DEFAULT_GATE_CONFIG,
): SkillReport {
  const scoreByPair = new Map<string, SimilarityScore>()
  const findings: SkillReport["findings"] = []

  for (const pair of pairs) {
    const { skillA, skillB, score } = pair
    scoreByPair.set(pairKey(skillA, skillB), score)

    const verdict = evaluatePair(pair, config)
    if (verdict.level !== "pass") {
      findings.push({ skillA, skillB, score, verdict })
    }
  }

  if (config.blockExactRegexConflict) {
    // Position of each already-reported pair, so conflicts can upgrade it.
    const positionByPair = new Map<string, number>()
    for (let position = 0; position < findings.length; position += 1) {
      const finding = findings[position]
      if (finding) positionByPair.set(pairKey(finding.skillA, finding.skillB), position)
    }

    for (const { skillA, skillB, sharedRegex } of checkExactRegexConflicts(docs)) {
      const key = pairKey(skillA, skillB)
      const verdict: GateVerdict = {
        level: "block",
        reason: `Exact detection rule conflict: ${sharedRegex}`,
      }

      const position = positionByPair.get(key)
      if (position === undefined) {
        findings.push({
          skillA,
          skillB,
          score: scoreForConflict(scoreByPair.get(key)),
          verdict,
        })
        positionByPair.set(key, findings.length - 1)
        continue
      }

      const listed = findings[position]
      if (listed) listed.verdict = verdict
    }
  }

  findings.sort(
    (left, right) =>
      LEVEL_ORDER[left.verdict.level] - LEVEL_ORDER[right.verdict.level] ||
      right.score.composite - left.score.composite,
  )

  const summary = { block: 0, warn: 0, info: 0 }
  for (const { verdict } of findings) {
    if (verdict.level !== "pass") summary[verdict.level] += 1
  }

  return { totalSkills: docs.length, findings, summary }
}
210
+
211
/**
 * Renders a report as plain text: a summary line with the skill count and
 * per-level totals, followed by one line per finding showing level, both
 * skill names, composite score and reason.
 */
export function formatReportText(report: SkillReport): string {
  const { totalSkills, summary, findings } = report
  const header = `Skills: ${totalSkills} | Blocks: ${summary.block} | Warnings: ${summary.warn} | Info: ${summary.info}`

  const rows = findings.map(
    ({ skillA, skillB, score, verdict }) =>
      `[${verdict.level.toUpperCase()}] ${skillA} ↔ ${skillB} (${formatScore(score.composite)}) — ${verdict.reason}`,
  )

  return [header, ...rows].join("\n")
}
224
+
225
/** Serializes a report as pretty-printed JSON with two-space indentation. */
export function formatReportJson(report: SkillReport): string {
  const indent = 2
  return JSON.stringify(report, null, indent)
}
@@ -0,0 +1,33 @@
1
+ export {
2
+ type ClusterConfig,
3
+ type ClusterFinding,
4
+ type ClusterResult,
5
+ clusterFindings,
6
+ DEFAULT_CLUSTER_CONFIG,
7
+ type FindingCluster,
8
+ } from "./cluster"
9
+ export {
10
+ checkExactRegexConflicts,
11
+ DEFAULT_GATE_CONFIG,
12
+ evaluatePair,
13
+ formatReportJson,
14
+ formatReportText,
15
+ type GateConfig,
16
+ type GateLevel,
17
+ type GateVerdict,
18
+ generateReport,
19
+ type SkillReport,
20
+ } from "./gates"
21
+ export { normalizeSkill, type SkillDoc } from "./normalize"
22
+ export {
23
+ buildTfidfCorpus,
24
+ computeAllPairs,
25
+ computeSimilarity,
26
+ detectionRuleOverlap,
27
+ type SimilarityPair,
28
+ type SimilarityScore,
29
+ shingleJaccard,
30
+ type TfidfCorpus,
31
+ tfidfCosine,
32
+ tokenJaccard,
33
+ } from "./similarity"