solidity-argus 0.1.8 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +3 -3
- package/README.md +229 -13
- package/package.json +37 -8
- package/skills/INVENTORY.md +88 -57
- package/skills/README.md +72 -6
- package/skills/case-studies/beanstalk-governance/SKILL.md +52 -0
- package/skills/case-studies/bzx-flash-loan/SKILL.md +53 -0
- package/skills/case-studies/cream-finance/SKILL.md +52 -0
- package/skills/case-studies/curve-reentrancy/SKILL.md +52 -0
- package/skills/case-studies/dao-hack/SKILL.md +51 -0
- package/skills/case-studies/euler-finance/SKILL.md +52 -0
- package/skills/case-studies/harvest-finance/SKILL.md +52 -0
- package/skills/case-studies/level-finance/SKILL.md +51 -0
- package/skills/case-studies/mango-markets/SKILL.md +53 -0
- package/skills/case-studies/nomad-bridge/SKILL.md +51 -0
- package/skills/case-studies/parity-multisig/SKILL.md +55 -0
- package/skills/case-studies/poly-network/SKILL.md +51 -0
- package/skills/case-studies/rari-fuse/SKILL.md +51 -0
- package/skills/case-studies/ronin-bridge/SKILL.md +52 -0
- package/skills/case-studies/wormhole-bridge/SKILL.md +51 -0
- package/skills/checklists/cyfrin-defi-core/SKILL.md +3 -0
- package/skills/manifests/cyfrin.json +16 -0
- package/skills/manifests/defifofum.json +25 -0
- package/skills/manifests/kadenzipfel.json +48 -0
- package/skills/manifests/scvd.json +9 -0
- package/skills/manifests/smartbugs.json +9 -0
- package/skills/manifests/solodit.json +9 -0
- package/skills/manifests/sunweb3sec.json +9 -0
- package/skills/manifests/trailofbits.json +9 -0
- package/skills/methodology/audit-workflow/SKILL.md +3 -0
- package/skills/protocol-patterns/amm-dex/SKILL.md +3 -0
- package/skills/references/exploit-reference/SKILL.md +3 -0
- package/skills/vulnerability-patterns/access-control/SKILL.md +27 -0
- package/skills/vulnerability-patterns/arbitrary-storage-location/SKILL.md +13 -1
- package/skills/vulnerability-patterns/assert-violation/SKILL.md +8 -1
- package/skills/vulnerability-patterns/asserting-contract-from-code-size/SKILL.md +12 -1
- package/skills/vulnerability-patterns/authorization-txorigin/SKILL.md +8 -1
- package/skills/vulnerability-patterns/cross-chain-bridge-vulnerabilities/SKILL.md +217 -0
- package/skills/vulnerability-patterns/default-visibility/SKILL.md +13 -1
- package/skills/vulnerability-patterns/delegatecall-untrusted-callee/SKILL.md +8 -1
- package/skills/vulnerability-patterns/dos-gas-limit/SKILL.md +8 -1
- package/skills/vulnerability-patterns/dos-revert/SKILL.md +14 -1
- package/skills/vulnerability-patterns/erc4626-exchange-rate-manipulation/SKILL.md +64 -0
- package/skills/vulnerability-patterns/fee-on-transfer-tokens/SKILL.md +93 -0
- package/skills/vulnerability-patterns/flash-loan-attacks/SKILL.md +13 -0
- package/skills/vulnerability-patterns/floating-pragma/SKILL.md +8 -1
- package/skills/vulnerability-patterns/front-running-attacks/SKILL.md +209 -0
- package/skills/vulnerability-patterns/gas-optimization-patterns/SKILL.md +203 -0
- package/skills/vulnerability-patterns/governance-attacks/SKILL.md +208 -0
- package/skills/vulnerability-patterns/hash-collision/SKILL.md +8 -1
- package/skills/vulnerability-patterns/inadherence-to-standards/SKILL.md +12 -1
- package/skills/vulnerability-patterns/incorrect-constructor/SKILL.md +8 -1
- package/skills/vulnerability-patterns/incorrect-inheritance-order/SKILL.md +8 -1
- package/skills/vulnerability-patterns/insufficient-gas-griefing/SKILL.md +12 -1
- package/skills/vulnerability-patterns/lack-of-precision/SKILL.md +7 -1
- package/skills/vulnerability-patterns/logic-errors/SKILL.md +10 -0
- package/skills/vulnerability-patterns/missing-parameter-bounds/SKILL.md +44 -0
- package/skills/vulnerability-patterns/missing-protection-signature-replay/SKILL.md +17 -1
- package/skills/vulnerability-patterns/msgvalue-loop/SKILL.md +12 -1
- package/skills/vulnerability-patterns/off-by-one/SKILL.md +7 -1
- package/skills/vulnerability-patterns/oracle-manipulation/SKILL.md +22 -0
- package/skills/vulnerability-patterns/outdated-compiler-version/SKILL.md +8 -1
- package/skills/vulnerability-patterns/overflow-underflow/SKILL.md +11 -1
- package/skills/vulnerability-patterns/proxy-vulnerabilities/SKILL.md +209 -0
- package/skills/vulnerability-patterns/reentrancy/SKILL.md +22 -0
- package/skills/vulnerability-patterns/shadowing-state-variables/SKILL.md +8 -1
- package/skills/vulnerability-patterns/share-accounting-desynchronization/SKILL.md +44 -0
- package/skills/vulnerability-patterns/signature-malleability/SKILL.md +11 -1
- package/skills/vulnerability-patterns/stateful-parameter-update-drift/SKILL.md +44 -0
- package/skills/vulnerability-patterns/unbounded-return-data/SKILL.md +12 -1
- package/skills/vulnerability-patterns/unchecked-return-values/SKILL.md +13 -1
- package/skills/vulnerability-patterns/unencrypted-private-data-on-chain/SKILL.md +8 -1
- package/skills/vulnerability-patterns/unexpected-ecrecover-null-address/SKILL.md +8 -1
- package/skills/vulnerability-patterns/uninitialized-storage-pointer/SKILL.md +8 -1
- package/skills/vulnerability-patterns/unsafe-erc20-transfers/SKILL.md +132 -0
- package/skills/vulnerability-patterns/unsafe-low-level-call/SKILL.md +12 -1
- package/skills/vulnerability-patterns/unsecure-signatures/SKILL.md +12 -1
- package/skills/vulnerability-patterns/unsupported-opcodes/SKILL.md +11 -1
- package/skills/vulnerability-patterns/unused-variables/SKILL.md +8 -1
- package/skills/vulnerability-patterns/use-of-deprecated-functions/SKILL.md +8 -1
- package/skills/vulnerability-patterns/weak-sources-randomness/SKILL.md +8 -1
- package/skills/vulnerability-patterns/weird-tokens/SKILL.md +10 -0
- package/skills/vulnerability-patterns/zero-address-misconfiguration/SKILL.md +48 -0
- package/src/agents/argus-prompt.ts +27 -10
- package/src/agents/pythia-prompt.ts +7 -8
- package/src/agents/scribe-prompt.ts +10 -5
- package/src/agents/sentinel-prompt.ts +36 -7
- package/src/cli/cli-output.ts +16 -0
- package/src/cli/cli-program.ts +29 -22
- package/src/cli/commands/check-skills.ts +135 -0
- package/src/cli/commands/doctor.ts +303 -23
- package/src/cli/commands/init.ts +8 -6
- package/src/cli/commands/install.ts +10 -8
- package/src/cli/commands/lint-skills.ts +118 -0
- package/src/cli/index.ts +5 -5
- package/src/cli/tui-prompts.ts +4 -2
- package/src/cli/types.ts +3 -3
- package/src/config/index.ts +1 -1
- package/src/config/loader.ts +4 -6
- package/src/config/schema.ts +6 -5
- package/src/config/types.ts +2 -2
- package/src/constants/defaults.ts +2 -0
- package/src/create-hooks.ts +225 -29
- package/src/create-managers.ts +10 -8
- package/src/create-tools.ts +14 -8
- package/src/features/background-agent/background-manager.ts +93 -87
- package/src/features/background-agent/index.ts +1 -1
- package/src/features/context-monitor/context-monitor.ts +3 -3
- package/src/features/context-monitor/index.ts +2 -2
- package/src/features/error-recovery/session-recovery.ts +2 -4
- package/src/features/error-recovery/tool-error-recovery.ts +79 -19
- package/src/features/index.ts +5 -5
- package/src/features/persistent-state/audit-state-manager.ts +158 -52
- package/src/features/persistent-state/global-run-index.ts +38 -0
- package/src/features/persistent-state/index.ts +1 -1
- package/src/features/persistent-state/run-journal.ts +86 -0
- package/src/hooks/agent-tracker.ts +53 -0
- package/src/hooks/compaction-hook.ts +46 -37
- package/src/hooks/config-handler.ts +31 -11
- package/src/hooks/context-budget.ts +42 -0
- package/src/hooks/event-hook.ts +48 -23
- package/src/hooks/hook-system.ts +4 -4
- package/src/hooks/index.ts +5 -5
- package/src/hooks/knowledge-sync-hook.ts +19 -21
- package/src/hooks/recon-context-builder.ts +66 -0
- package/src/hooks/safe-create-hook.ts +9 -11
- package/src/hooks/system-prompt-hook.ts +128 -0
- package/src/hooks/tool-tracking-hook.ts +162 -29
- package/src/hooks/types.ts +2 -1
- package/src/index.ts +23 -13
- package/src/knowledge/retry.ts +53 -0
- package/src/knowledge/scvd-client.ts +103 -83
- package/src/knowledge/scvd-errors.ts +89 -0
- package/src/knowledge/scvd-index.ts +110 -62
- package/src/knowledge/scvd-sync.ts +223 -47
- package/src/knowledge/source-manifest.ts +102 -0
- package/src/managers/index.ts +1 -1
- package/src/managers/types.ts +19 -14
- package/src/plugin-interface.ts +19 -8
- package/src/shared/binary-utils.ts +44 -34
- package/src/shared/deep-merge.ts +55 -36
- package/src/shared/file-utils.ts +21 -19
- package/src/shared/index.ts +11 -5
- package/src/shared/jsonc-parser.ts +123 -28
- package/src/shared/logger.ts +91 -17
- package/src/shared/project-utils.ts +30 -0
- package/src/skills/analysis/cluster.ts +414 -0
- package/src/skills/analysis/gates.ts +227 -0
- package/src/skills/analysis/index.ts +33 -0
- package/src/skills/analysis/normalize.ts +217 -0
- package/src/skills/analysis/similarity.ts +224 -0
- package/src/skills/argus-skill-resolver.ts +237 -0
- package/src/skills/skill-schema.ts +99 -0
- package/src/solodit-lifecycle.ts +202 -0
- package/src/state/audit-state.ts +10 -8
- package/src/state/finding-store.ts +68 -55
- package/src/state/types.ts +96 -44
- package/src/tools/argus-skill-load-tool.ts +78 -0
- package/src/tools/contract-analyzer-tool.ts +60 -77
- package/src/tools/forge-coverage-tool.ts +226 -0
- package/src/tools/forge-fuzz-tool.ts +127 -127
- package/src/tools/forge-test-tool.ts +153 -157
- package/src/tools/gas-analysis-tool.ts +264 -0
- package/src/tools/pattern-checker-tool.ts +206 -167
- package/src/tools/pattern-loader.ts +77 -0
- package/src/tools/pattern-schema.ts +51 -0
- package/src/tools/proxy-detection-tool.ts +224 -0
- package/src/tools/report-generator-tool.ts +333 -142
- package/src/tools/slither-tool.ts +300 -210
- package/src/tools/solodit-search-tool.ts +255 -80
- package/src/tools/sync-knowledge-tool.ts +7 -11
- package/src/utils/audit-artifact-detector.ts +118 -0
- package/src/utils/dependency-scanner.ts +93 -0
- package/src/utils/project-detector.ts +175 -86
- package/src/utils/solidity-parser.ts +112 -67
- package/src/utils/solodit-health.ts +29 -0
- package/src/hooks/event-hook-v2.ts +0 -99
- package/src/state/plugin-state.ts +0 -14
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
import { tokenJaccard } from "./similarity"
|
|
2
|
+
|
|
3
|
+
/**
 * One finding handed to the clustering step (originally described as coming
 * from the PDF extraction pipeline).
 */
export interface ClusterFinding {
  /** Short headline; tokenized together with `description` for similarity. */
  title: string
  /** Severity label; carried through untouched by the clustering code. */
  severity: string
  /** Free-text body; tokenized together with `title` for similarity. */
  description: string
  /** Bucket key: findings are only ever clustered within the same category. */
  category: string
  /** Origin document of the finding; not read by the clustering code. */
  source_pdf: string
  /** Optional origin label; not read by the clustering code. */
  source_name?: string
}
|
|
12
|
+
|
|
13
|
+
/** A group of related findings that all belong to one category. */
export interface FindingCluster {
  /** Sequential identifier, assigned starting at 1 in emission order. */
  id: number
  /** Category shared by every member. */
  category: string
  /** Findings that survived cohesion peeling. */
  members: ClusterFinding[]
  /** Member with the highest average similarity to the other members. */
  medoid: ClusterFinding
  /** Position of `medoid` inside `members`. */
  medoidIndex: number
  /** Up to ten tokens occurring in the most members (ties broken alphabetically). */
  topTokens: string[]
  /** Mean pairwise similarity across all member pairs. */
  avgInternalSimilarity: number
  /** Same as `members.length`. */
  size: number
}
|
|
24
|
+
|
|
25
|
+
/** Tunable thresholds for `clusterFindings`. */
export interface ClusterConfig {
  /** Two findings are linked when their token Jaccard similarity is at least this value. */
  linkThreshold: number
  /** Members whose similarity to the component medoid is below this value are peeled off. */
  cohesionMinSimilarity: number
  /** Groups smaller than this are reported as singletons instead of clusters. */
  minClusterSize: number
}
|
|
31
|
+
|
|
32
|
+
/** Everything `clusterFindings` returns. */
export interface ClusterResult {
  /** Clusters that met the minimum size after peeling. */
  clusters: FindingCluster[]
  /** Findings that did not end up in any cluster. */
  singletons: ClusterFinding[]
  /** Aggregate numbers describing the run. */
  stats: {
    /** Number of input findings. */
    totalFindings: number
    /** Same as `clusters.length`. */
    totalClusters: number
    /** Same as `singletons.length`. */
    totalSingletons: number
    /** Number of input findings per category. */
    categoryCounts: Record<string, number>
    /** Size of the biggest cluster, or 0 when there are none. */
    largestCluster: number
    /** Mean cluster size, or 0 when there are none. */
    avgClusterSize: number
  }
}
|
|
45
|
+
|
|
46
|
+
/** Thresholds used when `clusterFindings` is called without an explicit config. */
export const DEFAULT_CLUSTER_CONFIG: ClusterConfig = {
  // Minimum pairwise similarity for two findings to be linked.
  linkThreshold: 0.6,
  // Minimum similarity to the medoid for a member to stay in its cluster.
  cohesionMinSimilarity: 0.65,
  // Smallest group that still counts as a cluster.
  minClusterSize: 2,
}
|
|
51
|
+
|
|
52
|
+
/**
 * Words ignored during tokenization: common English function words plus
 * Solidity/audit vocabulary that appears in nearly every finding and would
 * otherwise inflate similarity scores.
 */
const STOPWORDS = new Set(
  [
    "the a an is are was were be been being have has had do does did",
    "will would shall should may might can could",
    "of in to for with on at by from as into through during before after above below between out off over under",
    "again further then once here there where when how",
    "all each every both few more most other some such no nor not only own same than too very",
    "and but or if this that these those it its",
    "contract function solidity smart vulnerability attack attacker token address value state require modifier",
    "external internal public private mapping uint256 bool returns event emit",
  ].flatMap((line) => line.split(" ")),
)
|
|
161
|
+
|
|
162
|
+
/**
 * Disjoint-set forest over the integers `0..n-1`, with path compression in
 * `find` and union by rank in `union`.
 */
class UnionFind {
  /** `parent[i]` is the parent of node `i`; a root is its own parent. */
  parent: number[]
  /** Rank (upper bound on tree height) stored per root. */
  rank: number[]

  /** Creates `n` singleton sets. */
  constructor(n: number) {
    this.parent = Array.from({ length: n }, (_unused, node) => node)
    this.rank = this.parent.map(() => 0)
  }

  /**
   * Returns the representative of the set containing `x`, re-pointing every
   * node visited on the way directly at that representative.
   * An `x` outside the known range is treated as its own root.
   */
  find(x: number): number {
    // First pass: walk up to the root.
    let root = x
    for (;;) {
      const up = this.parent[root] ?? root
      if (up === root) break
      root = up
    }
    // Second pass: compress the path that was just walked.
    let node = x
    while (node !== root) {
      const up = this.parent[node] ?? root
      this.parent[node] = root
      node = up
    }
    return root
  }

  /**
   * Merges the sets containing `x` and `y`.
   * @returns `false` when both were already in the same set, `true` otherwise.
   */
  union(x: number, y: number): boolean {
    const a = this.find(x)
    const b = this.find(y)
    if (a === b) return false

    const rankA = this.rank[a] ?? 0
    const rankB = this.rank[b] ?? 0

    // The shallower tree is hung under the deeper one.
    if (rankA < rankB) {
      this.parent[a] = b
      return true
    }

    this.parent[b] = a
    // Equal ranks: the surviving root grows by one level.
    if (rankA === rankB) this.rank[a] = rankA + 1
    return true
  }

  /** Returns every set as a map from its root to its members in ascending order. */
  components(): Map<number, number[]> {
    const byRoot = new Map<number, number[]>()
    this.parent.forEach((_parent, node) => {
      const root = this.find(node)
      const members = byRoot.get(root)
      if (members) {
        members.push(node)
      } else {
        byRoot.set(root, [node])
      }
    })
    return byRoot
  }
}
|
|
207
|
+
|
|
208
|
+
/**
 * Splits text into distinct lowercase tokens.
 *
 * Text is lowercased and split on runs of characters outside `a-z0-9`.
 * Tokens shorter than three characters and stopwords are dropped. Each
 * remaining token appears once, in order of first occurrence.
 *
 * @param text Raw text; an empty string yields an empty array.
 */
function tokenize(text: string): string[] {
  if (!text) return []
  const words = text.toLowerCase().split(/[^a-z0-9]+/g)
  const meaningful = words.filter((word) => word.length >= 3 && !STOPWORDS.has(word))
  return [...new Set(meaningful)]
}
|
|
218
|
+
|
|
219
|
+
/** Builds one token set per finding from its title and description joined by a space. */
function computeTokenSets(findings: ClusterFinding[]): Set<string>[] {
  const tokenSets: Set<string>[] = []
  for (const { title, description } of findings) {
    tokenSets.push(new Set(tokenize(`${title} ${description}`)))
  }
  return tokenSets
}
|
|
222
|
+
|
|
223
|
+
/**
 * Computes the symmetric pairwise similarity matrix for the given token sets.
 *
 * The diagonal is 1. Every off-diagonal cell holds `tokenJaccard` of the two
 * token lists, computed once per unordered pair and mirrored.
 */
function buildSimilarityMatrix(tokenSets: Set<string>[]): number[][] {
  const size = tokenSets.length
  const tokenLists = tokenSets.map((tokens) => Array.from(tokens))
  const matrix: number[][] = tokenLists.map(() => new Array<number>(size).fill(0))

  tokenLists.forEach((tokensA, a) => {
    const rowA = matrix[a]
    if (rowA) rowA[a] = 1

    // Upper triangle only; the lower triangle is filled by mirroring.
    for (let b = a + 1; b < size; b += 1) {
      const score = tokenJaccard(tokensA, tokenLists[b] ?? [])
      if (rowA) rowA[b] = score
      const rowB = matrix[b]
      if (rowB) rowB[a] = score
    }
  })

  return matrix
}
|
|
240
|
+
|
|
241
|
+
/**
 * Picks the member with the highest average similarity to the other members.
 *
 * @param memberIndices Row/column indices into `similarityMatrix`.
 * @param similarityMatrix Pairwise similarities; missing cells count as 0.
 * @returns The winning index (the earliest one on ties), the sole index for a
 *   single-member list, or -1 for an empty list.
 */
function medoidForMembers(memberIndices: number[], similarityMatrix: number[][]): number {
  const count = memberIndices.length
  if (count === 0) return -1

  const fallback = memberIndices[0] ?? -1
  if (count === 1) return fallback

  let winner = fallback
  let winnerScore = -1
  for (const candidate of memberIndices) {
    const row = similarityMatrix[candidate]
    const sum = memberIndices.reduce(
      (acc, other) => (other === candidate ? acc : acc + (row?.[other] ?? 0)),
      0,
    )
    const mean = sum / (count - 1)
    // Strict comparison keeps the first candidate when scores tie.
    if (mean > winnerScore) {
      winnerScore = mean
      winner = candidate
    }
  }
  return winner
}
|
|
261
|
+
|
|
262
|
+
/**
 * Mean similarity over every unordered pair of members.
 *
 * @returns 0 when there are fewer than two members; missing matrix cells count as 0.
 */
function averageInternalSimilarity(memberIndices: number[], similarityMatrix: number[][]): number {
  const count = memberIndices.length
  if (count < 2) return 0

  let sum = 0
  memberIndices.forEach((left, position) => {
    const row = similarityMatrix[left]
    for (let next = position + 1; next < count; next += 1) {
      const right = memberIndices[next] ?? -1
      sum += row?.[right] ?? 0
    }
  })

  // Number of unordered pairs among `count` members.
  const pairCount = (count * (count - 1)) / 2
  return sum / pairCount
}
|
|
277
|
+
|
|
278
|
+
/**
 * Returns up to ten tokens that occur in the most members.
 *
 * Tokens are ranked by how many member token sets contain them (descending),
 * then by `localeCompare` on the token itself. Member indices without a
 * token set are ignored.
 */
function topTokensForMembers(memberIndices: number[], tokenSets: Set<string>[]): string[] {
  const frequency = new Map<string, number>()
  memberIndices.forEach((member) => {
    tokenSets[member]?.forEach((token) => {
      frequency.set(token, (frequency.get(token) ?? 0) + 1)
    })
  })

  const ranked = [...frequency].sort(
    ([tokenA, countA], [tokenB, countB]) => countB - countA || tokenA.localeCompare(tokenB),
  )
  return ranked.slice(0, 10).map((entry) => entry[0])
}
|
|
296
|
+
|
|
297
|
+
/**
 * Appends the findings at the given bucket positions to `target`, in the
 * order the positions are listed. Positions with no finding are skipped.
 */
function pushSingletons(
  target: ClusterFinding[],
  bucket: ClusterFinding[],
  indices: number[],
): void {
  indices.forEach((position) => {
    const candidate = bucket[position]
    if (!candidate) return
    target.push(candidate)
  })
}
|
|
307
|
+
|
|
308
|
+
/**
 * Groups related findings per category using token Jaccard links and
 * union-find connected components, then peels low-cohesion outliers.
 *
 * Per category (processed in `localeCompare` order):
 * 1. Findings whose similarity is at least `config.linkThreshold` are linked,
 *    and each connected component becomes a candidate cluster.
 * 2. Members whose similarity to the component medoid is below
 *    `config.cohesionMinSimilarity` are moved to the singletons.
 * 3. A candidate with fewer than `config.minClusterSize` members left is
 *    dissolved into singletons; otherwise it is emitted with a recomputed
 *    medoid and the next sequential id (starting at 1).
 *
 * @param findings Findings to cluster; the array is not modified.
 * @param config Thresholds; defaults to `DEFAULT_CLUSTER_CONFIG`.
 * @returns Clusters, leftover singletons, and summary statistics.
 * @throws Error if the medoid cannot be located among the kept members
 *   (an internal invariant violation).
 */
export function clusterFindings(
  findings: ClusterFinding[],
  config: ClusterConfig = DEFAULT_CLUSTER_CONFIG,
): ClusterResult {
  const clusters: FindingCluster[] = []
  const singletons: ClusterFinding[] = []

  const buckets = new Map<string, ClusterFinding[]>()
  for (const finding of findings) {
    const bucket = buckets.get(finding.category)
    if (bucket) {
      bucket.push(finding)
    } else {
      buckets.set(finding.category, [finding])
    }
  }

  // Counts are derived from the Map instead of incrementing a plain object:
  // a category named "constructor", "toString" or "__proto__" would otherwise
  // read an inherited Object.prototype member and corrupt or lose its count.
  const categoryCounts: Record<string, number> = Object.fromEntries(
    Array.from(buckets, ([category, bucket]): [string, number] => [category, bucket.length]),
  )

  const orderedCategories = Array.from(buckets.keys()).sort((left, right) =>
    left.localeCompare(right),
  )

  let nextClusterId = 1
  for (const category of orderedCategories) {
    const bucket = buckets.get(category) ?? []
    // Too few findings to ever form a cluster in this category.
    if (bucket.length < config.minClusterSize) {
      singletons.push(...bucket)
      continue
    }

    const tokenSets = computeTokenSets(bucket)
    const similarityMatrix = buildSimilarityMatrix(tokenSets)

    // Link every sufficiently similar pair; components are the candidates.
    const uf = new UnionFind(bucket.length)
    for (let i = 0; i < bucket.length; i += 1) {
      for (let j = i + 1; j < bucket.length; j += 1) {
        if ((similarityMatrix[i]?.[j] ?? 0) < config.linkThreshold) continue
        uf.union(i, j)
      }
    }

    for (const memberIndices of uf.components().values()) {
      if (memberIndices.length < config.minClusterSize) {
        pushSingletons(singletons, bucket, memberIndices)
        continue
      }

      // Peel members that are only transitively linked and sit too far
      // from the component's medoid.
      const initialMedoid = medoidForMembers(memberIndices, similarityMatrix)
      const keptIndices: number[] = []
      const peeledIndices: number[] = []
      for (const index of memberIndices) {
        if (index === initialMedoid) {
          keptIndices.push(index)
          continue
        }
        const similarityToMedoid = similarityMatrix[initialMedoid]?.[index] ?? 0
        if (similarityToMedoid >= config.cohesionMinSimilarity) {
          keptIndices.push(index)
        } else {
          peeledIndices.push(index)
        }
      }
      pushSingletons(singletons, bucket, peeledIndices)

      if (keptIndices.length < config.minClusterSize) {
        pushSingletons(singletons, bucket, keptIndices)
        continue
      }

      // The medoid may shift once outliers are gone, so recompute it.
      const finalMedoid = medoidForMembers(keptIndices, similarityMatrix)
      const members = keptIndices
        .map((index) => bucket[index])
        .filter((finding): finding is ClusterFinding => Boolean(finding))
      if (members.length < config.minClusterSize) {
        singletons.push(...members)
        continue
      }

      const medoidIndex = Math.max(0, keptIndices.indexOf(finalMedoid))
      const medoid = members.at(medoidIndex)
      if (!medoid) throw new Error("Medoid index out of bounds — this should not happen")

      clusters.push({
        id: nextClusterId,
        category,
        members,
        medoid,
        medoidIndex,
        topTokens: topTokensForMembers(keptIndices, tokenSets),
        avgInternalSimilarity: averageInternalSimilarity(keptIndices, similarityMatrix),
        size: members.length,
      })
      nextClusterId += 1
    }
  }

  const largestCluster = clusters.reduce((max, cluster) => Math.max(max, cluster.size), 0)
  const avgClusterSize =
    clusters.length === 0
      ? 0
      : clusters.reduce((total, cluster) => total + cluster.size, 0) / clusters.length

  return {
    clusters,
    singletons,
    stats: {
      totalFindings: findings.length,
      totalClusters: clusters.length,
      totalSingletons: singletons.length,
      categoryCounts,
      largestCluster,
      avgClusterSize,
    },
  }
}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import type { SkillDoc } from "./normalize"
|
|
2
|
+
import type { SimilarityPair, SimilarityScore } from "./similarity"
|
|
3
|
+
|
|
4
|
+
/** Outcome tier of a duplicate-detection gate, listed from most to least severe. */
export type GateLevel = "block" | "warn" | "info" | "pass"

/** Result of evaluating one pair of skills. */
export interface GateVerdict {
  /** Tier the pair landed in. */
  level: GateLevel
  /** Human-readable explanation of why that tier was chosen. */
  reason: string
}
|
|
10
|
+
|
|
11
|
+
/** Thresholds and switches controlling how skill pairs are graded. */
export interface GateConfig {
  /** Composite score at or above which a pair is graded "block". */
  blockThreshold: number
  /** Composite score at or above which a pair is graded "warn". */
  warnThreshold: number
  /** Composite score at or above which a pair is graded "info". */
  infoThreshold: number
  /** When true, pairs sharing an identical detection rule are always graded "block". */
  blockExactRegexConflict: boolean
}
|
|
17
|
+
|
|
18
|
+
/** Outcome of grading every skill pair. */
export interface SkillReport {
  /** Number of skill documents that were analysed. */
  totalSkills: number
  /** One entry per pair that was not graded "pass". */
  findings: {
    /** Name of the first skill in the pair. */
    skillA: string
    /** Name of the second skill in the pair. */
    skillB: string
    /** Similarity breakdown for the pair. */
    score: SimilarityScore
    /** Grade and explanation for the pair. */
    verdict: GateVerdict
  }[]
  /** Number of findings per reported level. */
  summary: { block: number; warn: number; info: number }
}
|
|
28
|
+
|
|
29
|
+
/** Gate settings used when no explicit config is supplied. */
export const DEFAULT_GATE_CONFIG: GateConfig = {
  // Composite score tiers, checked from highest to lowest.
  blockThreshold: 0.9,
  warnThreshold: 0.78,
  infoThreshold: 0.65,
  // Identical detection rules are blocked regardless of composite score.
  blockExactRegexConflict: true,
}
|
|
35
|
+
|
|
36
|
+
/** Sort rank per gate level: a lower number is listed earlier in a report. */
const LEVEL_ORDER: Record<GateLevel, number> = {
  block: 0,
  warn: 1,
  info: 2,
  pass: 3,
}
|
|
42
|
+
|
|
43
|
+
/** Renders a score with exactly two digits after the decimal point. */
function formatScore(score: number): string {
  const fractionDigits = 2
  return score.toFixed(fractionDigits)
}
|
|
46
|
+
|
|
47
|
+
/**
 * Canonicalises a detection rule for exact comparison: every run of
 * whitespace becomes a single space, and leading/trailing whitespace is removed.
 */
function normalizeRegex(rule: string): string {
  const collapsed = rule.split(/\s+/).join(" ")
  return collapsed.trim()
}
|
|
50
|
+
|
|
51
|
+
/**
 * Builds an order-independent lookup key for a pair of skill names:
 * the two names in ascending string order, joined by "|||".
 */
function pairKey(skillA: string, skillB: string): string {
  const [first, second] = skillA < skillB ? [skillA, skillB] : [skillB, skillA]
  return [first, second].join("|||")
}
|
|
54
|
+
|
|
55
|
+
/**
 * Summarises the two strongest component signals of a score, e.g.
 * `"body TF-IDF 0.91, detection rules 0.80"`. Ties keep declaration order.
 */
function topSignals(score: SimilarityScore): string {
  const ranked: Array<[label: string, value: number]> = [
    ["body TF-IDF", score.bodyTfidf],
    ["body shingles", score.bodyShingle],
    ["name/description", score.nameDesc],
    ["detection rules", score.detectionRules],
  ]
  // Strongest signal first.
  ranked.sort((left, right) => right[1] - left[1])

  const strongest = ranked.slice(0, 2)
  return strongest.map(([label, value]) => `${label} ${formatScore(value)}`).join(", ")
}
|
|
69
|
+
|
|
70
|
+
/**
 * Returns the given score, or a stand-in when the pair has none.
 * The stand-in sets `composite` and `detectionRules` to 1 and every other
 * signal to 0, reflecting that only the rule overlap is known.
 */
function scoreForConflict(score: SimilarityScore | undefined): SimilarityScore {
  if (!score) {
    return {
      composite: 1,
      bodyTfidf: 0,
      bodyShingle: 0,
      nameDesc: 0,
      detectionRules: 1,
    }
  }
  return score
}
|
|
81
|
+
|
|
82
|
+
/**
 * Grades one skill pair by its composite similarity score.
 *
 * Thresholds are checked from most to least severe and the first one the
 * score meets or exceeds decides the level; a score below all of them is
 * graded "pass". The reason always includes the composite score and the two
 * strongest signals.
 *
 * @param pair Pair of skills with its similarity score.
 * @param config Thresholds; defaults to `DEFAULT_GATE_CONFIG`.
 */
export function evaluatePair(
  pair: SimilarityPair,
  config: GateConfig = DEFAULT_GATE_CONFIG,
): GateVerdict {
  const { composite } = pair.score
  const detail = `composite ${formatScore(composite)}; top signals: ${topSignals(pair.score)}`

  const tiers: Array<[threshold: number, level: GateLevel, label: string]> = [
    [config.blockThreshold, "block", "Duplicate risk"],
    [config.warnThreshold, "warn", "Near-duplicate risk"],
    [config.infoThreshold, "info", "Related skills"],
  ]

  for (const [threshold, level, label] of tiers) {
    if (composite >= threshold) {
      return { level, reason: `${label}: ${detail}` }
    }
  }

  return { level: "pass", reason: `Below thresholds: ${detail}` }
}
|
|
104
|
+
|
|
105
|
+
/**
 * Finds every pair of differently named skills that share an identical
 * detection rule after whitespace normalisation.
 *
 * One entry is produced per shared rule, so a pair sharing several rules
 * appears several times. Rules that normalise to an empty string are ignored.
 *
 * @param docs Skill documents to compare pairwise.
 * @returns Conflicts in pair order (`i < j`), then in rule order of the first skill.
 */
export function checkExactRegexConflicts(
  docs: SkillDoc[],
): Array<{ skillA: string; skillB: string; sharedRegex: string }> {
  const conflicts: Array<{ skillA: string; skillB: string; sharedRegex: string }> = []

  // Normalise each document's rules once up front; doing it inside the pair
  // loop would repeat the same work for every partner document.
  const normalizedRules = docs.map(
    (doc) => new Set(doc.detectionRules.map(normalizeRegex).filter((rule) => rule.length > 0)),
  )

  for (let i = 0; i < docs.length; i += 1) {
    const docA = docs[i]
    const rulesA = normalizedRules[i]
    if (!docA || !rulesA) continue

    for (let j = i + 1; j < docs.length; j += 1) {
      const docB = docs[j]
      const rulesB = normalizedRules[j]
      if (!docB || !rulesB) continue
      // Entries with the same name are treated as the same skill.
      if (docA.name === docB.name) continue

      for (const sharedRegex of rulesA) {
        if (!rulesB.has(sharedRegex)) continue
        conflicts.push({
          skillA: docA.name,
          skillB: docB.name,
          sharedRegex,
        })
      }
    }
  }

  return conflicts
}
|
|
139
|
+
|
|
140
|
+
/**
 * Builds the duplicate-detection report for a set of skills.
 *
 * Every pair not graded "pass" becomes a finding. When
 * `config.blockExactRegexConflict` is on, each exact detection-rule conflict
 * forces its pair to "block": an existing finding has its verdict replaced,
 * and a pair without one gets a new finding (using its known score, or a
 * stand-in score if the pair was never scored). Findings are sorted by level
 * severity, then by composite score descending.
 *
 * @param docs Skill documents, used for the total count and rule conflicts.
 * @param pairs Scored skill pairs.
 * @param config Gate settings; defaults to `DEFAULT_GATE_CONFIG`.
 */
export function generateReport(
  docs: SkillDoc[],
  pairs: SimilarityPair[],
  config: GateConfig = DEFAULT_GATE_CONFIG,
): SkillReport {
  type ReportFinding = SkillReport["findings"][number]

  const findings: ReportFinding[] = []
  const scoreByKey = new Map<string, SimilarityScore>()

  pairs.forEach((pair) => {
    scoreByKey.set(pairKey(pair.skillA, pair.skillB), pair.score)

    const verdict = evaluatePair(pair, config)
    if (verdict.level !== "pass") {
      findings.push({ skillA: pair.skillA, skillB: pair.skillB, score: pair.score, verdict })
    }
  })

  if (config.blockExactRegexConflict) {
    // Latest finding per pair, so a conflict can overwrite its verdict in place.
    const findingByKey = new Map<string, ReportFinding>()
    for (const finding of findings) {
      findingByKey.set(pairKey(finding.skillA, finding.skillB), finding)
    }

    for (const { skillA, skillB, sharedRegex } of checkExactRegexConflicts(docs)) {
      const key = pairKey(skillA, skillB)
      const verdict: GateVerdict = {
        level: "block",
        reason: `Exact detection rule conflict: ${sharedRegex}`,
      }

      const known = findingByKey.get(key)
      if (known) {
        known.verdict = verdict
        continue
      }

      const added: ReportFinding = {
        skillA,
        skillB,
        score: scoreForConflict(scoreByKey.get(key)),
        verdict,
      }
      findings.push(added)
      findingByKey.set(key, added)
    }
  }

  // Most severe level first; within a level, highest composite first.
  findings.sort(
    (left, right) =>
      LEVEL_ORDER[left.verdict.level] - LEVEL_ORDER[right.verdict.level] ||
      right.score.composite - left.score.composite,
  )

  const summary = { block: 0, warn: 0, info: 0 }
  for (const { verdict } of findings) {
    if (verdict.level !== "pass") summary[verdict.level] += 1
  }

  return { totalSkills: docs.length, findings, summary }
}
|
|
210
|
+
|
|
211
|
+
/**
 * Renders a report as plain text: one summary line with the totals,
 * followed by one line per finding in report order.
 */
export function formatReportText(report: SkillReport): string {
  const { totalSkills, summary, findings } = report
  const header = `Skills: ${totalSkills} | Blocks: ${summary.block} | Warnings: ${summary.warn} | Info: ${summary.info}`

  const rows = findings.map(
    ({ skillA, skillB, score, verdict }) =>
      `[${verdict.level.toUpperCase()}] ${skillA} ↔ ${skillB} (${formatScore(score.composite)}) — ${verdict.reason}`,
  )

  return [header, ...rows].join("\n")
}
|
|
224
|
+
|
|
225
|
+
/** Serialises a report as pretty-printed JSON with two-space indentation. */
export function formatReportJson(report: SkillReport): string {
  const indent = 2
  return JSON.stringify(report, null, indent)
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
export {
|
|
2
|
+
type ClusterConfig,
|
|
3
|
+
type ClusterFinding,
|
|
4
|
+
type ClusterResult,
|
|
5
|
+
clusterFindings,
|
|
6
|
+
DEFAULT_CLUSTER_CONFIG,
|
|
7
|
+
type FindingCluster,
|
|
8
|
+
} from "./cluster"
|
|
9
|
+
export {
|
|
10
|
+
checkExactRegexConflicts,
|
|
11
|
+
DEFAULT_GATE_CONFIG,
|
|
12
|
+
evaluatePair,
|
|
13
|
+
formatReportJson,
|
|
14
|
+
formatReportText,
|
|
15
|
+
type GateConfig,
|
|
16
|
+
type GateLevel,
|
|
17
|
+
type GateVerdict,
|
|
18
|
+
generateReport,
|
|
19
|
+
type SkillReport,
|
|
20
|
+
} from "./gates"
|
|
21
|
+
export { normalizeSkill, type SkillDoc } from "./normalize"
|
|
22
|
+
export {
|
|
23
|
+
buildTfidfCorpus,
|
|
24
|
+
computeAllPairs,
|
|
25
|
+
computeSimilarity,
|
|
26
|
+
detectionRuleOverlap,
|
|
27
|
+
type SimilarityPair,
|
|
28
|
+
type SimilarityScore,
|
|
29
|
+
shingleJaccard,
|
|
30
|
+
type TfidfCorpus,
|
|
31
|
+
tfidfCosine,
|
|
32
|
+
tokenJaccard,
|
|
33
|
+
} from "./similarity"
|