@opensip-cli/clone-detection 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ /**
2
+ * Near-duplicate body detection — LSH-banded MinHash, the single implementation graph's
3
+ * `near-duplicate-function-body` rule consumes (ADR-0064). Pure over `CloneCandidate[]`;
4
+ * emits tool-agnostic clusters (no `Signal`). Candidate pairs exclude identical
5
+ * `bodyHash` edges and cross-language false positives (via the pre-resolved
6
+ * `candidate.language`).
7
+ */
8
+ import { isEligibleKind } from './find-duplicate-bodies.js';
9
+ import { NEAR_DUP_LSH_BANDS, NEAR_DUP_SIGNATURE_K, estimateJaccard, lshBandHashes, } from './near-duplicate-signature.js';
10
+ const DEFAULT_MIN_SIMILARITY = 0.85; // fallback similarity threshold when `opts.minSimilarity` is not supplied
11
+ const DEFAULT_MIN_BODY_SIZE = 200; // fallback for `opts.minBodySize`; candidates with a known smaller `bodySize` are skipped
12
+ const MAX_CLUSTER_SIZE = 50; // connected components with more members than this emit no cluster
13
/**
 * Detect near-duplicate body clusters (estimated Jaccard ≥ `minSimilarity`, same
 * language, distinct exact hash). Returns one cluster per connected component with
 * ≥2 near members.
 *
 * @param candidates Clone candidates to compare.
 * @param opts Optional overrides: `minSimilarity`, `minBodySize`, `lshBands`.
 *   `lshBands` must be a positive integer that divides `NEAR_DUP_SIGNATURE_K` evenly.
 * @returns The emitted clusters; an empty array when the band configuration is
 *   invalid or fewer than two candidates are eligible.
 */
export function findNearDuplicates(candidates, opts = {}) {
  const minSimilarity = opts.minSimilarity ?? DEFAULT_MIN_SIMILARITY;
  const minBodySize = opts.minBodySize ?? DEFAULT_MIN_BODY_SIZE;
  const bands = opts.lshBands ?? NEAR_DUP_LSH_BANDS;
  // The band count itself must be a positive integer. A fractional count such as
  // 0.5 would otherwise slip through the `rows` test below (128 / 0.5 === 256 is
  // an integer) and run LSH with a band wider than the whole signature.
  if (!Number.isInteger(bands) || bands < 1) {
    return [];
  }
  const rows = NEAR_DUP_SIGNATURE_K / bands;
  // bands MUST divide k evenly — otherwise `rows` is fractional and band slicing is
  // misaligned. `rows * bands === k` alone does not catch this (128/7*7 round-trips to
  // 128 in IEEE-754), so test integrality.
  if (!Number.isInteger(rows) || rows < 1) {
    return [];
  }
  const eligible = collectEligible(candidates, minBodySize);
  if (eligible.length < 2) {
    return [];
  }
  const edges = buildNearEdges(eligible, minSimilarity, bands, rows);
  const components = clusterComponents(eligible.length, edges);
  return emitComponentClusters(eligible, components, edges);
}
34
/**
 * Keep the candidates that can take part in near-duplicate comparison: an eligible
 * kind, a body signature of exactly `NEAR_DUP_SIGNATURE_K` values, and a body size
 * that is either unknown or at least `minBodySize`.
 */
function collectEligible(candidates, minBodySize) {
  const qualifies = (candidate) =>
    isEligibleKind(candidate) &&
    candidate.bodySignature?.length === NEAR_DUP_SIGNATURE_K &&
    // An undefined size is let through; only a known, too-small size is rejected.
    !(candidate.bodySize !== undefined && candidate.bodySize < minBodySize);
  return [...candidates].filter((candidate) => qualifies(candidate));
}
47
/**
 * Build the list of near-duplicate edges: index the eligible candidates into LSH
 * buckets, then let every bucket contribute its qualifying pairs. One shared
 * `seenPairs` set is handed to every bucket call.
 */
function buildNearEdges(eligible, minSimilarity, bands, rows) {
  const edges = [];
  const seenPairs = new Set();
  indexLshBuckets(eligible, bands, rows).forEach((bucket) => {
    collectBucketEdges(bucket, eligible, minSimilarity, seenPairs, edges);
  });
  return edges;
}
56
/**
 * Group candidate indices by LSH bucket. The bucket key is `<band>:<bandHash>`, so
 * two candidates share a bucket only when the same band hashes identically.
 * Candidates without a body signature are skipped.
 */
function indexLshBuckets(eligible, bands, rows) {
  const buckets = new Map();
  for (let index = 0; index < eligible.length; index++) {
    const signature = eligible[index].bodySignature;
    if (!signature) {
      continue;
    }
    const hashes = lshBandHashes(signature, bands, rows);
    for (let band = 0; band < hashes.length; band++) {
      const key = `${String(band)}:${hashes[band] ?? ''}`;
      const bucket = buckets.get(key);
      if (bucket === undefined) {
        buckets.set(key, [index]);
      } else {
        bucket.push(index);
      }
    }
  }
  return buckets;
}
71
/**
 * Evaluate every unordered pair of candidate indices inside one bucket and append
 * the qualifying edges to `edges`. `seenPairs` records each pair the first time it
 * is met, so a pair that shares several buckets is evaluated only once.
 */
function collectBucketEdges(indices, eligible, minSimilarity, seenPairs, edges) {
  const count = indices.length;
  if (count < 2) {
    return;
  }
  for (let left = 0; left < count - 1; left++) {
    const first = indices[left];
    for (let right = left + 1; right < count; right++) {
      const second = indices[right];
      if (first === undefined || second === undefined) {
        continue;
      }
      const key = pairKey(first, second);
      if (!seenPairs.has(key)) {
        // Mark the pair before evaluating it so a rejected pair is not retried.
        seenPairs.add(key);
        const edge = tryNearEdge(first, second, eligible, minSimilarity);
        if (edge) {
          edges.push(edge);
        }
      }
    }
  }
}
90
/**
 * Decide whether candidates `ai` and `bi` form a near-duplicate edge.
 *
 * Returns `{ a, b, similarity }` when both have a signature, their exact body
 * hashes differ, both carry the same defined `language`, and the estimated Jaccard
 * similarity is not below `minSimilarity`; otherwise returns `undefined`.
 */
function tryNearEdge(ai, bi, eligible, minSimilarity) {
  const left = eligible[ai];
  const right = eligible[bi];
  const bothSigned = Boolean(left?.bodySignature) && Boolean(right?.bodySignature);
  if (!bothSigned) {
    return undefined;
  }
  // Exact duplicates are not reported as near-duplicates.
  if (left.bodyHash === right.bodyHash) {
    return undefined;
  }
  // Same-language gate: an undefined language on either side skips the pair.
  const sameLanguage =
    left.language !== undefined &&
    right.language !== undefined &&
    left.language === right.language;
  if (!sameLanguage) {
    return undefined;
  }
  const similarity = estimateJaccard(left.bodySignature, right.bodySignature);
  return similarity < minSimilarity ? undefined : { a: ai, b: bi, similarity };
}
109
/**
 * Partition the indices `0..size-1` into connected components induced by `edges`
 * and return only the components with at least two members. Members are listed in
 * ascending index order; components appear in order of their smallest member.
 */
function clusterComponents(size, edges) {
  const sets = new UnionFind(size);
  edges.forEach(({ a, b }) => {
    sets.union(a, b);
  });
  const membersByRoot = new Map();
  for (let node = 0; node < size; node++) {
    const root = sets.find(node);
    const group = membersByRoot.get(root);
    if (group) {
      group.push(node);
    } else {
      membersByRoot.set(root, [node]);
    }
  }
  const components = [];
  for (const group of membersByRoot.values()) {
    if (group.length >= 2) {
      components.push(group);
    }
  }
  return components;
}
122
/**
 * Turn each connected component into a cluster record, dropping the components for
 * which no cluster is produced. The pair → similarity index is built once and
 * shared by every component.
 */
function emitComponentClusters(eligible, components, edges) {
  const similarityByPair = buildEdgeSimilarityIndex(edges);
  return components
    .map((component) => buildComponentCluster(eligible, component, edges, similarityByPair))
    .filter((cluster) => Boolean(cluster));
}
132
/**
 * Index edge similarities by pair key. When the same pair appears more than
 * once, the highest similarity is kept.
 */
function buildEdgeSimilarityIndex(edges) {
  const index = new Map();
  for (const { a, b, similarity } of edges) {
    const key = pairKey(a, b);
    const best = index.get(key);
    const improves = best === undefined || similarity > best;
    if (improves) {
      index.set(key, similarity);
    }
  }
  return index;
}
142
/**
 * Assemble the cluster record for one connected component.
 *
 * Returns `undefined` when the component has more than `MAX_CLUSTER_SIZE` members
 * or fewer than two near members. Otherwise the record carries the anchor (the
 * member lowest by location), the sorted names of near members, the sorted names of
 * members sharing an exact hash, the highest edge similarity, and the component size.
 */
function buildComponentCluster(eligible, component, edges, edgeByPair) {
  if (component.length > MAX_CLUSTER_SIZE) {
    return undefined;
  }
  const nearIndices = nearIndicesInComponent(component, edges);
  if (nearIndices.size < 2) {
    return undefined;
  }

  const members = [];
  for (const index of component) {
    const occurrence = eligible[index];
    if (occurrence) {
      members.push(occurrence);
    }
  }

  const nearMembers = [];
  for (const index of nearIndices) {
    const name = eligible[index]?.qualifiedName;
    if (name !== undefined) {
      nearMembers.push(name);
    }
  }
  nearMembers.sort();

  return {
    anchor: lowestByLocation(members),
    nearMembers,
    exactMembers: exactMembersInComponent(members),
    estimatedSimilarity: maxSimilarityAmong(nearIndices, edgeByPair),
    clusterSize: component.length,
  };
}
164
/**
 * Collect the indices that take part in at least one near-duplicate edge lying
 * inside `component`.
 *
 * An edge counts only when BOTH endpoints belong to the component. The previous
 * `&&` test skipped an edge only when neither endpoint was a member, so an edge
 * with a single member endpoint would have added the foreign index as well.
 *
 * @param component Candidate indices forming one connected component.
 * @param edges All near-duplicate edges (`{ a, b, similarity }`).
 * @returns Set of member indices that are an endpoint of an in-component edge.
 */
function nearIndicesInComponent(component, edges) {
  const componentSet = new Set(component);
  const nearIndices = new Set();
  for (const { a, b } of edges) {
    if (!componentSet.has(a) || !componentSet.has(b)) {
      continue;
    }
    nearIndices.add(a);
    nearIndices.add(b);
  }
  return nearIndices;
}
175
/**
 * Sorted qualified names of the members whose `bodyHash` is shared with at least
 * one other member of the same component.
 */
function exactMembersInComponent(members) {
  const tally = new Map();
  members.forEach(({ bodyHash }) => {
    tally.set(bodyHash, (tally.get(bodyHash) ?? 0) + 1);
  });
  const names = [];
  for (const member of members) {
    const occurrences = tally.get(member.bodyHash) ?? 0;
    if (occurrences > 1) {
      names.push(member.qualifiedName);
    }
  }
  return names.sort();
}
184
/**
 * Highest recorded similarity over all pairs of near indices; `0` when no pair has
 * an entry in `edgeByPair`.
 */
function maxSimilarityAmong(nearIndices, edgeByPair) {
  // Ascending order guarantees every pair is looked up as (smaller, larger).
  const ordered = [...nearIndices].sort((x, y) => x - y);
  let best = 0;
  for (let p = 0; p < ordered.length; p++) {
    for (let q = p + 1; q < ordered.length; q++) {
      const similarity = edgeByPair.get(pairKey(ordered[p], ordered[q]));
      if (similarity !== undefined && similarity > best) {
        best = similarity;
      }
    }
  }
  return best;
}
197
/**
 * Pick the occurrence that comes first by `filePath`, then `line`, then `column`.
 * On a complete tie the earlier element of `occs` wins. `occs` must be non-empty
 * (the underlying `reduce` has no initial value).
 */
function lowestByLocation(occs) {
  // True when `candidate` sorts strictly before `current`.
  const precedes = (candidate, current) => {
    if (candidate.filePath < current.filePath) {
      return true;
    }
    if (candidate.filePath > current.filePath) {
      return false;
    }
    if (candidate.line < current.line) {
      return true;
    }
    if (candidate.line > current.line) {
      return false;
    }
    return candidate.column < current.column;
  };
  return occs.reduce((lowest, occ) => (precedes(occ, lowest) ? occ : lowest));
}
210
/** Order-independent key for a pair of indices: `<smaller>:<larger>`. */
function pairKey(a, b) {
  const [low, high] = a < b ? [a, b] : [b, a];
  return `${String(low)}:${String(high)}`;
}
213
/**
 * Disjoint-set forest over the integers `0..size-1`, with path compression in
 * `find` and union by rank in `union`.
 */
class UnionFind {
  parent;
  rank;

  /** Start with every element as the root of its own singleton set. */
  constructor(size) {
    this.parent = Array.from({ length: size }).map((_unused, index) => index);
    this.rank = Array.from({ length: size }).fill(0);
  }

  /**
   * Return the representative of the set containing `x`, re-pointing every node on
   * the walked path directly at that representative. An index with no parent entry
   * is treated as its own representative.
   */
  find(x) {
    // Pass 1: walk up to the representative.
    let root = x;
    for (;;) {
      const up = this.parent[root];
      if (up === undefined || up === root) {
        break;
      }
      root = up;
    }
    // Pass 2: compress the path that was just walked.
    let node = x;
    while (node !== root) {
      const next = this.parent[node];
      this.parent[node] = root;
      node = next;
    }
    return root;
  }

  /**
   * Merge the sets containing `a` and `b`. The root of lower rank is attached under
   * the other; on equal rank `b`'s root goes under `a`'s root, whose rank grows by one.
   */
  union(a, b) {
    const ra = this.find(a);
    const rb = this.find(b);
    if (ra === rb) {
      return;
    }
    const rankA = this.rank[ra] ?? 0;
    const rankB = this.rank[rb] ?? 0;
    if (rankA < rankB) {
      this.parent[ra] = rb;
      return;
    }
    this.parent[rb] = ra;
    if (rankA === rankB) {
      this.rank[ra] = rankA + 1;
    }
  }
}
247
+ //# sourceMappingURL=find-near-duplicates.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"find-near-duplicates.js","sourceRoot":"","sources":["../src/find-near-duplicates.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EACL,kBAAkB,EAClB,oBAAoB,EACpB,eAAe,EACf,aAAa,GACd,MAAM,+BAA+B,CAAC;AAIvC,MAAM,sBAAsB,GAAG,IAAI,CAAC;AACpC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,gBAAgB,GAAG,EAAE,CAAC;AAE5B;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAChC,UAAqC,EACrC,OAAoB,EAAE;IAEtB,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,sBAAsB,CAAC;IACnE,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,qBAAqB,CAAC;IAC9D,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,IAAI,kBAAkB,CAAC;IAClD,MAAM,IAAI,GAAG,oBAAoB,GAAG,KAAK,CAAC;IAC1C,kFAAkF;IAClF,qFAAqF;IACrF,yCAAyC;IACzC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,IAAI,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnD,MAAM,QAAQ,GAAG,eAAe,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;IAC1D,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnC,MAAM,KAAK,GAAG,cAAc,CAAC,QAAQ,EAAE,aAAa,EAAE,KAAK,EAAE,IAAI,CAAC,CAAC;IACnE,MAAM,UAAU,GAAG,iBAAiB,CAAC,QAAQ,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IAC7D,OAAO,qBAAqB,CAAC,QAAQ,EAAE,UAAU,EAAE,KAAK,CAAC,CAAC;AAC5D,CAAC;AAED,SAAS,eAAe,CACtB,UAAqC,EACrC,WAAmB;IAEnB,MAAM,GAAG,GAAqB,EAAE,CAAC;IACjC,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC;YAAE,SAAS;QACnC,IAAI,GAAG,CAAC,aAAa,EAAE,MAAM,KAAK,oBAAoB;YAAE,SAAS;QACjE,IAAI,GAAG,CAAC,QAAQ,KAAK,SAAS,IAAI,GAAG,CAAC,QAAQ,GAAG,WAAW;YAAE,SAAS;QACvE,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAQD,SAAS,cAAc,CACrB,QAAmC,EACnC,aAAqB,EACrB,KAAa,EACb,IAAY;IAEZ,MAAM,OAAO,GAAG,eAAe,CAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,CAAC,CAAC;IACvD,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,MAAM,SAAS,GAAG,IAAI,GAAG,EAAU,CAAC;IACpC,KAAK,MAAM,OAAO,IAAI,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;QACvC,kBAAkB,CAAC,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC;IACzE,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,eAAe,CACtB,QAAmC,EACnC,KAAa,EACb,IAAY;IAEZ,MAAM,OAAO,GAAG,IAAI,GAAG,EAAoB,CAAC;IAC5C,KAAK,MAAM,CAAC,CAAC,EAAE,GAAG,CAAC,IAAI,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC;QAC1C,IAAI,
CAAC,GAAG,CAAC,aAAa;YAAE,SAAS;QACjC,MAAM,UAAU,GAAG,aAAa,CAAC,GAAG,CAAC,aAAa,EAAE,KAAK,EAAE,IAAI,CAAC,CAAC;QACjE,KAAK,MAAM,CAAC,IAAI,EAAE,QAAQ,CAAC,IAAI,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC;YACpD,MAAM,GAAG,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,QAAQ,IAAI,EAAE,EAAE,CAAC;YAChD,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;YACpC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,kBAAkB,CACzB,OAA0B,EAC1B,QAAmC,EACnC,aAAqB,EACrB,SAAsB,EACtB,KAAiB;IAEjB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO;IAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,MAAM,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;YACtB,MAAM,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;YACtB,IAAI,EAAE,KAAK,SAAS,IAAI,EAAE,KAAK,SAAS;gBAAE,SAAS;YACnD,MAAM,GAAG,GAAG,OAAO,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;YAC5B,IAAI,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,SAAS;YACjC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACnB,MAAM,IAAI,GAAG,WAAW,CAAC,EAAE,EAAE,EAAE,EAAE,QAAQ,EAAE,aAAa,CAAC,CAAC;YAC1D,IAAI,IAAI;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAClB,EAAU,EACV,EAAU,EACV,QAAmC,EACnC,aAAqB;IAErB,MAAM,CAAC,GAAG,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvB,MAAM,CAAC,GAAG,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvB,IAAI,CAAC,CAAC,EAAE,aAAa,IAAI,CAAC,CAAC,EAAE,aAAa;QAAE,OAAO,SAAS,CAAC;IAC7D,IAAI,CAAC,CAAC,QAAQ,KAAK,CAAC,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAEhD,8EAA8E;IAC9E,oFAAoF;IACpF,wCAAwC;IACxC,MAAM,KAAK,GAAG,CAAC,CAAC,QAAQ,CAAC;IACzB,MAAM,KAAK,GAAG,CAAC,CAAC,QAAQ,CAAC;IACzB,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,KAAK;QAAE,OAAO,SAAS,CAAC;IAEpF,MAAM,UAAU,GAAG,eAAe,CAAC,CAAC,CAAC,aAAa,EAAE,CAAC,CAAC,aAAa,CAAC,CAAC;IACrE,IAAI,UAAU,GAAG,aAAa;QAAE,OAAO,SAAS,CAAC;IACjD,OAAO,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,UAAU,EAAE,CAAC;AACtC,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,KAA0B;IACjE,MAAM,EAAE,GAAG,IAA
I,SAAS,CAAC,IAAI,CAAC,CAAC;IAC/B,KAAK,MAAM,CAAC,IAAI,KAAK;QAAE,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAE1C,MAAM,MAAM,GAAG,IAAI,GAAG,EAAoB,CAAC;IAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxB,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;QACpC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACb,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IACzB,CAAC;IACD,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;AAC3D,CAAC;AAED,SAAS,qBAAqB,CAC5B,QAAmC,EACnC,UAA+B,EAC/B,KAA0B;IAE1B,MAAM,UAAU,GAAG,wBAAwB,CAAC,KAAK,CAAC,CAAC;IACnD,MAAM,QAAQ,GAA2B,EAAE,CAAC;IAC5C,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,OAAO,GAAG,qBAAqB,CAAC,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC;QAC9E,IAAI,OAAO;YAAE,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACtC,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,wBAAwB,CAAC,KAA0B;IAC1D,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC7C,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,IAAI,KAAK,SAAS,IAAI,CAAC,CAAC,UAAU,GAAG,IAAI;YAAE,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,UAAU,CAAC,CAAC;IACnF,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED,SAAS,qBAAqB,CAC5B,QAAmC,EACnC,SAA4B,EAC5B,KAA0B,EAC1B,UAAuC;IAEvC,MAAM,WAAW,GAAG,sBAAsB,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;IAC7D,IAAI,WAAW,CAAC,IAAI,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC3C,IAAI,SAAS,CAAC,MAAM,GAAG,gBAAgB;QAAE,OAAO,SAAS,CAAC;IAE1D,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAuB,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1F,MAAM,MAAM,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;IACzC,MAAM,WAAW,GAAG,CAAC,GAAG,WAAW,CAAC;SACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,aAAa,CAAC;SACtC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC;SAC3C,IAAI,EAAE,CAAC;IACV,MA
AM,YAAY,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;IACtD,MAAM,MAAM,GAAG,kBAAkB,CAAC,WAAW,EAAE,UAAU,CAAC,CAAC;IAE3D,OAAO;QACL,MAAM;QACN,WAAW;QACX,YAAY;QACZ,mBAAmB,EAAE,MAAM;QAC3B,WAAW,EAAE,SAAS,CAAC,MAAM;KAC9B,CAAC;AACJ,CAAC;AAED,SAAS,sBAAsB,CAC7B,SAA4B,EAC5B,KAA0B;IAE1B,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;IACtC,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;IACxC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAAE,SAAS;QAC/D,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IACD,OAAO,WAAW,CAAC;AACrB,CAAC;AAED,SAAS,uBAAuB,CAAC,OAAkC;IACjE,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC7C,KAAK,MAAM,CAAC,IAAI,OAAO;QAAE,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3F,OAAO,OAAO;SACX,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;SACpD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC;SAC3B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,kBAAkB,CACzB,WAAgC,EAChC,UAAuC;IAEvC,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;QAC5B,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;YAC5B,IAAI,CAAC,IAAI,CAAC;gBAAE,SAAS;YACrB,MAAM,GAAG,GAAG,UAAU,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YAC1C,IAAI,GAAG,KAAK,SAAS,IAAI,GAAG,GAAG,MAAM;gBAAE,MAAM,GAAG,GAAG,CAAC;QACtD,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,gBAAgB,CAAC,IAA+B;IACvD,OAAO,IAAI,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,EAAE;QAC3B,IAAI,CAAC,CAAC,QAAQ,GAAG,EAAE,CAAC,QAAQ;YAAE,OAAO,CAAC,CAAC;QACvC,IAAI,CAAC,CAAC,QAAQ,GAAG,EAAE,CAAC,QAAQ;YAAE,OAAO,EAAE,CAAC;QACxC,IAAI,CAAC,CAAC,IAAI,GAAG,EAAE,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC;QAC/B,IAAI,CAAC,CAAC,IAAI,GAAG,EAAE,CAAC,IAAI;YAAE,OAAO,EAAE,CAAC;QAChC,OAAO,CAAC,CAAC,MAAM,GAAG,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvC,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,OAAO,CAAC,CAAS,EAAE,CAAS
;IACnC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;AAC3E,CAAC;AAED,MAAM,SAAS;IACI,MAAM,CAAW;IACjB,IAAI,CAAW;IAEhC,YAAY,IAAY;QACtB,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;QACxD,IAAI,CAAC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;IACpD,CAAC;IAED,IAAI,CAAC,CAAS;QACZ,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QACzB,IAAI,CAAC,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC;YAAE,OAAO,CAAC,CAAC;QACzC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC1B,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,KAAK,CAAC,CAAS,EAAE,CAAS;QACxB,MAAM,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxB,MAAM,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxB,IAAI,EAAE,KAAK,EAAE;YAAE,OAAO;QACtB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;QACjC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;QACjC,IAAI,KAAK,GAAG,KAAK,EAAE,CAAC;YAClB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,EAAE,CAAC;QACvB,CAAC;aAAM,IAAI,KAAK,GAAG,KAAK,EAAE,CAAC;YACzB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,EAAE,CAAC;QACvB,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,EAAE,CAAC;YACrB,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * @opensip-cli/clone-detection — shared function-body clone-detection substrate.
3
+ *
4
+ * A pure, `node:crypto`-only leaf package (layer 2) that single-sources the body-hash
5
+ * + MinHash primitives, the tool-neutral `CloneCandidate` shape, and the duplicate /
6
+ * near-duplicate detection algorithms + curation policy. Both the graph tool and the
7
+ * yagni tool depend on it (neither on the other), so there is exactly one
8
+ * implementation and they cannot diverge (ADR-0064).
9
+ */
10
+ export { normalizeWhitespace, hashBody, type BodyDigest } from './body-digest.js';
11
+ export { NEAR_DUP_SIGNATURE_K, NEAR_DUP_LSH_BANDS, NEAR_DUP_LSH_ROWS, NEAR_DUP_SIGNATURE_VERSION, shingle, bodySignature, estimateJaccard, lshBandHashes, digestCanonicalBody, type BodyDigestWithSignature, } from './near-duplicate-signature.js';
12
+ export type { FunctionKind, CloneCandidate, DupOpts, NearDupOpts, DuplicateGroup, CrossPackageAggregate, DuplicateFindings, NearDuplicateCluster, } from './types.js';
13
+ export { findDuplicateBodies, isEligibleKind } from './find-duplicate-bodies.js';
14
+ export { findNearDuplicates } from './find-near-duplicates.js';
15
+ export { isTestFilePath } from './test-file.js';
16
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,OAAO,EAAE,mBAAmB,EAAE,QAAQ,EAAE,KAAK,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAGlF,OAAO,EACL,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,0BAA0B,EAC1B,OAAO,EACP,aAAa,EACb,eAAe,EACf,aAAa,EACb,mBAAmB,EACnB,KAAK,uBAAuB,GAC7B,MAAM,+BAA+B,CAAC;AAIvC,YAAY,EACV,YAAY,EACZ,cAAc,EACd,OAAO,EACP,WAAW,EACX,cAAc,EACd,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,GACrB,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,mBAAmB,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AACjF,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAI/D,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,20 @@
1
+ /**
2
+ * @opensip-cli/clone-detection — shared function-body clone-detection substrate.
3
+ *
4
+ * A pure, `node:crypto`-only leaf package (layer 2) that single-sources the body-hash
5
+ * + MinHash primitives, the tool-neutral `CloneCandidate` shape, and the duplicate /
6
+ * near-duplicate detection algorithms + curation policy. Both the graph tool and the
7
+ * yagni tool depend on it (neither on the other), so there is exactly one
8
+ * implementation and they cannot diverge (ADR-0064).
9
+ */
10
+ // Body-hash primitives (relocated verbatim from graph — bodyHash is the catalog/cache
11
+ // /equivalence-guardrail identity; the values must never change).
12
+ export { normalizeWhitespace, hashBody } from './body-digest.js';
13
+ // MinHash / LSH near-duplicate primitives + algorithm constants.
14
+ export { NEAR_DUP_SIGNATURE_K, NEAR_DUP_LSH_BANDS, NEAR_DUP_LSH_ROWS, NEAR_DUP_SIGNATURE_VERSION, shingle, bodySignature, estimateJaccard, lshBandHashes, digestCanonicalBody, } from './near-duplicate-signature.js';
15
+ export { findDuplicateBodies, isEligibleKind } from './find-duplicate-bodies.js';
16
+ export { findNearDuplicates } from './find-near-duplicates.js';
17
+ // Canonical TS/JS test-file predicate (single-sourced so every CloneCandidate producer
18
+ // stamps `inTestFile` identically — D1).
19
+ export { isTestFilePath } from './test-file.js';
20
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,sFAAsF;AACtF,kEAAkE;AAClE,OAAO,EAAE,mBAAmB,EAAE,QAAQ,EAAmB,MAAM,kBAAkB,CAAC;AAElF,iEAAiE;AACjE,OAAO,EACL,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,0BAA0B,EAC1B,OAAO,EACP,aAAa,EACb,eAAe,EACf,aAAa,EACb,mBAAmB,GAEpB,MAAM,+BAA+B,CAAC;AAcvC,OAAO,EAAE,mBAAmB,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AACjF,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAE/D,uFAAuF;AACvF,yCAAyC;AACzC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC"}
@@ -0,0 +1,55 @@
1
+ /**
2
+ * Near-duplicate body signatures — MinHash + LSH primitives.
3
+ *
4
+ * Single source of truth for signature constants and the digest tail used by
5
+ * both tree-sitter adapters and the TypeScript inventory path. Signatures are
6
+ * derived from the same canonical body string that feeds `bodyHash`.
7
+ */
8
+ import { type BodyDigest } from './body-digest.js';
9
+ /** Number of MinHash values per body signature. */
10
+ export declare const NEAR_DUP_SIGNATURE_K = 128;
11
+ /** Default LSH band count — co-tuned with {@link NEAR_DUP_LSH_ROWS}. */
12
+ export declare const NEAR_DUP_LSH_BANDS = 8;
13
+ /** Rows per band; `bands × rows === k`. Knee ≈ 0.878 at threshold 0.85. */
14
+ export declare const NEAR_DUP_LSH_ROWS = 16;
15
+ /**
16
+ * Signature ALGORITHM version. Bump on ANY change to shingling, hashing, or the
17
+ * permutation scheme that alters signature VALUES. Feeds the `sig=` cache-key
18
+ * segment so catalogs built with an older algorithm invalidate — mixing old- and
19
+ * new-algorithm signatures across an incremental build (some occurrences cached,
20
+ * some re-walked) would corrupt every cross-occurrence Jaccard estimate.
21
+ *
22
+ * v1 = k independent SHA-256 hashes per (shingle, seed). v2 = one SHA-256 base
23
+ * hash per shingle + k cheap 32-bit mixers (~k× fewer hashes, identical MinHash
24
+ * semantics).
25
+ */
26
+ export declare const NEAR_DUP_SIGNATURE_VERSION = 2;
27
+ /** Digest including hash, size, and optional near-duplicate MinHash signature. */
28
+ export type BodyDigestWithSignature = BodyDigest;
29
+ /**
30
+ * Character k-grams from canonical body text. Bodies shorter than `gramSize`
31
+ * yield a single shingle of the whole string when non-empty.
32
+ */
33
+ export declare function shingle(canonical: string, gramSize?: number): ReadonlySet<string>;
34
+ /**
35
+ * Deterministic MinHash signature over char shingles. Stable across runs and
36
+ * machines (SHA-256 base hash + fixed permutation seeds).
37
+ *
38
+ * Each shingle is hashed ONCE with SHA-256 (the only expensive step); the k
39
+ * MinHash values are then derived by mixing that base hash with k fixed 32-bit
40
+ * seeds. This is ~k× fewer SHA-256 computations than hashing every
41
+ * (shingle, seed) pair while preserving MinHash semantics (each of the k mixers
42
+ * is an independent hash of the shingle universe, so the per-position min
43
+ * estimates Jaccard exactly as before). Algorithm {@link NEAR_DUP_SIGNATURE_VERSION}.
44
+ */
45
+ export declare function bodySignature(canonical: string, k?: number): readonly number[];
46
+ /** MinHash Jaccard estimate: equal-position fraction. */
47
+ export declare function estimateJaccard(a: readonly number[], b: readonly number[]): number;
48
+ /** LSH band hashes for candidate-pair generation. */
49
+ export declare function lshBandHashes(signature: readonly number[], bands: number, rows: number): readonly string[];
50
+ /**
51
+ * Hash + size + signature for an already-normalized canonical body string.
52
+ * Skips `signature` when the canonical text yields no shingles.
53
+ */
54
+ export declare function digestCanonicalBody(canonical: string): BodyDigestWithSignature;
55
+ //# sourceMappingURL=near-duplicate-signature.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"near-duplicate-signature.d.ts","sourceRoot":"","sources":["../src/near-duplicate-signature.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAIH,OAAO,EAAY,KAAK,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAE7D,mDAAmD;AACnD,eAAO,MAAM,oBAAoB,MAAM,CAAC;AAExC,wEAAwE;AACxE,eAAO,MAAM,kBAAkB,IAAI,CAAC;AAEpC,2EAA2E;AAC3E,eAAO,MAAM,iBAAiB,KAAK,CAAC;AAEpC;;;;;;;;;;GAUG;AACH,eAAO,MAAM,0BAA0B,IAAI,CAAC;AAE5C,kFAAkF;AAClF,MAAM,MAAM,uBAAuB,GAAG,UAAU,CAAC;AAEjD;;;GAGG;AACH,wBAAgB,OAAO,CAAC,SAAS,EAAE,MAAM,EAAE,QAAQ,SAAI,GAAG,WAAW,CAAC,MAAM,CAAC,CAW5E;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,aAAa,CAAC,SAAS,EAAE,MAAM,EAAE,CAAC,SAAuB,GAAG,SAAS,MAAM,EAAE,CAmB5F;AAED,yDAAyD;AACzD,wBAAgB,eAAe,CAAC,CAAC,EAAE,SAAS,MAAM,EAAE,EAAE,CAAC,EAAE,SAAS,MAAM,EAAE,GAAG,MAAM,CAOlF;AAED,qDAAqD;AACrD,wBAAgB,aAAa,CAC3B,SAAS,EAAE,SAAS,MAAM,EAAE,EAC5B,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,MAAM,GACX,SAAS,MAAM,EAAE,CAOnB;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,GAAG,uBAAuB,CAO9E"}
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Near-duplicate body signatures — MinHash + LSH primitives.
3
+ *
4
+ * Single source of truth for signature constants and the digest tail used by
5
+ * both tree-sitter adapters and the TypeScript inventory path. Signatures are
6
+ * derived from the same canonical body string that feeds `bodyHash`.
7
+ */
8
+ import { createHash } from 'node:crypto';
9
+ import { hashBody } from './body-digest.js';
10
+ /** Number of MinHash values per body signature. */
11
+ export const NEAR_DUP_SIGNATURE_K = 128;
12
+ /** Default LSH band count — co-tuned with {@link NEAR_DUP_LSH_ROWS}. */
13
+ export const NEAR_DUP_LSH_BANDS = 8;
14
+ /** Rows per band; `bands × rows === k`. Knee ≈ 0.878 at threshold 0.85. */
15
+ export const NEAR_DUP_LSH_ROWS = 16;
16
+ /**
17
+ * Signature ALGORITHM version. Bump on ANY change to shingling, hashing, or the
18
+ * permutation scheme that alters signature VALUES. Feeds the `sig=` cache-key
19
+ * segment so catalogs built with an older algorithm invalidate — mixing old- and
20
+ * new-algorithm signatures across an incremental build (some occurrences cached,
21
+ * some re-walked) would corrupt every cross-occurrence Jaccard estimate.
22
+ *
23
+ * v1 = k independent SHA-256 hashes per (shingle, seed). v2 = one SHA-256 base
24
+ * hash per shingle + k cheap 32-bit mixers (~k× fewer hashes, identical MinHash
25
+ * semantics).
26
+ */
27
+ export const NEAR_DUP_SIGNATURE_VERSION = 2;
28
/**
 * Split canonical body text into its set of character k-grams.
 *
 * An empty string yields an empty set. Text shorter than `gramSize` yields a single
 * shingle holding the whole string. Otherwise every window of `gramSize` consecutive
 * characters is added (duplicates collapse, as the result is a set).
 */
export function shingle(canonical, gramSize = 5) {
  const grams = new Set();
  const { length } = canonical;
  if (length === 0) {
    return grams;
  }
  if (length < gramSize) {
    return grams.add(canonical);
  }
  const lastStart = length - gramSize;
  let start = 0;
  while (start <= lastStart) {
    grams.add(canonical.slice(start, start + gramSize));
    start += 1;
  }
  return grams;
}
45
/**
 * Deterministic MinHash signature over the character shingles of `canonical`.
 *
 * Every shingle is reduced to a base hash exactly once; each of the `k` signature
 * positions is then the minimum, over all shingles, of that base hash mixed with
 * the position's permutation seed. The default `k` reuses the precomputed seed
 * table; any other `k` derives its seeds on the fly. Text that yields no shingles
 * produces an empty signature. Algorithm {@link NEAR_DUP_SIGNATURE_VERSION}.
 */
export function bodySignature(canonical, k = NEAR_DUP_SIGNATURE_K) {
  const shingles = shingle(canonical);
  if (shingles.size === 0) {
    return [];
  }
  let seeds;
  if (k === NEAR_DUP_SIGNATURE_K) {
    seeds = PERM_SEEDS;
  } else {
    seeds = derivePermSeeds(k);
  }
  // One base hash per shingle — the only expensive step.
  const bases = Array.from(shingles, (gram) => baseHash(gram));
  const signature = [];
  for (let position = 0; position < k; position++) {
    const seed = seeds[position] ?? 0;
    const lowest = bases.reduce((min, base) => {
      const mixed = mix32(base, seed);
      return mixed < min ? mixed : min;
    }, 0xff_ff_ff_ff);
    signature.push(lowest);
  }
  return signature;
}
79
/**
 * MinHash Jaccard estimate: the fraction of positions at which the two signatures
 * hold the same value. Returns 0 when either signature is empty or the lengths differ.
 */
export function estimateJaccard(a, b) {
  const size = a.length;
  if (size === 0 || size !== b.length) {
    return 0;
  }
  let agreeing = 0;
  for (let i = 0; i < size; i++) {
    if (a[i] === b[i]) {
      agreeing += 1;
    }
  }
  return agreeing / size;
}
90
/**
 * LSH band hashes for candidate-pair generation: the signature is cut into `bands`
 * consecutive slices of `rows` values, and each slice is rendered as a
 * comma-joined string.
 */
export function lshBandHashes(signature, bands, rows) {
  const hashes = [];
  let band = 0;
  while (band < bands) {
    const from = band * rows;
    const to = from + rows;
    hashes.push(signature.slice(from, to).join(','));
    band += 1;
  }
  return hashes;
}
99
/**
 * Digest an already-normalized canonical body string: everything `hashBody`
 * returns, plus a `signature` field. The field holds the MinHash signature, or
 * `undefined` when the text yields no shingles (empty signature).
 */
export function digestCanonicalBody(canonical) {
  const digest = hashBody(canonical);
  const minhash = bodySignature(canonical);
  const signature = minhash.length === 0 ? undefined : minhash;
  return { ...digest, signature };
}
111
/**
 * 32-bit base hash of one shingle: the first four bytes of its SHA-256 digest
 * (UTF-8 input), read as an unsigned little-endian integer.
 */
function baseHash(gram) {
  const hasher = createHash('sha256');
  hasher.update(gram, 'utf8');
  return hasher.digest().readUInt32LE(0);
}
115
/**
 * Cheap 32-bit avalanche of a base hash under one permutation seed (Mueller's
 * integer finalizer): XOR the seed in, run two xor-shift/multiply rounds, then a
 * final xor-shift. `Math.imul` keeps each multiply in 32 bits and `>>> 0` keeps
 * every intermediate value unsigned.
 */
function mix32(h, seed) {
  const MIX_CONST = 0x4_5d_9f_3b;
  // One round: fold the high half into the low half, then multiply.
  const round = (value) => Math.imul(value ^ (value >>> 16), MIX_CONST) >>> 0;
  const mixed = round(round((h ^ seed) >>> 0));
  return (mixed ^ (mixed >>> 16)) >>> 0;
}
127
/**
 * Derive `k` fixed permutation seeds. Seed `i` is the first four bytes (unsigned,
 * little-endian) of the SHA-256 digest of the string `opensip-minhash-perm:<i>`, so
 * the table is identical on every run.
 */
function derivePermSeeds(k) {
  const seedAt = (index) => {
    const hasher = createHash('sha256');
    hasher.update(`opensip-minhash-perm:${String(index)}`, 'utf8');
    return hasher.digest().readUInt32LE(0);
  };
  return Array.from({ length: k }, (_unused, index) => seedAt(index));
}
134
+ const PERM_SEEDS = derivePermSeeds(NEAR_DUP_SIGNATURE_K); // seed table for the default k, derived once at module load and reused by `bodySignature`
135
+ //# sourceMappingURL=near-duplicate-signature.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"near-duplicate-signature.js","sourceRoot":"","sources":["../src/near-duplicate-signature.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzC,OAAO,EAAE,QAAQ,EAAmB,MAAM,kBAAkB,CAAC;AAE7D,mDAAmD;AACnD,MAAM,CAAC,MAAM,oBAAoB,GAAG,GAAG,CAAC;AAExC,wEAAwE;AACxE,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC,CAAC;AAEpC,2EAA2E;AAC3E,MAAM,CAAC,MAAM,iBAAiB,GAAG,EAAE,CAAC;AAEpC;;;;;;;;;;GAUG;AACH,MAAM,CAAC,MAAM,0BAA0B,GAAG,CAAC,CAAC;AAK5C;;;GAGG;AACH,MAAM,UAAU,OAAO,CAAC,SAAiB,EAAE,QAAQ,GAAG,CAAC;IACrD,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IAC9B,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,GAAG,CAAC;IACvC,IAAI,SAAS,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;QAChC,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACnB,OAAO,GAAG,CAAC;IACb,CAAC;IACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;QACtD,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5C,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,aAAa,CAAC,SAAiB,EAAE,CAAC,GAAG,oBAAoB;IACvE,MAAM,QAAQ,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;IACpC,IAAI,QAAQ,CAAC,IAAI,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACnC,MAAM,KAAK,GAAG,CAAC,KAAK,oBAAoB,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAC3E,6EAA6E;IAC7E,6CAA6C;IAC7C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,MAAM,IAAI,IAAI,QAAQ;QAAE,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;IACxD,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC3B,IAAI,GAAG,GAAG,aAAa,CAAC;QACxB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;YAC5B,IAAI,CAAC,GAAG,GAAG;gBAAE,GAAG,GAAG,CAAC,CAAC;QACvB,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,yDAAyD;AACzD,MAAM,UAAU,eAAe,CAAC,CAAoB,EAAE,CAAoB;IACxE,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM;QAAE,OAAO,CAAC,CAAC
;IACxE,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,CAAC,CAAC,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC;QACvC,IAAI,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC;YAAE,OAAO,EAAE,CAAC;IAClC,CAAC;IACD,OAAO,OAAO,GAAG,CAAC,CAAC,MAAM,CAAC;AAC5B,CAAC;AAED,qDAAqD;AACrD,MAAM,UAAU,aAAa,CAC3B,SAA4B,EAC5B,KAAa,EACb,IAAY;IAEZ,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC;QACxC,MAAM,KAAK,GAAG,IAAI,GAAG,IAAI,CAAC;QAC1B,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3D,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiB;IACnD,MAAM,MAAM,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC;IACnC,MAAM,SAAS,GAAG,aAAa,CAAC,SAAS,CAAC,CAAC;IAC3C,OAAO;QACL,GAAG,MAAM;QACT,SAAS,EAAE,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS;KACxD,CAAC;AACJ,CAAC;AAED,6EAA6E;AAC7E,SAAS,QAAQ,CAAC,IAAY;IAC5B,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,EAAE,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;AAC5E,CAAC;AAED;;;;GAIG;AACH,SAAS,KAAK,CAAC,CAAS,EAAE,IAAY;IACpC,MAAM,SAAS,GAAG,YAAY,CAAC;IAC/B,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IACzB,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,EAAE,SAAS,CAAC,KAAK,CAAC,CAAC;IAC/C,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,EAAE,SAAS,CAAC,KAAK,CAAC,CAAC;IAC/C,OAAO,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;AAChC,CAAC;AAED,mFAAmF;AACnF,SAAS,eAAe,CAAC,CAAS;IAChC,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CACxC,UAAU,CAAC,QAAQ,CAAC;SACjB,MAAM,CAAC,wBAAwB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,MAAM,CAAC;SACnD,MAAM,EAAE;SACR,YAAY,CAAC,CAAC,CAAC,CACnB,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,GAAsB,eAAe,CAAC,oBAAoB,CAAC,CAAC"}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Returns true if `filePathProjectRel` is a TypeScript / JS test file (or test fixture)
3
+ * by path convention. Path is project-relative with `/` separators (the same shape
4
+ * `CloneCandidate.filePath` carries).
+ *
+ * @param filePathProjectRel Project-relative path using `/` separators.
+ * @returns `true` when the path matches a test-file or test-fixture path convention.
5
+ */
6
+ export declare function isTestFilePath(filePathProjectRel: string): boolean;
7
+ //# sourceMappingURL=test-file.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"test-file.d.ts","sourceRoot":"","sources":["../src/test-file.ts"],"names":[],"mappings":"AAgBA;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,kBAAkB,EAAE,MAAM,GAAG,OAAO,CAMlE"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Canonical TS/JS test-file predicate — single-sourced so every producer of a
3
+ * `CloneCandidate.inTestFile` flag (graph's TS walk, yagni's TS inventory) classifies
4
+ * test files IDENTICALLY. A divergence here is exactly the filter-divergence class
5
+ * ADR-0064 prevents (it would split the cross-tool parity test on `inTestFile`).
6
+ *
7
+ * Anchored patterns instead of one alternation; avoids catastrophic backtracking on
8
+ * pathological inputs. `__fixtures__/` counts as test scaffolding by convention.
9
+ *
10
+ * Relocated verbatim from `graph-typescript/src/test-file.ts` (the regexes are unchanged
11
+ * — classification is byte-stable).
12
+ */
13
/** A `__tests__/` directory segment at the start of the path or right after a `/`. */
const TEST_TESTS_DIR_RE = /(?:^|\/)__tests__\//;
/** A `__fixtures__/` directory segment at the start of the path or right after a `/`. */
const TEST_FIXTURES_DIR_RE = /(?:^|\/)__fixtures__\//;
/** A trailing `.test.<ext>` or `_test.<ext>`, where `<ext>` is ts, tsx, js, or jsx. */
const TEST_FILE_SUFFIX_RE = /\.test\.(?:ts|tsx|js|jsx)$|_test\.(?:ts|tsx|js|jsx)$/;
/**
 * Decide, purely from the path, whether a file is a TypeScript / JS test file or
 * test fixture. The path is expected to be project-relative with `/` separators
 * (the same shape `CloneCandidate.filePath` carries).
 *
 * @param {string} filePathProjectRel Project-relative file path.
 * @returns {boolean} `true` when any of the three path conventions matches.
 */
export function isTestFilePath(filePathProjectRel) {
    const conventions = [TEST_TESTS_DIR_RE, TEST_FIXTURES_DIR_RE, TEST_FILE_SUFFIX_RE];
    return conventions.some((pattern) => pattern.test(filePathProjectRel));
}
26
+ //# sourceMappingURL=test-file.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"test-file.js","sourceRoot":"","sources":["../src/test-file.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AACH,MAAM,iBAAiB,GAAG,qBAAqB,CAAC;AAChD,MAAM,oBAAoB,GAAG,wBAAwB,CAAC;AACtD,MAAM,mBAAmB,GAAG,sDAAsD,CAAC;AAEnF;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,kBAA0B;IACvD,OAAO,CACL,iBAAiB,CAAC,IAAI,CAAC,kBAAkB,CAAC;QAC1C,oBAAoB,CAAC,IAAI,CAAC,kBAAkB,CAAC;QAC7C,mBAAmB,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAC7C,CAAC;AACJ,CAAC"}
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Tool-neutral input + finding types for the clone-detection substrate.
3
+ *
4
+ * `CloneCandidate` is a deliberate STRUCTURAL SUBSET of graph's runtime
5
+ * `FunctionOccurrence` (ISP — the algorithms accept only the fields the math needs;
6
+ * graph passes its occurrences directly by structural assignability, yagni builds the
7
+ * same shape from `lang-typescript`). This is the DIP/ISP boundary that keeps the
8
+ * substrate free of either tool's types — NOT a future-proofing hook. The cross-tool
9
+ * parity test (Phase 2) is the seam that proves both producers agree on these fields;
10
+ * the dep-cruiser leaf rule is the compile-time invariant.
11
+ */
12
+ /**
+ * Function-shape classification — mirrors graph's `FunctionKind` for structural assignability.
+ * A closed union of string literals; `CloneCandidate.kind` is typed with it.
+ */
13
+ export type FunctionKind = 'function-declaration' | 'function-expression' | 'arrow' | 'method' | 'constructor' | 'getter' | 'setter' | 'module-init';
14
+ /** The minimal per-function input the detection algorithms consume. */
15
+ export interface CloneCandidate {
16
+ /** sha256(normalized body) — the grouping key. */
17
+ readonly bodyHash: string;
18
+ /** MinHash signature (k=128) — near-duplicate only; absent ⇒ skipped by `findNearDuplicates`. */
19
+ readonly bodySignature?: readonly number[];
20
+ /** Normalized body length in chars; absent ⇒ "passes the size floor". */
21
+ readonly bodySize?: number;
22
+ /** Canonical body span in lines; absent ⇒ fall back to `endLine − line + 1`. */
23
+ readonly bodyLines?: number;
24
+ readonly kind: FunctionKind; // function-shape classification, one of the `FunctionKind` literals
25
+ readonly inTestFile: boolean; // test-file flag; for TS/JS paths `isTestFilePath` is the single-sourced classifier
26
+ readonly filePath: string; // project-relative path with `/` separators
27
+ readonly line: number; // used with `endLine` for the `bodyLines` fallback (`endLine − line + 1`)
28
+ readonly column: number; // NOTE(review): presumably the start column of the definition — confirm with producers
29
+ readonly endLine: number; // upper bound of the `bodyLines` fallback span
30
+ readonly simpleName: string; // NOTE(review): presumably the unqualified function name — confirm with producers
31
+ readonly qualifiedName: string; // ordering key: `DuplicateGroup.members[0]` is the member with the lowest `qualifiedName`
32
+ /** Resolved package the caller assigns (graph: `pkgOf`; yagni: nearest `package.json`). */
33
+ readonly package?: string;
34
+ /**
35
+ * Resolved language the caller assigns for the near-duplicate same-language gate
36
+ * (graph: `languageOfFile(filePath)`). `languageOfFile` stays in graph (single
37
+ * caller — rule of three not met), so the substrate compares this pre-resolved
38
+ * field rather than importing a language map. Absent ⇒ the pair is skipped, exactly
39
+ * as `languageOfFile(...) === undefined` did.
40
+ */
41
+ readonly language?: string;
42
+ }
43
+ /** Thresholds for `findDuplicateBodies` (exact). Absent fields take the policy defaults. */
44
+ export interface DupOpts {
45
+ readonly minLines?: number; // NOTE(review): presumably a floor on the body span in lines (`bodyLines`) — confirm in `findDuplicateBodies`
46
+ readonly minBodySize?: number; // NOTE(review): presumably a floor on `bodySize` (normalized chars) — confirm in `findDuplicateBodies`
47
+ readonly minCrossPackagePackages?: number; // minimum number of distinct packages for a body to become a `CrossPackageAggregate`
48
+ readonly minCrossPackageBodySize?: number; // NOTE(review): presumably a separate size floor for cross-package aggregation — confirm
49
+ }
50
+ /** Thresholds for `findNearDuplicates` (MinHash/LSH). Absent fields take the policy defaults. */
51
+ export interface NearDupOpts {
52
+ readonly minSimilarity?: number; // Jaccard threshold; `findNearDuplicates` falls back to 0.85 when absent
53
+ readonly minBodySize?: number; // body-size floor; `findNearDuplicates` falls back to 200 when absent
54
+ readonly lshBands?: number; // must divide the signature length k evenly; otherwise `findNearDuplicates` returns []
55
+ }
56
+ /** A per-instance exact-duplicate group. `members[0]` is the primary (lowest qualifiedName). */
57
+ export interface DuplicateGroup {
58
+ readonly bodyHash: string; // the body hash this group is keyed by (see `CloneCandidate.bodyHash`, the grouping key)
59
+ readonly members: readonly CloneCandidate[]; // index 0 is the primary: the member with the lowest `qualifiedName`
60
+ }
61
+ /** A body duplicated across ≥ `minCrossPackagePackages` distinct packages. */
62
+ export interface CrossPackageAggregate {
63
+ readonly bodyHash: string; // hash of the duplicated body (see `CloneCandidate.bodyHash`)
64
+ readonly anchor: CloneCandidate; // NOTE(review): a representative occurrence; the selection rule is not visible here — confirm
65
+ readonly packages: readonly string[]; // NOTE(review): presumably the distinct packages containing the body — confirm ordering and uniqueness
66
+ readonly occurrenceCount: number; // NOTE(review): presumably the total number of occurrences across `packages` — confirm
67
+ }
68
+ /** Result of {@link findDuplicateBodies}: cross-package aggregates + the surviving per-instance groups. */
69
+ export interface DuplicateFindings {
70
+ readonly aggregates: readonly CrossPackageAggregate[]; // the cross-package aggregates
71
+ readonly groups: readonly DuplicateGroup[]; // the surviving per-instance exact-duplicate groups
72
+ }
73
+ /** A near-duplicate cluster (connected component over LSH/Jaccard edges). */
74
+ export interface NearDuplicateCluster {
75
+ readonly anchor: CloneCandidate; // NOTE(review): representative member of the cluster; the selection rule is not visible here — confirm
76
+ readonly nearMembers: readonly string[]; // near-duplicate members; NOTE(review): string identifiers, presumably qualified names — confirm
77
+ readonly exactMembers: readonly string[]; // NOTE(review): presumably members sharing an identical `bodyHash` with another member — confirm
78
+ readonly estimatedSimilarity: number; // MinHash-estimated Jaccard similarity; NOTE(review): how it is aggregated across the cluster is not visible here
79
+ readonly clusterSize: number; // NOTE(review): presumably the total member count — confirm whether the anchor and exact members are included
80
+ }
81
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,mGAAmG;AACnG,MAAM,MAAM,YAAY,GACpB,sBAAsB,GACtB,qBAAqB,GACrB,OAAO,GACP,QAAQ,GACR,aAAa,GACb,QAAQ,GACR,QAAQ,GACR,aAAa,CAAC;AAElB,uEAAuE;AACvE,MAAM,WAAW,cAAc;IAC7B,kDAAkD;IAClD,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,iGAAiG;IACjG,QAAQ,CAAC,aAAa,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAC3C,yEAAyE;IACzE,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAC3B,gFAAgF;IAChF,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC;IAC5B,QAAQ,CAAC,UAAU,EAAE,OAAO,CAAC;IAC7B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,2FAA2F;IAC3F,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;;;OAMG;IACH,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,4FAA4F;AAC5F,MAAM,WAAW,OAAO;IACtB,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAC1C,QAAQ,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;CAC3C;AAED,iGAAiG;AACjG,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,gGAAgG;AAChG,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,OAAO,EAAE,SAAS,cAAc,EAAE,CAAC;CAC7C;AAED,8EAA8E;AAC9E,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;IAChC,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;IACrC,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;CAClC;AAED,2GAA2G;AAC3G,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,UAAU,EAAE,SAAS,qBAAqB,EAAE,CAAC;IACtD,QAAQ,CAAC,MAAM,EAAE,SAAS,cAAc,EAAE,CAAC;CAC5C;AAED,6EAA6E;AAC7E,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;IAChC,QAAQ,CAAC,WAAW,EAAE,SAAS,MAAM,EAAE,CAAC;IACxC,QAAQ,CAAC,YAAY,EAAE,SAAS,MAAM,EAAE,CAAC;IACzC,QAAQ,CAAC,mBAAmB,EAAE,MAAM,CAAC;IACrC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;CAC9B"}