@goshenkata/dryscan-core 1.2.4 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -1
- package/dist/index.js +267 -167
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/DryScan.ts +5 -4
- package/src/config/dryconfig.ts +1 -1
- package/src/db/DryScanDatabase.ts +1 -1
- package/src/extractors/java.ts +47 -12
- package/src/services/DuplicateService.ts +136 -166
- package/src/services/DuplicationCache.ts +107 -1
- package/src/services/UpdateService.ts +5 -2
|
@@ -1,4 +1,8 @@
|
|
|
1
|
-
import
|
|
1
|
+
import debug from "debug";
|
|
2
|
+
import { cosineSimilarity } from "@langchain/core/utils/math";
|
|
3
|
+
import { DuplicateGroup, IndexUnit } from "../types";
|
|
4
|
+
|
|
5
|
+
const log = debug("DryScan:DuplicationCache");
|
|
2
6
|
|
|
3
7
|
/**
|
|
4
8
|
* In-memory cache for duplicate comparison scores.
|
|
@@ -11,6 +15,13 @@ export class DuplicationCache {
|
|
|
11
15
|
private readonly fileIndex = new Map<string, Set<string>>();
|
|
12
16
|
private initialized = false;
|
|
13
17
|
|
|
18
|
+
/** Per-run similarity matrix from a single batched library call (reset each run). */
|
|
19
|
+
private embSimMatrix: number[][] = [];
|
|
20
|
+
/** Maps unit ID to its row/column index in embSimMatrix. */
|
|
21
|
+
private embSimIndex = new Map<string, number>();
|
|
22
|
+
/** Per-run memoization of parent unit similarity scores (reset each run). */
|
|
23
|
+
private parentSimCache = new Map<string, number>();
|
|
24
|
+
|
|
14
25
|
static getInstance(): DuplicationCache {
|
|
15
26
|
if (!DuplicationCache.instance) {
|
|
16
27
|
DuplicationCache.instance = new DuplicationCache();
|
|
@@ -85,6 +96,101 @@ export class DuplicationCache {
|
|
|
85
96
|
this.comparisons.clear();
|
|
86
97
|
this.fileIndex.clear();
|
|
87
98
|
this.initialized = false;
|
|
99
|
+
this.embSimMatrix = [];
|
|
100
|
+
this.embSimIndex.clear();
|
|
101
|
+
this.clearRunCaches();
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Resets per-run memoization (parent similarities).
|
|
106
|
+
* The embedding matrix is intentionally preserved so incremental runs can
|
|
107
|
+
* reuse clean×clean values across calls.
|
|
108
|
+
*/
|
|
109
|
+
clearRunCaches(): void {
|
|
110
|
+
this.parentSimCache.clear();
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/**
|
|
114
|
+
* Builds or incrementally updates the embedding similarity matrix.
|
|
115
|
+
*
|
|
116
|
+
* Full rebuild (default): replaces the entire matrix — O(n²).
|
|
117
|
+
* Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
|
|
118
|
+
* cells from the old matrix and recomputes only dirty rows via one batched
|
|
119
|
+
* cosineSimilarity call — O(d·n) where d = number of dirty units.
|
|
120
|
+
*/
|
|
121
|
+
buildEmbSimCache(units: IndexUnit[], dirtyPaths?: string[]): void {
|
|
122
|
+
const embedded = units.filter(u => Array.isArray(u.embedding) && u.embedding.length > 0);
|
|
123
|
+
if (embedded.length < 2) {
|
|
124
|
+
this.embSimMatrix = [];
|
|
125
|
+
this.embSimIndex.clear();
|
|
126
|
+
return;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
const embeddings = embedded.map(u => u.embedding as number[]);
|
|
130
|
+
const newIndex = new Map(embedded.map((u, i) => [u.id, i] as [string, number]));
|
|
131
|
+
const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
|
|
132
|
+
const hasPriorMatrix = this.embSimMatrix.length > 0;
|
|
133
|
+
|
|
134
|
+
if (!dirtySet || !hasPriorMatrix) {
|
|
135
|
+
// Full rebuild
|
|
136
|
+
this.embSimIndex = newIndex;
|
|
137
|
+
this.embSimMatrix = cosineSimilarity(embeddings, embeddings);
|
|
138
|
+
log("Built full embedding similarity matrix: %d units", embedded.length);
|
|
139
|
+
return;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Incremental: identify dirty unit IDs
|
|
143
|
+
const dirtyIds = new Set(embedded.filter(u => dirtySet.has(u.filePath)).map(u => u.id));
|
|
144
|
+
|
|
145
|
+
if (dirtyIds.size === 0) {
|
|
146
|
+
log("Matrix reused: no dirty units detected");
|
|
147
|
+
return;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
const n = embedded.length;
|
|
151
|
+
|
|
152
|
+
// Start with zeroes; copy clean×clean values from prior matrix
|
|
153
|
+
const newMatrix: number[][] = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
154
|
+
for (let i = 0; i < n; i++) {
|
|
155
|
+
for (let j = 0; j < n; j++) {
|
|
156
|
+
if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
|
|
157
|
+
const oi = this.embSimIndex.get(embedded[i].id);
|
|
158
|
+
const oj = this.embSimIndex.get(embedded[j].id);
|
|
159
|
+
if (oi !== undefined && oj !== undefined) newMatrix[i][j] = this.embSimMatrix[oi][oj];
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// Recompute dirty rows in one batched call
|
|
164
|
+
const dirtyIndices = embedded.reduce<number[]>((acc, u, i) => (dirtyIds.has(u.id) ? [...acc, i] : acc), []);
|
|
165
|
+
const dirtyRows = cosineSimilarity(dirtyIndices.map(i => embeddings[i]), embeddings);
|
|
166
|
+
dirtyIndices.forEach((rowIdx, di) => {
|
|
167
|
+
for (let j = 0; j < n; j++) {
|
|
168
|
+
newMatrix[rowIdx][j] = dirtyRows[di][j];
|
|
169
|
+
newMatrix[j][rowIdx] = dirtyRows[di][j];
|
|
170
|
+
}
|
|
171
|
+
});
|
|
172
|
+
|
|
173
|
+
this.embSimIndex = newIndex;
|
|
174
|
+
this.embSimMatrix = newMatrix;
|
|
175
|
+
log("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/**
 * Looks up the pre-computed cosine similarity for a pair of unit IDs.
 * Returns undefined when either unit has no row in the current matrix.
 */
getEmbSim(id1: string, id2: string): number | undefined {
  const row = this.embSimIndex.get(id1);
  if (row === undefined) return undefined;
  const col = this.embSimIndex.get(id2);
  if (col === undefined) return undefined;
  return this.embSimMatrix[row][col];
}
|
|
185
|
+
|
|
186
|
+
/**
 * Reads a previously memoized parent-similarity score.
 * Returns undefined when no score has been stored for the key this run.
 */
getParentSim(key: string): number | undefined {
  const cached = this.parentSimCache.get(key);
  return cached;
}
|
|
190
|
+
|
|
191
|
+
/** Stores a memoized parent similarity for the given stable key. */
|
|
192
|
+
setParentSim(key: string, sim: number): void {
|
|
193
|
+
this.parentSimCache.set(key, sim);
|
|
88
194
|
}
|
|
89
195
|
|
|
90
196
|
private addKeyForFile(filePath: string, key: string): void {
|
|
@@ -12,14 +12,17 @@ export class UpdateService {
|
|
|
12
12
|
private readonly exclusionService: ExclusionService
|
|
13
13
|
) {}
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
/** Returns the list of file paths that were modified or deleted (dirty). */
|
|
16
|
+
async updateIndex(): Promise<string[]> {
|
|
16
17
|
const extractor = this.deps.extractor;
|
|
17
18
|
const cache = DuplicationCache.getInstance();
|
|
18
19
|
|
|
19
20
|
try {
|
|
20
21
|
const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
|
|
21
22
|
await this.exclusionService.cleanupExcludedFiles();
|
|
22
|
-
|
|
23
|
+
const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
|
|
24
|
+
await cache.invalidate(dirtyPaths);
|
|
25
|
+
return dirtyPaths;
|
|
23
26
|
} catch (err) {
|
|
24
27
|
log("Error during index update:", err);
|
|
25
28
|
throw err;
|