@goshenkata/dryscan-core 1.2.4 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,8 @@
1
- import { DuplicateGroup } from "../types";
1
+ import debug from "debug";
2
+ import { cosineSimilarity } from "@langchain/core/utils/math";
3
+ import { DuplicateGroup, IndexUnit } from "../types";
4
+
5
+ const log = debug("DryScan:DuplicationCache");
2
6
 
3
7
  /**
4
8
  * In-memory cache for duplicate comparison scores.
@@ -11,6 +15,13 @@ export class DuplicationCache {
11
15
  private readonly fileIndex = new Map<string, Set<string>>();
12
16
  private initialized = false;
13
17
 
18
+ /** Similarity matrix from a single batched library call; preserved across runs so clean×clean values can be reused, reset only by clear(). */
19
+ private embSimMatrix: number[][] = [];
20
+ /** Maps unit ID to its row/column index in embSimMatrix. */
21
+ private embSimIndex = new Map<string, number>();
22
+ /** Per-run memoization of parent unit similarity scores (reset each run). */
23
+ private parentSimCache = new Map<string, number>();
24
+
14
25
  static getInstance(): DuplicationCache {
15
26
  if (!DuplicationCache.instance) {
16
27
  DuplicationCache.instance = new DuplicationCache();
@@ -85,6 +96,101 @@ export class DuplicationCache {
85
96
  this.comparisons.clear();
86
97
  this.fileIndex.clear();
87
98
  this.initialized = false;
99
+ this.embSimMatrix = [];
100
+ this.embSimIndex.clear();
101
+ this.clearRunCaches();
102
+ }
103
+
104
+ /**
105
+ * Resets per-run memoization (parent similarities).
106
+ * The embedding matrix is intentionally preserved so incremental runs can
107
+ * reuse clean×clean values across calls.
108
+ */
109
+ clearRunCaches(): void {
110
+ this.parentSimCache.clear();
111
+ }
112
+
113
+ /**
114
+ * Builds or incrementally updates the embedding similarity matrix.
115
+ *
116
+ * Full rebuild (default): replaces the entire matrix — O(n²).
117
+ * Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
118
+ * cells from the old matrix and recomputes only dirty rows via one batched
119
+ * cosineSimilarity call — O(d·n) where d = number of dirty units.
120
+ */
121
+ buildEmbSimCache(units: IndexUnit[], dirtyPaths?: string[]): void {
122
+ const embedded = units.filter(u => Array.isArray(u.embedding) && u.embedding.length > 0);
123
+ if (embedded.length < 2) {
124
+ this.embSimMatrix = [];
125
+ this.embSimIndex.clear();
126
+ return;
127
+ }
128
+
129
+ const embeddings = embedded.map(u => u.embedding as number[]);
130
+ const newIndex = new Map(embedded.map((u, i) => [u.id, i] as [string, number]));
131
+ const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
132
+ const hasPriorMatrix = this.embSimMatrix.length > 0;
133
+
134
+ if (!dirtySet || !hasPriorMatrix) {
135
+ // Full rebuild
136
+ this.embSimIndex = newIndex;
137
+ this.embSimMatrix = cosineSimilarity(embeddings, embeddings);
138
+ log("Built full embedding similarity matrix: %d units", embedded.length);
139
+ return;
140
+ }
141
+
142
+ // Incremental: identify dirty unit IDs
143
+ const dirtyIds = new Set(embedded.filter(u => dirtySet.has(u.filePath)).map(u => u.id));
144
+
145
+ if (dirtyIds.size === 0) {
146
+ log("Matrix reused: no dirty units detected");
147
+ return;
148
+ }
149
+
150
+ const n = embedded.length;
151
+
152
+ // Start with zeroes; copy clean×clean values from prior matrix
153
+ const newMatrix: number[][] = Array.from({ length: n }, () => new Array(n).fill(0));
154
+ for (let i = 0; i < n; i++) {
155
+ for (let j = 0; j < n; j++) {
156
+ if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
157
+ const oi = this.embSimIndex.get(embedded[i].id);
158
+ const oj = this.embSimIndex.get(embedded[j].id);
159
+ if (oi !== undefined && oj !== undefined) newMatrix[i][j] = this.embSimMatrix[oi][oj];
160
+ }
161
+ }
162
+
163
+ // Recompute dirty rows in one batched call
164
+ const dirtyIndices = embedded.reduce<number[]>((acc, u, i) => (dirtyIds.has(u.id) ? [...acc, i] : acc), []);
165
+ const dirtyRows = cosineSimilarity(dirtyIndices.map(i => embeddings[i]), embeddings);
166
+ dirtyIndices.forEach((rowIdx, di) => {
167
+ for (let j = 0; j < n; j++) {
168
+ newMatrix[rowIdx][j] = dirtyRows[di][j];
169
+ newMatrix[j][rowIdx] = dirtyRows[di][j];
170
+ }
171
+ });
172
+
173
+ this.embSimIndex = newIndex;
174
+ this.embSimMatrix = newMatrix;
175
+ log("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
176
+ }
177
+
178
+ /** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
179
+ getEmbSim(id1: string, id2: string): number | undefined {
180
+ const i = this.embSimIndex.get(id1);
181
+ const j = this.embSimIndex.get(id2);
182
+ if (i === undefined || j === undefined) return undefined;
183
+ return this.embSimMatrix[i][j];
184
+ }
185
+
186
+ /** Returns the memoized parent similarity for the given stable key, if available. */
187
+ getParentSim(key: string): number | undefined {
188
+ return this.parentSimCache.get(key);
189
+ }
190
+
191
+ /** Stores a memoized parent similarity for the given stable key. */
192
+ setParentSim(key: string, sim: number): void {
193
+ this.parentSimCache.set(key, sim);
88
194
  }
89
195
 
90
196
  private addKeyForFile(filePath: string, key: string): void {
@@ -12,14 +12,17 @@ export class UpdateService {
12
12
  private readonly exclusionService: ExclusionService
13
13
  ) {}
14
14
 
15
- async updateIndex(): Promise<void> {
15
+ /** Returns the list of file paths that were modified or deleted (dirty). */
16
+ async updateIndex(): Promise<string[]> {
16
17
  const extractor = this.deps.extractor;
17
18
  const cache = DuplicationCache.getInstance();
18
19
 
19
20
  try {
20
21
  const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
21
22
  await this.exclusionService.cleanupExcludedFiles();
22
- await cache.invalidate([...changeSet.changed, ...changeSet.deleted]);
23
+ const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
24
+ await cache.invalidate(dirtyPaths);
25
+ return dirtyPaths;
23
26
  } catch (err) {
24
27
  log("Error during index update:", err);
25
28
  throw err;