@goshenkata/dryscan-core 1.2.5 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -221,7 +221,7 @@ declare class DryScan {
221
221
  * 6. Recompute embeddings for affected units
222
222
  * 7. Update file tracking metadata
223
223
  */
224
- updateIndex(): Promise<void>;
224
+ updateIndex(): Promise<string[]>;
225
225
  /**
226
226
  * Runs duplicate detection and returns a normalized report payload ready for persistence or display.
227
227
  */
package/dist/index.js CHANGED
@@ -61,7 +61,7 @@ var DEFAULT_CONFIG = {
61
61
  excludedPairs: [],
62
62
  minLines: 3,
63
63
  minBlockLines: 5,
64
- threshold: 0.88,
64
+ threshold: 0.8,
65
65
  embeddingSource: "http://localhost:11434",
66
66
  contextLength: 2048
67
67
  };
@@ -244,7 +244,8 @@ var JavaExtractor = class {
244
244
  const fnUnit = this.buildFunctionUnit(node, source, fileRelPath, currentClass);
245
245
  const fnLength = fnUnit.endLine - fnUnit.startLine;
246
246
  const bodyNode = this.getFunctionBody(node);
247
- const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength);
247
+ const fnArity = this.getNodeArity(node);
248
+ const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength, fnArity);
248
249
  if (skipFunction) {
249
250
  return;
250
251
  }
@@ -311,22 +312,34 @@ var JavaExtractor = class {
311
312
  const normalized = this.normalizeCode(unit.code);
312
313
  return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
313
314
  }
314
- shouldSkip(unitType, name, lineCount) {
315
+ shouldSkip(unitType, name, lineCount, arity) {
315
316
  if (!this.config) {
316
317
  throw new Error("Config not loaded before skip evaluation");
317
318
  }
318
319
  const config = this.config;
319
320
  const minLines = unitType === "block" /* BLOCK */ ? Math.max(indexConfig.blockMinLines, config.minBlockLines ?? 0) : config.minLines;
320
321
  const belowMin = minLines > 0 && lineCount < minLines;
321
- const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name);
322
+ const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name, arity ?? 0);
322
323
  return belowMin || trivial;
323
324
  }
324
- isTrivialFunction(fullName) {
325
+ /**
326
+ * A function is trivial if it follows a simple accessor pattern:
327
+ * - getters/isers: name matches get[A-Z] or is[A-Z] with exactly 0 parameters
328
+ * - setters: name matches set[A-Z] with at most 1 parameter
329
+ * Methods like getUserById(Long id) have arity > 0 and are NOT trivial.
330
+ */
331
+ isTrivialFunction(fullName, arity) {
325
332
  const simpleName = fullName.split(".").pop() || fullName;
326
- const isGetter = /^(get|is)[A-Z]/.test(simpleName);
327
- const isSetter = /^set[A-Z]/.test(simpleName);
333
+ const isGetter = /^(get|is)[A-Z]/.test(simpleName) && arity === 0;
334
+ const isSetter = /^set[A-Z]/.test(simpleName) && arity <= 1;
328
335
  return isGetter || isSetter;
329
336
  }
337
+ /** Counts the formal parameters of a method or constructor node. */
338
+ getNodeArity(node) {
339
+ const params = node.childForFieldName?.("parameters");
340
+ if (!params) return 0;
341
+ return params.namedChildren.filter((c) => c.type === "formal_parameter" || c.type === "spread_parameter").length;
342
+ }
330
343
  isDtoClass(node, source, className) {
331
344
  const classBody = node.children.find((child) => child.type === "class_body");
332
345
  if (!classBody) return false;
@@ -344,7 +357,8 @@ var JavaExtractor = class {
344
357
  if (child.type === "method_declaration" || child.type === "constructor_declaration") {
345
358
  const simpleName = this.getSimpleFunctionName(child, source);
346
359
  const fullName = `${className}.${simpleName}`;
347
- if (!this.isTrivialFunction(fullName)) {
360
+ const arity = this.getNodeArity(child);
361
+ if (!this.isTrivialFunction(fullName, arity)) {
348
362
  return false;
349
363
  }
350
364
  continue;
@@ -1021,7 +1035,7 @@ var RepositoryInitializer = class {
1021
1035
  };
1022
1036
 
1023
1037
  // src/services/UpdateService.ts
1024
- import debug4 from "debug";
1038
+ import debug5 from "debug";
1025
1039
 
1026
1040
  // src/DryScanUpdater.ts
1027
1041
  import path4 from "path";
@@ -1141,11 +1155,20 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
1141
1155
  }
1142
1156
 
1143
1157
  // src/services/DuplicationCache.ts
1158
+ import debug4 from "debug";
1159
+ import { cosineSimilarity } from "@langchain/core/utils/math";
1160
+ var log4 = debug4("DryScan:DuplicationCache");
1144
1161
  var DuplicationCache = class _DuplicationCache {
1145
1162
  static instance = null;
1146
1163
  comparisons = /* @__PURE__ */ new Map();
1147
1164
  fileIndex = /* @__PURE__ */ new Map();
1148
1165
  initialized = false;
1166
+ /** Per-run similarity matrix from a single batched library call (reset each run). */
1167
+ embSimMatrix = [];
1168
+ /** Maps unit ID to its row/column index in embSimMatrix. */
1169
+ embSimIndex = /* @__PURE__ */ new Map();
1170
+ /** Per-run memoization of parent unit similarity scores (reset each run). */
1171
+ parentSimCache = /* @__PURE__ */ new Map();
1149
1172
  static getInstance() {
1150
1173
  if (!_DuplicationCache.instance) {
1151
1174
  _DuplicationCache.instance = new _DuplicationCache();
@@ -1208,6 +1231,84 @@ var DuplicationCache = class _DuplicationCache {
1208
1231
  this.comparisons.clear();
1209
1232
  this.fileIndex.clear();
1210
1233
  this.initialized = false;
1234
+ this.embSimMatrix = [];
1235
+ this.embSimIndex.clear();
1236
+ this.clearRunCaches();
1237
+ }
1238
+ /**
1239
+ * Resets per-run memoization (parent similarities).
1240
+ * The embedding matrix is intentionally preserved so incremental runs can
1241
+ * reuse clean×clean values across calls.
1242
+ */
1243
+ clearRunCaches() {
1244
+ this.parentSimCache.clear();
1245
+ }
1246
+ /**
1247
+ * Builds or incrementally updates the embedding similarity matrix.
1248
+ *
1249
+ * Full rebuild (default): replaces the entire matrix — O(n²).
1250
+ * Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
1251
+ * cells from the old matrix and recomputes only dirty rows via one batched
1252
+ * cosineSimilarity call — O(d·n) where d = number of dirty units.
1253
+ */
1254
+ buildEmbSimCache(units, dirtyPaths) {
1255
+ const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
1256
+ if (embedded.length < 2) {
1257
+ this.embSimMatrix = [];
1258
+ this.embSimIndex.clear();
1259
+ return;
1260
+ }
1261
+ const embeddings = embedded.map((u) => u.embedding);
1262
+ const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
1263
+ const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
1264
+ const hasPriorMatrix = this.embSimMatrix.length > 0;
1265
+ if (!dirtySet || !hasPriorMatrix) {
1266
+ this.embSimIndex = newIndex;
1267
+ this.embSimMatrix = cosineSimilarity(embeddings, embeddings);
1268
+ log4("Built full embedding similarity matrix: %d units", embedded.length);
1269
+ return;
1270
+ }
1271
+ const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
1272
+ if (dirtyIds.size === 0) {
1273
+ log4("Matrix reused: no dirty units detected");
1274
+ return;
1275
+ }
1276
+ const n = embedded.length;
1277
+ const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
1278
+ for (let i = 0; i < n; i++) {
1279
+ for (let j = 0; j < n; j++) {
1280
+ if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
1281
+ const oi = this.embSimIndex.get(embedded[i].id);
1282
+ const oj = this.embSimIndex.get(embedded[j].id);
1283
+ if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
1284
+ }
1285
+ }
1286
+ const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
1287
+ const dirtyRows = cosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
1288
+ dirtyIndices.forEach((rowIdx, di) => {
1289
+ for (let j = 0; j < n; j++) {
1290
+ newMatrix[rowIdx][j] = dirtyRows[di][j];
1291
+ newMatrix[j][rowIdx] = dirtyRows[di][j];
1292
+ }
1293
+ });
1294
+ this.embSimIndex = newIndex;
1295
+ this.embSimMatrix = newMatrix;
1296
+ log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
1297
+ }
1298
+ /** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
1299
+ getEmbSim(id1, id2) {
1300
+ const i = this.embSimIndex.get(id1);
1301
+ const j = this.embSimIndex.get(id2);
1302
+ if (i === void 0 || j === void 0) return void 0;
1303
+ return this.embSimMatrix[i][j];
1304
+ }
1305
+ /** Returns the memoized parent similarity for the given stable key, if available. */
1306
+ getParentSim(key) {
1307
+ return this.parentSimCache.get(key);
1308
+ }
1309
+ /** Stores a memoized parent similarity for the given stable key. */
1310
+ setParentSim(key, sim) {
1311
+ this.parentSimCache.set(key, sim);
1211
1312
  }
1212
1313
  addKeyForFile(filePath, key) {
1213
1314
  const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
@@ -1224,145 +1325,106 @@ var DuplicationCache = class _DuplicationCache {
1224
1325
  };
1225
1326
 
1226
1327
  // src/services/UpdateService.ts
1227
- var log4 = debug4("DryScan:UpdateService");
1328
+ var log5 = debug5("DryScan:UpdateService");
1228
1329
  var UpdateService = class {
1229
1330
  constructor(deps, exclusionService) {
1230
1331
  this.deps = deps;
1231
1332
  this.exclusionService = exclusionService;
1232
1333
  }
1334
+ /** Returns the list of file paths that were modified or deleted (dirty). */
1233
1335
  async updateIndex() {
1234
1336
  const extractor = this.deps.extractor;
1235
1337
  const cache = DuplicationCache.getInstance();
1236
1338
  try {
1237
1339
  const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
1238
1340
  await this.exclusionService.cleanupExcludedFiles();
1239
- await cache.invalidate([...changeSet.changed, ...changeSet.deleted]);
1341
+ const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
1342
+ await cache.invalidate(dirtyPaths);
1343
+ return dirtyPaths;
1240
1344
  } catch (err) {
1241
- log4("Error during index update:", err);
1345
+ log5("Error during index update:", err);
1242
1346
  throw err;
1243
1347
  }
1244
1348
  }
1245
1349
  };
1246
1350
 
1247
1351
  // src/services/DuplicateService.ts
1248
- import debug5 from "debug";
1352
+ import debug6 from "debug";
1249
1353
  import shortUuid from "short-uuid";
1250
- import { cosineSimilarity } from "@langchain/core/utils/math";
1251
- var log5 = debug5("DryScan:DuplicateService");
1354
+ var log6 = debug6("DryScan:DuplicateService");
1252
1355
  var DuplicateService = class {
1253
1356
  constructor(deps) {
1254
1357
  this.deps = deps;
1255
1358
  }
1256
1359
  config;
1257
1360
  cache = DuplicationCache.getInstance();
1258
- //todo vetter optimisation
1259
- async findDuplicates(config) {
1361
+ /**
1362
+ * @param dirtyPaths - File paths changed since last run. When provided, only
1363
+ * dirty×all similarities are recomputed; clean×clean values are reused from
1364
+ * the existing matrix. Pass undefined (or omit) for a full rebuild.
1365
+ */
1366
+ async findDuplicates(config, dirtyPaths) {
1260
1367
  this.config = config;
1261
1368
  const t0 = performance.now();
1262
1369
  const allUnits = await this.deps.db.getAllUnits();
1263
- log5("Starting duplicate analysis on %d units", allUnits.length);
1370
+ log6("Starting duplicate analysis on %d units", allUnits.length);
1264
1371
  if (allUnits.length < 2) {
1265
- log5("Not enough units to compare, returning empty result");
1266
- const score2 = this.computeDuplicationScore([], allUnits);
1267
- return { duplicates: [], score: score2 };
1372
+ return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
1268
1373
  }
1269
1374
  const thresholds = this.resolveThresholds(config.threshold);
1270
- log5("Resolved thresholds: function=%d, block=%d, class=%d", thresholds.function, thresholds.block, thresholds.class);
1271
- const duplicates = this.computeDuplicates(allUnits, thresholds);
1272
- const filteredDuplicates = duplicates.filter((group) => !this.isGroupExcluded(group));
1273
- log5("Found %d duplicate groups (%d excluded)", filteredDuplicates.length, duplicates.length - filteredDuplicates.length);
1274
- this.cache.update(filteredDuplicates).catch((err) => log5("Cache update failed: %O", err));
1275
- const score = this.computeDuplicationScore(filteredDuplicates, allUnits);
1276
- log5("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
1277
- return { duplicates: filteredDuplicates, score };
1375
+ const duplicates = this.computeDuplicates(allUnits, thresholds, dirtyPaths);
1376
+ const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
1377
+ log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
1378
+ this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
1379
+ const score = this.computeDuplicationScore(filtered, allUnits);
1380
+ log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
1381
+ return { duplicates: filtered, score };
1278
1382
  }
1279
1383
  resolveThresholds(functionThreshold) {
1280
- const defaults = indexConfig.thresholds;
1281
- const clamp = (value) => Math.min(1, Math.max(0, value));
1282
- const base = functionThreshold ?? defaults.function;
1283
- const blockOffset = defaults.block - defaults.function;
1284
- const classOffset = defaults.class - defaults.function;
1285
- const functionThresholdValue = clamp(base);
1384
+ const d = indexConfig.thresholds;
1385
+ const clamp = (v) => Math.min(1, Math.max(0, v));
1386
+ const fn = clamp(functionThreshold ?? d.function);
1286
1387
  return {
1287
- function: functionThresholdValue,
1288
- block: clamp(functionThresholdValue + blockOffset),
1289
- class: clamp(functionThresholdValue + classOffset)
1388
+ function: fn,
1389
+ block: clamp(fn + d.block - d.function),
1390
+ class: clamp(fn + d.class - d.function)
1290
1391
  };
1291
1392
  }
1292
- computeDuplicates(units, thresholds) {
1393
+ computeDuplicates(units, thresholds, dirtyPaths) {
1394
+ this.cache.clearRunCaches();
1395
+ this.cache.buildEmbSimCache(units, dirtyPaths);
1293
1396
  const duplicates = [];
1294
- const byType = /* @__PURE__ */ new Map();
1295
- for (const unit of units) {
1296
- const list = byType.get(unit.unitType) ?? [];
1297
- list.push(unit);
1298
- byType.set(unit.unitType, list);
1299
- }
1300
1397
  const t0 = performance.now();
1301
- for (const [type, typedUnits] of byType.entries()) {
1398
+ for (const [type, typedUnits] of this.groupByType(units)) {
1302
1399
  const threshold = this.getThreshold(type, thresholds);
1303
- log5("Comparing %d units of type '%s' with threshold %d", typedUnits.length, type, threshold);
1304
- const typeStart = performance.now();
1400
+ log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
1305
1401
  for (let i = 0; i < typedUnits.length; i++) {
1306
1402
  for (let j = i + 1; j < typedUnits.length; j++) {
1307
- const left = typedUnits[i];
1308
- const right = typedUnits[j];
1309
- if (this.shouldSkipComparison(left, right)) {
1310
- log5("Skipping nested block comparison: '%s' and '%s'", left.name, right.name);
1311
- continue;
1312
- }
1403
+ const left = typedUnits[i], right = typedUnits[j];
1404
+ if (this.shouldSkipComparison(left, right)) continue;
1313
1405
  const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
1314
- let similarity = null;
1315
- if (cached !== null) {
1316
- log5("Cache hit for '%s' <-> '%s': similarity=%d", left.name, right.name, cached);
1317
- similarity = cached;
1318
- } else {
1319
- if (!left.embedding || !right.embedding) {
1320
- log5("Skipping '%s' <-> '%s': missing embedding", left.name, right.name);
1321
- continue;
1322
- }
1323
- similarity = this.computeWeightedSimilarity(left, right);
1324
- log5("Computed similarity for '%s' <-> '%s': %d", left.name, right.name, similarity);
1325
- }
1326
- if (similarity === null) continue;
1327
- if (similarity >= threshold) {
1328
- const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1329
- if (!exclusionString) continue;
1330
- log5("Duplicate found: '%s' <-> '%s' (similarity=%d)", left.name, right.name, similarity);
1331
- duplicates.push({
1332
- id: `${left.id}::${right.id}`,
1333
- similarity,
1334
- shortId: shortUuid.generate(),
1335
- exclusionString,
1336
- left: {
1337
- id: left.id,
1338
- name: left.name,
1339
- filePath: left.filePath,
1340
- startLine: left.startLine,
1341
- endLine: left.endLine,
1342
- code: left.code,
1343
- unitType: left.unitType
1344
- },
1345
- right: {
1346
- id: right.id,
1347
- name: right.name,
1348
- filePath: right.filePath,
1349
- startLine: right.startLine,
1350
- endLine: right.endLine,
1351
- code: right.code,
1352
- unitType: right.unitType
1353
- }
1354
- });
1355
- }
1406
+ const hasEmbeddings = left.embedding?.length && right.embedding?.length;
1407
+ const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
1408
+ if (similarity < threshold) continue;
1409
+ const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1410
+ if (!exclusionString) continue;
1411
+ duplicates.push({
1412
+ id: `${left.id}::${right.id}`,
1413
+ similarity,
1414
+ shortId: shortUuid.generate(),
1415
+ exclusionString,
1416
+ left: this.toMember(left),
1417
+ right: this.toMember(right)
1418
+ });
1356
1419
  }
1357
1420
  }
1358
- log5("Type '%s' comparisons completed in %dms", type, (performance.now() - typeStart).toFixed(2));
1359
1421
  }
1360
- log5("computeDuplicates completed in %dms, found %d raw duplicates", (performance.now() - t0).toFixed(2), duplicates.length);
1422
+ log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
1361
1423
  return duplicates.sort((a, b) => b.similarity - a.similarity);
1362
1424
  }
1363
1425
  isGroupExcluded(group) {
1364
1426
  const config = this.config;
1365
- if (!config || !config.excludedPairs || config.excludedPairs.length === 0) return false;
1427
+ if (!config?.excludedPairs?.length) return false;
1366
1428
  const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
1367
1429
  if (!key) return false;
1368
1430
  const actual = this.deps.pairing.parsePairKey(key);
@@ -1377,109 +1439,108 @@ var DuplicateService = class {
1377
1439
  if (type === "block" /* BLOCK */) return thresholds.block;
1378
1440
  return thresholds.function;
1379
1441
  }
1380
- computeWeightedSimilarity(left, right) {
1381
- const selfSimilarity = this.similarityWithFallback(left, right);
1442
+ computeWeightedSimilarity(left, right, threshold) {
1443
+ const selfSim = this.similarity(left, right);
1382
1444
  if (left.unitType === "class" /* CLASS */) {
1383
- return selfSimilarity * indexConfig.weights.class.self;
1445
+ return selfSim * indexConfig.weights.class.self;
1384
1446
  }
1385
1447
  if (left.unitType === "function" /* FUNCTION */) {
1386
- const weights2 = indexConfig.weights.function;
1387
- const hasParentClass2 = !!this.findParentOfType(left, "class" /* CLASS */) && !!this.findParentOfType(right, "class" /* CLASS */);
1388
- const parentClassSimilarity = hasParentClass2 ? this.parentSimilarity(left, right, "class" /* CLASS */) : 0;
1389
- const totalWeight2 = weights2.self + (hasParentClass2 ? weights2.parentClass : 0);
1390
- return (weights2.self * selfSimilarity + (hasParentClass2 ? weights2.parentClass * parentClassSimilarity : 0)) / totalWeight2;
1448
+ const w2 = indexConfig.weights.function;
1449
+ const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
1450
+ const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
1451
+ if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
1452
+ return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
1391
1453
  }
1392
- const weights = indexConfig.weights.block;
1393
- const hasParentFunction = !!this.findParentOfType(left, "function" /* FUNCTION */) && !!this.findParentOfType(right, "function" /* FUNCTION */);
1394
- const hasParentClass = !!this.findParentOfType(left, "class" /* CLASS */) && !!this.findParentOfType(right, "class" /* CLASS */);
1395
- const parentFuncSim = hasParentFunction ? this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0;
1396
- const parentClassSim = hasParentClass ? this.parentSimilarity(left, right, "class" /* CLASS */) : 0;
1397
- const totalWeight = weights.self + (hasParentFunction ? weights.parentFunction : 0) + (hasParentClass ? weights.parentClass : 0);
1398
- return (weights.self * selfSimilarity + (hasParentFunction ? weights.parentFunction * parentFuncSim : 0) + (hasParentClass ? weights.parentClass * parentClassSim : 0)) / totalWeight;
1399
- }
1400
- parentSimilarity(left, right, targetType) {
1401
- const leftParent = this.findParentOfType(left, targetType);
1402
- const rightParent = this.findParentOfType(right, targetType);
1403
- if (!leftParent || !rightParent) return 0;
1404
- return this.similarityWithFallback(leftParent, rightParent);
1405
- }
1406
- similarityWithFallback(left, right) {
1407
- const leftHasEmbedding = this.hasVector(left);
1408
- const rightHasEmbedding = this.hasVector(right);
1409
- if (leftHasEmbedding && rightHasEmbedding) {
1410
- return cosineSimilarity([left.embedding], [right.embedding])[0][0];
1454
+ const w = indexConfig.weights.block;
1455
+ const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
1456
+ const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
1457
+ const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
1458
+ if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
1459
+ return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
1460
+ }
1461
+ /** Groups all units by type for the comparison loop. Units without embeddings are included
1462
+ * so that cache hits can still be returned for pairs whose embeddings were cleared. */
1463
+ groupByType(units) {
1464
+ const byType = /* @__PURE__ */ new Map();
1465
+ for (const unit of units) {
1466
+ const list = byType.get(unit.unitType) ?? [];
1467
+ list.push(unit);
1468
+ byType.set(unit.unitType, list);
1411
1469
  }
1412
- return this.childSimilarity(left, right);
1470
+ return byType;
1471
+ }
1472
+ toMember(unit) {
1473
+ return {
1474
+ id: unit.id,
1475
+ name: unit.name,
1476
+ filePath: unit.filePath,
1477
+ startLine: unit.startLine,
1478
+ endLine: unit.endLine,
1479
+ code: unit.code,
1480
+ unitType: unit.unitType
1481
+ };
1482
+ }
1483
+ bothHaveParent(left, right, type) {
1484
+ return !!this.findParent(left, type) && !!this.findParent(right, type);
1485
+ }
1486
+ parentSimilarity(left, right, type) {
1487
+ const lp = this.findParent(left, type), rp = this.findParent(right, type);
1488
+ if (!lp || !rp) return 0;
1489
+ const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
1490
+ const cached = this.cache.getParentSim(key);
1491
+ if (cached !== void 0) return cached;
1492
+ const sim = this.similarity(lp, rp);
1493
+ this.cache.setParentSim(key, sim);
1494
+ return sim;
1495
+ }
1496
+ /** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
1497
+ similarity(left, right) {
1498
+ return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
1413
1499
  }
1414
1500
  childSimilarity(left, right) {
1415
- const leftChildren = left.children ?? [];
1416
- const rightChildren = right.children ?? [];
1417
- if (leftChildren.length === 0 || rightChildren.length === 0) return 0;
1501
+ const lc = left.children ?? [], rc = right.children ?? [];
1502
+ if (!lc.length || !rc.length) return 0;
1418
1503
  let best = 0;
1419
- for (const lChild of leftChildren) {
1420
- for (const rChild of rightChildren) {
1421
- if (lChild.unitType !== rChild.unitType) continue;
1422
- const sim = this.similarityWithFallback(lChild, rChild);
1504
+ for (const l of lc) {
1505
+ for (const r of rc) {
1506
+ if (l.unitType !== r.unitType) continue;
1507
+ const sim = this.similarity(l, r);
1423
1508
  if (sim > best) best = sim;
1424
1509
  }
1425
1510
  }
1426
1511
  return best;
1427
1512
  }
1428
- hasVector(unit) {
1429
- return Array.isArray(unit.embedding) && unit.embedding.length > 0;
1430
- }
1431
1513
  shouldSkipComparison(left, right) {
1432
- if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) {
1433
- return false;
1434
- }
1435
- if (left.filePath !== right.filePath) {
1436
- return false;
1437
- }
1438
- const leftContainsRight = left.startLine <= right.startLine && left.endLine >= right.endLine;
1439
- const rightContainsLeft = right.startLine <= left.startLine && right.endLine >= left.endLine;
1440
- return leftContainsRight || rightContainsLeft;
1441
- }
1442
- findParentOfType(unit, targetType) {
1443
- let current = unit.parent;
1444
- while (current) {
1445
- if (current.unitType === targetType) return current;
1446
- current = current.parent;
1514
+ if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
1515
+ if (left.filePath !== right.filePath) return false;
1516
+ return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
1517
+ }
1518
+ findParent(unit, type) {
1519
+ let p = unit.parent;
1520
+ while (p) {
1521
+ if (p.unitType === type) return p;
1522
+ p = p.parent;
1447
1523
  }
1448
1524
  return null;
1449
1525
  }
1450
1526
  computeDuplicationScore(duplicates, allUnits) {
1451
- const totalLines = this.calculateTotalLines(allUnits);
1452
- if (totalLines === 0 || duplicates.length === 0) {
1453
- return {
1454
- score: 0,
1455
- grade: "Excellent",
1456
- totalLines,
1457
- duplicateLines: 0,
1458
- duplicateGroups: 0
1459
- };
1527
+ const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
1528
+ if (!totalLines || !duplicates.length) {
1529
+ return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
1460
1530
  }
1461
- const weightedDuplicateLines = duplicates.reduce((sum, group) => {
1462
- const leftLines = group.left.endLine - group.left.startLine + 1;
1463
- const rightLines = group.right.endLine - group.right.startLine + 1;
1464
- const avgLines = (leftLines + rightLines) / 2;
1465
- return sum + group.similarity * avgLines;
1531
+ const duplicateLines = duplicates.reduce((sum, g) => {
1532
+ const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
1533
+ return sum + g.similarity * avg;
1466
1534
  }, 0);
1467
- const score = weightedDuplicateLines / totalLines * 100;
1468
- const grade = this.getScoreGrade(score);
1535
+ const score = duplicateLines / totalLines * 100;
1469
1536
  return {
1470
1537
  score,
1471
- grade,
1538
+ grade: this.getScoreGrade(score),
1472
1539
  totalLines,
1473
- duplicateLines: Math.round(weightedDuplicateLines),
1540
+ duplicateLines: Math.round(duplicateLines),
1474
1541
  duplicateGroups: duplicates.length
1475
1542
  };
1476
1543
  }
1477
- calculateTotalLines(units) {
1478
- return units.reduce((sum, unit) => {
1479
- const lines = unit.endLine - unit.startLine + 1;
1480
- return sum + lines;
1481
- }, 0);
1482
- }
1483
1544
  getScoreGrade(score) {
1484
1545
  if (score < 5) return "Excellent";
1485
1546
  if (score < 15) return "Good";
@@ -1575,9 +1636,9 @@ var ExclusionService = class {
1575
1636
 
1576
1637
  // src/services/PairingService.ts
1577
1638
  import crypto3 from "crypto";
1578
- import debug6 from "debug";
1639
+ import debug7 from "debug";
1579
1640
  import { minimatch as minimatch2 } from "minimatch";
1580
- var log6 = debug6("DryScan:pairs");
1641
+ var log7 = debug7("DryScan:pairs");
1581
1642
  var PairingService = class {
1582
1643
  constructor(indexUnitExtractor) {
1583
1644
  this.indexUnitExtractor = indexUnitExtractor;
@@ -1588,7 +1649,7 @@ var PairingService = class {
1588
1649
  */
1589
1650
  pairKeyForUnits(left, right) {
1590
1651
  if (left.unitType !== right.unitType) {
1591
- log6("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1652
+ log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1592
1653
  return null;
1593
1654
  }
1594
1655
  const type = left.unitType;
@@ -1604,13 +1665,13 @@ var PairingService = class {
1604
1665
  parsePairKey(value) {
1605
1666
  const parts = value.split("|");
1606
1667
  if (parts.length !== 3) {
1607
- log6("Invalid pair key format: %s", value);
1668
+ log7("Invalid pair key format: %s", value);
1608
1669
  return null;
1609
1670
  }
1610
1671
  const [typeRaw, leftRaw, rightRaw] = parts;
1611
1672
  const type = this.stringToUnitType(typeRaw);
1612
1673
  if (!type) {
1613
- log6("Unknown unit type in pair key: %s", typeRaw);
1674
+ log7("Unknown unit type in pair key: %s", typeRaw);
1614
1675
  return null;
1615
1676
  }
1616
1677
  const [left, right] = [leftRaw, rightRaw].sort();
@@ -1744,9 +1805,10 @@ var DryScan = class {
1744
1805
  console.log("[DryScan] Checking for file changes...");
1745
1806
  const start = Date.now();
1746
1807
  await this.ensureDatabase();
1747
- await this.services.updater.updateIndex();
1808
+ const dirtyPaths = await this.services.updater.updateIndex();
1748
1809
  const duration = Date.now() - start;
1749
1810
  console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
1811
+ return dirtyPaths;
1750
1812
  }
1751
1813
  /**
1752
1814
  * Runs duplicate detection and returns a normalized report payload ready for persistence or display.
@@ -1775,12 +1837,12 @@ var DryScan = class {
1775
1837
  await this.ensureDatabase();
1776
1838
  console.log("[DryScan] Updating index...");
1777
1839
  const updateStart = Date.now();
1778
- await this.updateIndex();
1840
+ const dirtyPaths = await this.updateIndex();
1779
1841
  const updateDuration = Date.now() - updateStart;
1780
1842
  console.log(`[DryScan] Index update took ${updateDuration}ms.`);
1781
1843
  console.log("[DryScan] Detecting duplicates...");
1782
1844
  const dupStart = Date.now();
1783
- const result = await this.services.duplicate.findDuplicates(config);
1845
+ const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
1784
1846
  const dupDuration = Date.now() - dupStart;
1785
1847
  console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
1786
1848
  return result;